Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: af941a0ec3d90fb90a786f9d0ab7404da8efae2f [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
Daniel Veillard	3c01b1d	2001-10-17 15:58:35 +0000	[diff] [blame]	43	#include <libxml/globals.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	44
				45	#define HTML_MAX_NAMELEN 1000
				46	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				47	#define HTML_PARSER_BUFFER_SIZE 100
				48
				49	/* #define DEBUG */
				50	/* #define DEBUG_PUSH */
				51
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	52	static int htmlOmittedDefaultValue = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	53
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	54	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				55	xmlChar end, xmlChar end2, xmlChar end3);
				56
				57	/************************************************************************
				58	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	59	* Parser stacks related functions and macros *
				60	* *
				61	************************************************************************/
				62
				63	/*
				64	* Generic function for accessing stacks in the Parser Context
				65	*/
				66
				67	#define PUSH_AND_POP(scope, type, name) \
				68	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				69	if (ctxt->name##Nr >= ctxt->name##Max) { \
				70	ctxt->name##Max *= 2; \
				71	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				72	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				73	if (ctxt->name##Tab == NULL) { \
				74	xmlGenericError(xmlGenericErrorContext, \
				75	"realloc failed !\n"); \
				76	return(0); \
				77	} \
				78	} \
				79	ctxt->name##Tab[ctxt->name##Nr] = value; \
				80	ctxt->name = value; \
				81	return(ctxt->name##Nr++); \
				82	} \
				83	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				84	type ret; \
				85	if (ctxt->name##Nr < 0) return(0); \
				86	ctxt->name##Nr--; \
				87	if (ctxt->name##Nr < 0) return(0); \
				88	if (ctxt->name##Nr > 0) \
				89	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				90	else \
				91	ctxt->name = NULL; \
				92	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				93	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				94	return(ret); \
				95	} \
				96
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	97	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				98	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	99
				100	/*
				101	* Macros for accessing the content. Those should be used only by the parser,
				102	* and not exported.
				103	*
				104	* Dirty macros, i.e. one need to make assumption on the context to use them
				105	*
				106	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				107	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				108	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				109	* in UNICODE mode. This should be used internally by the parser
				110	* only to compare to ASCII values otherwise it would break when
				111	* running with UTF-8 encoding.
				112	* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
				113	* to compare on ASCII based substring.
				114	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				115	* it should be used only to compare on ASCII based substring.
				116	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				117	* strings within the parser.
				118	*
				119	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				120	*
				121	* CURRENT Returns the current char value, with the full decoding of
				122	* UTF-8 if we are using this mode. It returns an int.
				123	* NEXT Skip to the next character, this does the proper decoding
				124	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				125	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				126	*/
				127
				128	#define UPPER (toupper(*ctxt->input->cur))
				129
				130	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				131
				132	#define NXT(val) ctxt->input->cur[(val)]
				133
				134	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				135
				136	#define CUR_PTR ctxt->input->cur
				137
				138	#define SHRINK xmlParserInputShrink(ctxt->input)
				139
				140	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				141
				142	#define CURRENT ((int) (*ctxt->input->cur))
				143
				144	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				145
				146	/* Inported from XML */
				147
				148	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				149	#define CUR ((int) (*ctxt->input->cur))
				150	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				151
				152	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				153	#define NXT(val) ctxt->input->cur[(val)]
				154	#define CUR_PTR ctxt->input->cur
				155
				156
				157	#define NEXTL(l) do { \
				158	if (*(ctxt->input->cur) == '\n') { \
				159	ctxt->input->line++; ctxt->input->col = 1; \
				160	} else ctxt->input->col++; \
				161	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				162	} while (0)
				163
				164	/************
				165	\
				166	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				167	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				168	************/
				169
				170	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				171	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				172
				173	#define COPY_BUF(l,b,i,v) \
				174	if (l == 1) b[i++] = (xmlChar) v; \
				175	else i += xmlCopyChar(l,&b[i],v)
				176
				177	/**
				178	* htmlCurrentChar:
				179	* @ctxt: the HTML parser context
				180	* @len: pointer to the length of the char read
				181	*
				182	* The current char value, if using UTF-8 this may actaully span multiple
				183	* bytes in the input buffer. Implement the end of line normalization:
				184	* 2.11 End-of-Line Handling
				185	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				186	* char, then the encoding converter is plugged in automatically.
				187	*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	188	* Returns the current char value and its length
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	189	*/
				190
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	191	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	192	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				193	if (ctxt->instate == XML_PARSER_EOF)
				194	return(0);
				195
				196	if (ctxt->token != 0) {
				197	*len = 0;
				198	return(ctxt->token);
				199	}
				200	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				201	/*
				202	* We are supposed to handle UTF8, check it's valid
				203	* From rfc2044: encoding of the Unicode values on UTF-8:
				204	*
				205	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				206	* 0000 0000-0000 007F 0xxxxxxx
				207	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				208	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				209	*
				210	* Check for the 0x110000 limit too
				211	*/
				212	const unsigned char *cur = ctxt->input->cur;
				213	unsigned char c;
				214	unsigned int val;
				215
				216	c = *cur;
				217	if (c & 0x80) {
				218	if (cur[1] == 0)
				219	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				220	if ((cur[1] & 0xc0) != 0x80)
				221	goto encoding_error;
				222	if ((c & 0xe0) == 0xe0) {
				223
				224	if (cur[2] == 0)
				225	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				226	if ((cur[2] & 0xc0) != 0x80)
				227	goto encoding_error;
				228	if ((c & 0xf0) == 0xf0) {
				229	if (cur[3] == 0)
				230	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				231	if (((c & 0xf8) != 0xf0) \|\|
				232	((cur[3] & 0xc0) != 0x80))
				233	goto encoding_error;
				234	/* 4-byte code */
				235	*len = 4;
				236	val = (cur[0] & 0x7) << 18;
				237	val \|= (cur[1] & 0x3f) << 12;
				238	val \|= (cur[2] & 0x3f) << 6;
				239	val \|= cur[3] & 0x3f;
				240	} else {
				241	/* 3-byte code */
				242	*len = 3;
				243	val = (cur[0] & 0xf) << 12;
				244	val \|= (cur[1] & 0x3f) << 6;
				245	val \|= cur[2] & 0x3f;
				246	}
				247	} else {
				248	/* 2-byte code */
				249	*len = 2;
				250	val = (cur[0] & 0x1f) << 6;
				251	val \|= cur[1] & 0x3f;
				252	}
				253	if (!IS_CHAR(val)) {
				254	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				255	if ((ctxt->sax != NULL) &&
				256	(ctxt->sax->error != NULL))
				257	ctxt->sax->error(ctxt->userData,
				258	"Char 0x%X out of allowed range\n", val);
				259	ctxt->wellFormed = 0;
				260	ctxt->disableSAX = 1;
				261	}
				262	return(val);
				263	} else {
				264	/* 1-byte code */
				265	*len = 1;
				266	return((int) *ctxt->input->cur);
				267	}
				268	}
				269	/*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	270	* Assume it's a fixed length encoding (1) with
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	271	* a compatibke encoding for the ASCII set, since
				272	* XML constructs only use < 128 chars
				273	*/
				274	*len = 1;
				275	if ((int) *ctxt->input->cur < 0x80)
				276	return((int) *ctxt->input->cur);
				277
				278	/*
				279	* Humm this is bad, do an automatic flow conversion
				280	*/
				281	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				282	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				283	return(xmlCurrentChar(ctxt, len));
				284
				285	encoding_error:
				286	/*
				287	* If we detect an UTF8 error that probably mean that the
				288	* input encoding didn't get properly advertized in the
				289	* declaration header. Report the error and switch the encoding
				290	* to ISO-Latin-1 (if you don't like this policy, just declare the
				291	* encoding !)
				292	*/
				293	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				294	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				295	ctxt->sax->error(ctxt->userData,
				296	"Input is not proper UTF-8, indicate encoding !\n");
				297	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				298	ctxt->input->cur[0], ctxt->input->cur[1],
				299	ctxt->input->cur[2], ctxt->input->cur[3]);
				300	}
				301
				302	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				303	*len = 1;
				304	return((int) *ctxt->input->cur);
				305	}
				306
				307	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	308	* htmlSkipBlankChars:
				309	* @ctxt: the HTML parser context
				310	*
				311	* skip all blanks character found at that point in the input streams.
				312	*
				313	* Returns the number of space chars skipped
				314	*/
				315
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	316	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	317	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				318	int res = 0;
				319
				320	while (IS_BLANK(*(ctxt->input->cur))) {
				321	if ((*ctxt->input->cur == 0) &&
				322	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				323	xmlPopInput(ctxt);
				324	} else {
				325	if (*(ctxt->input->cur) == '\n') {
				326	ctxt->input->line++; ctxt->input->col = 1;
				327	} else ctxt->input->col++;
				328	ctxt->input->cur++;
				329	ctxt->nbChars++;
				330	if (*ctxt->input->cur == 0)
				331	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				332	}
				333	res++;
				334	}
				335	return(res);
				336	}
				337
				338
				339
				340	/************************************************************************
				341	* *
				342	* The list of HTML elements and their properties *
				343	* *
				344	************************************************************************/
				345
				346	/*
				347	* Start Tag: 1 means the start tag can be ommited
				348	* End Tag: 1 means the end tag can be ommited
				349	* 2 means it's forbidden (empty elements)
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	350	* 3 means the tag is stylistic and should be closed easilly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	351	* Depr: this element is deprecated
				352	* DTD: 1 means that this element is valid only in the Loose DTD
				353	* 2 means that this element is valid only in the Frameset DTD
				354	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	355	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	356	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	357	static const htmlElemDesc
				358	html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	359	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				360	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				361	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				362	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				363	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				364	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				365	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				366	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				367	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				368	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				369	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				370	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				371	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				372	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				373	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				374	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				375	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				376	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				377	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				378	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				379	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				380	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				381	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				382	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				383	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				384	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				385	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				386	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				387	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				388	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				389	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				390	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				391	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				392	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				393	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				394	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				395	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				396	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				398	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				399	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				400	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				401	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				402	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				403	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				404	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				405	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				406	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				407	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				408	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				409	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				410	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				411	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				412	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				413	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				414	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				415	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				416	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				417	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				418	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				419	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				420	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				421	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
				422	{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
				423	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				424	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				425	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				426	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				427	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				428	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				429	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				430	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				431	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				432	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				433	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				434	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				435	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				436	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				437	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				438	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				439	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				440	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				441	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				442	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				443	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				444	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				445	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				446	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				447	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				448	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				449	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	450	};
				451
				452	/*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	453	* start tags that imply the end of current element
				454	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	455	static const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	456	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				457	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				458	"listing", "xmp", "head", NULL,
				459	"head", "p", NULL,
				460	"title", "p", NULL,
				461	"body", "head", "style", "link", "title", "p", NULL,
				462	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				463	"pre", "listing", "xmp", "head", "li", NULL,
				464	"hr", "p", "head", NULL,
				465	"h1", "p", "head", NULL,
				466	"h2", "p", "head", NULL,
				467	"h3", "p", "head", NULL,
				468	"h4", "p", "head", NULL,
				469	"h5", "p", "head", NULL,
				470	"h6", "p", "head", NULL,
				471	"dir", "p", "head", NULL,
				472	"address", "p", "head", "ul", NULL,
				473	"pre", "p", "head", "ul", NULL,
				474	"listing", "p", "head", NULL,
				475	"xmp", "p", "head", NULL,
				476	"blockquote", "p", "head", NULL,
				477	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				478	"xmp", "head", NULL,
				479	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				480	"head", "dd", NULL,
				481	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				482	"head", "dt", NULL,
				483	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				484	"listing", "xmp", NULL,
				485	"ol", "p", "head", "ul", NULL,
				486	"menu", "p", "head", "ul", NULL,
				487	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				488	"div", "p", "head", NULL,
				489	"noscript", "p", "head", NULL,
				490	"center", "font", "b", "i", "p", "head", NULL,
				491	"a", "a", NULL,
				492	"caption", "p", NULL,
				493	"colgroup", "caption", "colgroup", "col", "p", NULL,
				494	"col", "caption", "col", "p", NULL,
				495	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				496	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	497	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				498	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	499	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				500	"thead", "caption", "col", "colgroup", NULL,
				501	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				502	"tbody", "p", NULL,
				503	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				504	"tfoot", "tbody", "p", NULL,
				505	"optgroup", "option", NULL,
				506	"option", "option", NULL,
				507	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				508	"pre", "listing", "xmp", "a", NULL,
				509	NULL
				510	};
				511
				512	/*
				513	* The list of HTML elements which are supposed not to have
				514	* CDATA content and where a p element will be implied
				515	*
				516	* TODO: extend that list by reading the HTML SGML DtD on
				517	* implied paragraph
				518	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	519	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	520	"html",
				521	"head",
				522	"body",
				523	NULL
				524	};
				525
				526	/*
				527	* The list of HTML attributes which are of content %Script;
				528	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				529	* it assumes the name starts with 'on'
				530	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	531	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	532	"onclick",
				533	"ondblclick",
				534	"onmousedown",
				535	"onmouseup",
				536	"onmouseover",
				537	"onmousemove",
				538	"onmouseout",
				539	"onkeypress",
				540	"onkeydown",
				541	"onkeyup",
				542	"onload",
				543	"onunload",
				544	"onfocus",
				545	"onblur",
				546	"onsubmit",
				547	"onrest",
				548	"onchange",
				549	"onselect"
				550	};
				551
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end-tag priority) pair. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/* The entry with a NULL name ends the table and carries the default. */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	579
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	580	static const char** htmlStartCloseIndex[100];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	581	static int htmlStartCloseIndexinitialized = 0;
				582
				583	/************************************************************************
				584	* *
				585	* functions to handle HTML specific data *
				586	* *
				587	************************************************************************/
				588
				589	/**
				590	* htmlInitAutoClose:
				591	*
				592	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				593	* This is not reentrant. Call xmlInitParser() once before processing in
				594	* case of use in multithreaded programs.
				595	*/
				596	void
				597	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	598	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	599
				600	if (htmlStartCloseIndexinitialized) return;
				601
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	602	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				603	indx = 0;
				604	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				605	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	606	while (htmlStartClose[i] != NULL) i++;
				607	i++;
				608	}
				609	htmlStartCloseIndexinitialized = 1;
				610	}
				611
				612	/**
				613	* htmlTagLookup:
				614	* @tag: The tag name in lowercase
				615	*
				616	* Lookup the HTML tag in the ElementTable
				617	*
				618	* Returns the related htmlElemDescPtr or NULL if not found.
				619	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	620	const htmlElemDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	621	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	622	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	623
				624	for (i = 0; i < (sizeof(html40ElementTable) /
				625	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	626	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	627	return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	628	}
				629	return(NULL);
				630	}
				631
				632	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	633	* htmlGetEndPriority:
				634	* @name: The name of the element to look up the priority for.
				635	*
				636	* Return value: The "endtag" priority.
				637	**/
				638	static int
				639	htmlGetEndPriority (const xmlChar *name) {
				640	int i = 0;
				641
				642	while ((htmlEndPriority[i].name != NULL) &&
				643	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				644	i++;
				645
				646	return(htmlEndPriority[i].priority);
				647	}
				648
				649	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	650	* htmlCheckAutoClose:
				651	* @newtag: The new tag name
				652	* @oldtag: The old tag name
				653	*
				654	* Checks wether the new tag is one of the registered valid tags for closing old.
				655	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				656	*
				657	* Returns 0 if no, 1 if yes.
				658	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	659	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	660	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	661	int i, indx;
				662	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	663
				664	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				665
				666	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	667	for (indx = 0; indx < 100;indx++) {
				668	closed = htmlStartCloseIndex[indx];
				669	if (closed == NULL) return(0);
				670	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	671	}
				672
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	673	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	674	i++;
				675	while (htmlStartClose[i] != NULL) {
				676	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				677	return(1);
				678	}
				679	i++;
				680	}
				681	return(0);
				682	}
				683
				684	/**
				685	* htmlAutoCloseOnClose:
				686	* @ctxt: an HTML parser context
				687	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	688	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	689	*
				690	* The HTmL DtD allows an ending tag to implicitely close other tags.
				691	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	692	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	693	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	694	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	695	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	696	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	697
				698	#ifdef DEBUG
				699	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				700	for (i = 0;i < ctxt->nameNr;i++)
				701	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				702	#endif
				703
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	704	priority = htmlGetEndPriority (newtag);
				705
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	706	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	707
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	708	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	709	/*
				710	* A missplaced endtagad can only close elements with lower
				711	* or equal priority, so if we find an element with higher
				712	* priority before we find an element with
				713	* matching name, we just ignore this endtag
				714	*/
				715	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	716	}
				717	if (i < 0) return;
				718
				719	while (!xmlStrEqual(newtag, ctxt->name)) {
				720	info = htmlTagLookup(ctxt->name);
				721	if ((info == NULL) \|\| (info->endTag == 1)) {
				722	#ifdef DEBUG
				723	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				724	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	725	} else if (info->endTag == 3) {
				726	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	727	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	728
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	729	#endif
				730	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				731	ctxt->sax->error(ctxt->userData,
				732	"Opening and ending tag mismatch: %s and %s\n",
				733	newtag, ctxt->name);
				734	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	735	}
				736	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				737	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				738	oldname = htmlnamePop(ctxt);
				739	if (oldname != NULL) {
				740	#ifdef DEBUG
				741	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				742	#endif
				743	xmlFree(oldname);
				744	}
				745	}
				746	}
				747
				748	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	749	* htmlAutoCloseOnEnd:
				750	* @ctxt: an HTML parser context
				751	*
				752	* Close all remaining tags at the end of the stream
				753	*/
				754	static void
				755	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				756	xmlChar *oldname;
				757	int i;
				758
				759	if (ctxt->nameNr == 0)
				760	return;
				761	#ifdef DEBUG
				762	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				763	#endif
				764
				765	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				766	#ifdef DEBUG
				767	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				768	#endif
				769	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				770	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				771	oldname = htmlnamePop(ctxt);
				772	if (oldname != NULL) {
				773	#ifdef DEBUG
				774	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				775	#endif
				776	xmlFree(oldname);
				777	}
				778	}
				779	}
				780
				781	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	782	* htmlAutoClose:
				783	* @ctxt: an HTML parser context
				784	* @newtag: The new tag name or NULL
				785	*
				786	* The HTmL DtD allows a tag to implicitely close other tags.
				787	* The list is kept in htmlStartClose array. This function is
				788	* called when a new tag has been detected and generates the
				789	* appropriates closes if possible/needed.
				790	* If newtag is NULL this mean we are at the end of the resource
				791	* and we should check
				792	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	793	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	794	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				795	xmlChar *oldname;
				796	while ((newtag != NULL) && (ctxt->name != NULL) &&
				797	(htmlCheckAutoClose(newtag, ctxt->name))) {
				798	#ifdef DEBUG
				799	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				800	#endif
				801	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				802	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				803	oldname = htmlnamePop(ctxt);
				804	if (oldname != NULL) {
				805	#ifdef DEBUG
				806	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				807	#endif
				808	xmlFree(oldname);
				809	}
				810	}
				811	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	812	htmlAutoCloseOnEnd(ctxt);
				813	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	814	}
				815	while ((newtag == NULL) && (ctxt->name != NULL) &&
				816	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				817	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				818	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				819	#ifdef DEBUG
				820	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				821	#endif
				822	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				823	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				824	oldname = htmlnamePop(ctxt);
				825	if (oldname != NULL) {
				826	#ifdef DEBUG
				827	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				828	#endif
				829	xmlFree(oldname);
				830	}
				831	}
				832
				833	}
				834
				835	/**
				836	* htmlAutoCloseTag:
				837	* @doc: the HTML document
				838	* @name: The tag name
				839	* @elem: the HTML element
				840	*
				841	* The HTmL DtD allows a tag to implicitely close other tags.
				842	* The list is kept in htmlStartClose array. This function checks
				843	* if the element or one of it's children would autoclose the
				844	* given tag.
				845	*
				846	* Returns 1 if autoclose, 0 otherwise
				847	*/
				848	int
				849	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				850	htmlNodePtr child;
				851
				852	if (elem == NULL) return(1);
				853	if (xmlStrEqual(name, elem->name)) return(0);
				854	if (htmlCheckAutoClose(elem->name, name)) return(1);
				855	child = elem->children;
				856	while (child != NULL) {
				857	if (htmlAutoCloseTag(doc, name, child)) return(1);
				858	child = child->next;
				859	}
				860	return(0);
				861	}
				862
				863	/**
				864	* htmlIsAutoClosed:
				865	* @doc: the HTML document
				866	* @elem: the HTML element
				867	*
				868	* The HTmL DtD allows a tag to implicitely close other tags.
				869	* The list is kept in htmlStartClose array. This function checks
				870	* if a tag is autoclosed by one of it's child
				871	*
				872	* Returns 1 if autoclosed, 0 otherwise
				873	*/
				874	int
				875	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				876	htmlNodePtr child;
				877
				878	if (elem == NULL) return(1);
				879	child = elem->children;
				880	while (child != NULL) {
				881	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				882	child = child->next;
				883	}
				884	return(0);
				885	}
				886
				887	/**
				888	* htmlCheckImplied:
				889	* @ctxt: an HTML parser context
				890	* @newtag: The new tag name
				891	*
				892	* The HTML DtD allows a tag to exists only implicitely
				893	* called when a new tag has been detected and generates the
				894	* appropriates implicit tags if missing
				895	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	896	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	897	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				898	if (!htmlOmittedDefaultValue)
				899	return;
				900	if (xmlStrEqual(newtag, BAD_CAST"html"))
				901	return;
				902	if (ctxt->nameNr <= 0) {
				903	#ifdef DEBUG
				904	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				905	#endif
				906	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				907	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				908	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				909	}
				910	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				911	return;
				912	if ((ctxt->nameNr <= 1) &&
				913	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				914	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				915	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				916	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				917	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				918	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				919	/*
				920	* dropped OBJECT ... i you put it first BODY will be
				921	* assumed !
				922	*/
				923	#ifdef DEBUG
				924	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				925	#endif
				926	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				927	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				928	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				929	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				930	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				931	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				932	int i;
				933	for (i = 0;i < ctxt->nameNr;i++) {
				934	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				935	return;
				936	}
				937	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				938	return;
				939	}
				940	}
				941
				942	#ifdef DEBUG
				943	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				944	#endif
				945	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				946	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				947	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				948	}
				949	}
				950
				951	/**
				952	* htmlCheckParagraph
				953	* @ctxt: an HTML parser context
				954	*
				955	* Check whether a p element need to be implied before inserting
				956	* characters in the current element.
				957	*
				958	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				959	* in case of error.
				960	*/
				961
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	962	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	963	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				964	const xmlChar *tag;
				965	int i;
				966
				967	if (ctxt == NULL)
				968	return(-1);
				969	tag = ctxt->name;
				970	if (tag == NULL) {
				971	htmlAutoClose(ctxt, BAD_CAST"p");
				972	htmlCheckImplied(ctxt, BAD_CAST"p");
				973	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				974	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				975	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				976	return(1);
				977	}
				978	if (!htmlOmittedDefaultValue)
				979	return(0);
				980	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				981	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				982	#ifdef DEBUG
				983	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				984	#endif
				985	htmlAutoClose(ctxt, BAD_CAST"p");
				986	htmlCheckImplied(ctxt, BAD_CAST"p");
				987	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				988	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				989	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				990	return(1);
				991	}
				992	}
				993	return(0);
				994	}
				995
				996	/**
				997	* htmlIsScriptAttribute:
				998	* @name: an attribute name
				999	*
				1000	* Check if an attribute is of content type Script
				1001	*
				1002	* Returns 1 is the attribute is a script 0 otherwise
				1003	*/
				1004	int
				1005	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1006	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1007
				1008	if (name == NULL)
				1009	return(0);
				1010	/*
				1011	* all script attributes start with 'on'
				1012	*/
				1013	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1014	return(0);
				1015	for (i = 0;
				1016	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1017	i++) {
				1018	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1019	return(1);
				1020	}
				1021	return(0);
				1022	}
				1023
				1024	/************************************************************************
				1025	* *
				1026	* The list of HTML predefined entities *
				1027	* *
				1028	************************************************************************/
				1029
				1030
/*
 * html40EntitiesTable:
 *
 * Table of the predefined HTML character entities.  Each entry gives, in
 * order, the numeric character value (it matches the U+XXXX code quoted
 * in the description), the entity name as written between '&' and ';',
 * and a human readable description.  Entries are listed by increasing
 * value.  The table has no terminating sentinel: its users compute the
 * number of entries with sizeof(html40EntitiesTable) /
 * sizeof(html40EntitiesTable[0]) and scan it linearly.
 */
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1031	static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1032	/*
				1033	* the 4 absolute ones, plus apostrophe.
				1034	*/
				1035	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1036	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1037	{ 39, "apos", "single quote" },
				1038	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1039	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1040
				1041	/*
				1042	* A bunch still in the 128-255 range
				1043	* Replacing them depend really on the charset used.
				1044	*/
				1045	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1046	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1047	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1048	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1049	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1050	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1051	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1052	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1053	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1054	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1055	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1056	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1057	{ 172, "not", "not sign, U+00AC ISOnum" },
				1058	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1059	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1060	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1061	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1062	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1063	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1064	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1065	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1066	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1067	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1068	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1069	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1070	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1071	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1072	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1073	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1074	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1075	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1076	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1077	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1078	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1079	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1080	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1081	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1082	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1083	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1084	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1085	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1086	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1087	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1088	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1089	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1090	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1091	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1092	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1093	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1094	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1095	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1096	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1097	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1098	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1099	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1100	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1101	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1102	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1103	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1104	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1105	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1106	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1107	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1108	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1109	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1110	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1111	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1112	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1113	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1114	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1115	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1116	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1117	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1118	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1119	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1120	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1121	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1122	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1123	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1124	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1125	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1126	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1127	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1128	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1129	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1130	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1131	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1132	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1133	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1134	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1135	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1136	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1137	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1138	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1139	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1140	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1141
				1142	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1143	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1144	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1145	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1146	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1147
				1148	/*
				1149	* Anything below should really be kept as entities references
				1150	*/
				1151	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1152
				1153	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1154	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1155
				1156	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1157	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1158	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1159	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1160	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1161	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1162	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1163	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1164	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1165	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1166	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1167	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1168	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1169	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1170	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1171	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1172	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1173	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1174	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1175	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1176	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1177	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1178	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1179	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1180
				1181	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1182	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1183	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1184	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1185	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1186	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1187	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1188	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1189	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1190	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1191	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1192	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1193	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1194	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1195	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1196	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1197	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1198	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1199	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1200	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1201	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1202	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1203	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1204	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1205	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1206	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1207	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1208	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1209
				1210	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1211	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1212	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1213	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1214	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1215	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1216	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1217	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1218	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1219	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1220	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1221	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1222	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1223	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1224	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1225	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1226	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1227
				1228	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1229	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1230
				1231	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1232
				1233	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1234	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1235
				1236	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1237	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1238
				1239	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1240	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1241
				1242	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1243
				1244	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1245	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1246	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1247	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1248	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1249	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1250	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1251	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1252	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1253	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1254	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1255	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1256	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1257	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1258	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1259	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1260
				1261	{ 8704, "forall","for all, U+2200 ISOtech" },
				1262	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1263	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1264	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1265	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1266	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1267	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1268	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1269	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1270	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1271	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1272	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1273	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1274	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1275	{ 8734, "infin","infinity, U+221E ISOtech" },
				1276	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1277	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1278	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1279	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1280	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1281	{ 8747, "int", "integral, U+222B ISOtech" },
				1282	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1283	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1284	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1285	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1286	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1287	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1288	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1289	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1290	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1291	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1292	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1293	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1294	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1295	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1296	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1297	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1298	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1299	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1300	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1301	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1302	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1303	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1304	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1305	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1306
				1307	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1308	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1309	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1310	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1311
				1312	};
				1313
				1314	/************************************************************************
				1315	* *
				1316	* Commodity functions to handle entities *
				1317	* *
				1318	************************************************************************/
				1319
				1320	/*
				1321	* Macro used to grow the current buffer.
				1322	*/
				1323	#define growBuffer(buffer) { \
				1324	buffer##_size *= 2; \
				1325	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1326	if (buffer == NULL) { \
				1327	perror("realloc failed"); \
				1328	return(NULL); \
				1329	} \
				1330	}
				1331
				1332	/**
				1333	* htmlEntityLookup:
				1334	* @name: the entity name
				1335	*
				1336	* Lookup the given entity in EntitiesTable
				1337	*
				1338	* TODO: the linear scan is really ugly, an hash table is really needed.
				1339	*
				1340	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1341	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1342	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1343	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1344	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1345
				1346	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1347	sizeof(html40EntitiesTable[0]));i++) {
				1348	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1349	#ifdef DEBUG
				1350	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1351	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1352	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1353	}
				1354	}
				1355	return(NULL);
				1356	}
				1357
				1358	/**
				1359	* htmlEntityValueLookup:
				1360	* @value: the entity's unicode value
				1361	*
				1362	* Lookup the given entity in EntitiesTable
				1363	*
				1364	* TODO: the linear scan is really ugly, an hash table is really needed.
				1365	*
				1366	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1367	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1368	const htmlEntityDesc *
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1369	htmlEntityValueLookup(unsigned int value) {
				1370	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1371	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1372	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1373	#endif
				1374
				1375	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1376	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1377	if (html40EntitiesTable[i].value >= value) {
				1378	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1379	break;
				1380	#ifdef DEBUG
				1381	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1382	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1383	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1384	}
				1385	#ifdef DEBUG
				1386	if (lv > html40EntitiesTable[i].value) {
				1387	xmlGenericError(xmlGenericErrorContext,
				1388	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1389	lv, html40EntitiesTable[i].value);
				1390	}
				1391	lv = html40EntitiesTable[i].value;
				1392	#endif
				1393	}
				1394	return(NULL);
				1395	}
				1396
				1397	/**
				1398	* UTF8ToHtml:
				1399	* @out: a pointer to an array of bytes to store the result
				1400	* @outlen: the length of @out
				1401	* @in: a pointer to an array of UTF-8 chars
				1402	* @inlen: the length of @in
				1403	*
				1404	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1405	* plus HTML entities block of chars out.
				1406	*
				1407	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1408	* The value of @inlen after return is the number of octets consumed
				1409	* as the return value is positive, else unpredictiable.
				1410	* The value of @outlen after return is the number of octets consumed.
				1411	*/
				1412	int
				1413	UTF8ToHtml(unsigned char* out, int *outlen,
				1414	const unsigned char* in, int *inlen) {
				1415	const unsigned char* processed = in;
				1416	const unsigned char* outend;
				1417	const unsigned char* outstart = out;
				1418	const unsigned char* instart = in;
				1419	const unsigned char* inend;
				1420	unsigned int c, d;
				1421	int trailing;
				1422
				1423	if (in == NULL) {
				1424	/*
				1425	* initialization nothing to do
				1426	*/
				1427	*outlen = 0;
				1428	*inlen = 0;
				1429	return(0);
				1430	}
				1431	inend = in + (*inlen);
				1432	outend = out + (*outlen);
				1433	while (in < inend) {
				1434	d = *in++;
				1435	if (d < 0x80) { c= d; trailing= 0; }
				1436	else if (d < 0xC0) {
				1437	/* trailing byte in leading position */
				1438	*outlen = out - outstart;
				1439	*inlen = processed - instart;
				1440	return(-2);
				1441	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1442	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1443	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1444	else {
				1445	/* no chance for this in Ascii */
				1446	*outlen = out - outstart;
				1447	*inlen = processed - instart;
				1448	return(-2);
				1449	}
				1450
				1451	if (inend - in < trailing) {
				1452	break;
				1453	}
				1454
				1455	for ( ; trailing; trailing--) {
				1456	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1457	break;
				1458	c <<= 6;
				1459	c \|= d & 0x3F;
				1460	}
				1461
				1462	/* assertion: c is a single UTF-4 value */
				1463	if (c < 0x80) {
				1464	if (out + 1 >= outend)
				1465	break;
				1466	*out++ = c;
				1467	} else {
				1468	int len;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1469	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1470
				1471	/*
				1472	* Try to lookup a predefined HTML entity for it
				1473	*/
				1474
				1475	ent = htmlEntityValueLookup(c);
				1476	if (ent == NULL) {
				1477	/* no chance for this in Ascii */
				1478	*outlen = out - outstart;
				1479	*inlen = processed - instart;
				1480	return(-2);
				1481	}
				1482	len = strlen(ent->name);
				1483	if (out + 2 + len >= outend)
				1484	break;
				1485	*out++ = '&';
				1486	memcpy(out, ent->name, len);
				1487	out += len;
				1488	*out++ = ';';
				1489	}
				1490	processed = in;
				1491	}
				1492	*outlen = out - outstart;
				1493	*inlen = processed - instart;
				1494	return(0);
				1495	}
				1496
				1497	/**
				1498	* htmlEncodeEntities:
				1499	* @out: a pointer to an array of bytes to store the result
				1500	* @outlen: the length of @out
				1501	* @in: a pointer to an array of UTF-8 chars
				1502	* @inlen: the length of @in
				1503	* @quoteChar: the quote character to escape (' or ") or zero.
				1504	*
				1505	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1506	* plus HTML entities block of chars out.
				1507	*
				1508	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1509	* The value of @inlen after return is the number of octets consumed
				1510	* as the return value is positive, else unpredictiable.
				1511	* The value of @outlen after return is the number of octets consumed.
				1512	*/
				1513	int
				1514	htmlEncodeEntities(unsigned char* out, int *outlen,
				1515	const unsigned char* in, int *inlen, int quoteChar) {
				1516	const unsigned char* processed = in;
				1517	const unsigned char* outend = out + (*outlen);
				1518	const unsigned char* outstart = out;
				1519	const unsigned char* instart = in;
				1520	const unsigned char* inend = in + (*inlen);
				1521	unsigned int c, d;
				1522	int trailing;
				1523
				1524	while (in < inend) {
				1525	d = *in++;
				1526	if (d < 0x80) { c= d; trailing= 0; }
				1527	else if (d < 0xC0) {
				1528	/* trailing byte in leading position */
				1529	*outlen = out - outstart;
				1530	*inlen = processed - instart;
				1531	return(-2);
				1532	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1533	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1534	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1535	else {
				1536	/* no chance for this in Ascii */
				1537	*outlen = out - outstart;
				1538	*inlen = processed - instart;
				1539	return(-2);
				1540	}
				1541
				1542	if (inend - in < trailing)
				1543	break;
				1544
				1545	while (trailing--) {
				1546	if (((d= *in++) & 0xC0) != 0x80) {
				1547	*outlen = out - outstart;
				1548	*inlen = processed - instart;
				1549	return(-2);
				1550	}
				1551	c <<= 6;
				1552	c \|= d & 0x3F;
				1553	}
				1554
				1555	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1556	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1557	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1558	if (out >= outend)
				1559	break;
				1560	*out++ = c;
				1561	} else {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1562	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1563	const char *cp;
				1564	char nbuf[16];
				1565	int len;
				1566
				1567	/*
				1568	* Try to lookup a predefined HTML entity for it
				1569	*/
				1570	ent = htmlEntityValueLookup(c);
				1571	if (ent == NULL) {
				1572	sprintf(nbuf, "#%u", c);
				1573	cp = nbuf;
				1574	}
				1575	else
				1576	cp = ent->name;
				1577	len = strlen(cp);
				1578	if (out + 2 + len > outend)
				1579	break;
				1580	*out++ = '&';
				1581	memcpy(out, cp, len);
				1582	out += len;
				1583	*out++ = ';';
				1584	}
				1585	processed = in;
				1586	}
				1587	*outlen = out - outstart;
				1588	*inlen = processed - instart;
				1589	return(0);
				1590	}
				1591
				1592	/**
				1593	* htmlDecodeEntities:
				1594	* @ctxt: the parser context
				1595	* @len: the len to decode (in bytes !), -1 for no size limit
				1596	* @end: an end marker xmlChar, 0 if none
				1597	* @end2: an end marker xmlChar, 0 if none
				1598	* @end3: an end marker xmlChar, 0 if none
				1599	*
				1600	* Subtitute the HTML entities by their value
				1601	*
				1602	* DEPRECATED !!!!
				1603	*
				1604	* Returns A newly allocated string with the substitution done. The caller
				1605	* must deallocate it !
				1606	*/
				1607	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1608	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1609	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1610	static int deprecated = 0;
				1611	if (!deprecated) {
				1612	xmlGenericError(xmlGenericErrorContext,
				1613	"htmlDecodeEntities() deprecated function reached\n");
				1614	deprecated = 1;
				1615	}
				1616	return(NULL);
				1617	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1618	xmlChar *name = NULL;
				1619	xmlChar *buffer = NULL;
				1620	unsigned int buffer_size = 0;
				1621	unsigned int nbchars = 0;
				1622	htmlEntityDescPtr ent;
				1623	unsigned int max = (unsigned int) len;
				1624	int c,l;
				1625
				1626	if (ctxt->depth > 40) {
				1627	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1628	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1629	ctxt->sax->error(ctxt->userData,
				1630	"Detected entity reference loop\n");
				1631	ctxt->wellFormed = 0;
				1632	ctxt->disableSAX = 1;
				1633	return(NULL);
				1634	}
				1635
				1636	/*
				1637	* allocate a translation buffer.
				1638	*/
				1639	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1640	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1641	if (buffer == NULL) {
				1642	perror("xmlDecodeEntities: malloc failed");
				1643	return(NULL);
				1644	}
				1645
				1646	/*
				1647	* Ok loop until we reach one of the ending char or a size limit.
				1648	*/
				1649	c = CUR_CHAR(l);
				1650	while ((nbchars < max) && (c != end) &&
				1651	(c != end2) && (c != end3)) {
				1652
				1653	if (c == 0) break;
				1654	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1655	int val = htmlParseCharRef(ctxt);
				1656	COPY_BUF(0,buffer,nbchars,val);
				1657	NEXTL(l);
				1658	} else if ((c == '&') && (ctxt->token != '&')) {
				1659	ent = htmlParseEntityRef(ctxt, &name);
				1660	if (name != NULL) {
				1661	if (ent != NULL) {
				1662	int val = ent->value;
				1663	COPY_BUF(0,buffer,nbchars,val);
				1664	NEXTL(l);
				1665	} else {
				1666	const xmlChar *cur = name;
				1667
				1668	buffer[nbchars++] = '&';
				1669	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1670	growBuffer(buffer);
				1671	}
				1672	while (*cur != 0) {
				1673	buffer[nbchars++] = *cur++;
				1674	}
				1675	buffer[nbchars++] = ';';
				1676	}
				1677	}
				1678	} else {
				1679	COPY_BUF(l,buffer,nbchars,c);
				1680	NEXTL(l);
				1681	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1682	growBuffer(buffer);
				1683	}
				1684	}
				1685	c = CUR_CHAR(l);
				1686	}
				1687	buffer[nbchars++] = 0;
				1688	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1689	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1690	}
				1691
				1692	/************************************************************************
				1693	* *
				1694	* Commodity functions to handle streams *
				1695	* *
				1696	************************************************************************/
				1697
				1698	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1699	* htmlNewInputStream:
				1700	* @ctxt: an HTML parser context
				1701	*
				1702	* Create a new input stream structure
				1703	* Returns the new input stream or NULL
				1704	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1705	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1706	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1707	htmlParserInputPtr input;
				1708
				1709	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1710	if (input == NULL) {
				1711	ctxt->errNo = XML_ERR_NO_MEMORY;
				1712	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1713	ctxt->sax->error(ctxt->userData,
				1714	"malloc: couldn't allocate a new input stream\n");
				1715	return(NULL);
				1716	}
				1717	memset(input, 0, sizeof(htmlParserInput));
				1718	input->filename = NULL;
				1719	input->directory = NULL;
				1720	input->base = NULL;
				1721	input->cur = NULL;
				1722	input->buf = NULL;
				1723	input->line = 1;
				1724	input->col = 1;
				1725	input->buf = NULL;
				1726	input->free = NULL;
				1727	input->version = NULL;
				1728	input->consumed = 0;
				1729	input->length = 0;
				1730	return(input);
				1731	}
				1732
				1733
				1734	/************************************************************************
				1735	* *
				1736	* Commodity functions, cleanup needed ? *
				1737	* *
				1738	************************************************************************/
				1739
				1740	/**
				1741	* areBlanks:
				1742	* @ctxt: an HTML parser context
				1743	* @str: a xmlChar *
				1744	* @len: the size of @str
				1745	*
				1746	* Is this a sequence of blank chars that one can ignore ?
				1747	*
				1748	* Returns 1 if ignorable 0 otherwise.
				1749	*/
				1750
				1751	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1752	int i;
				1753	xmlNodePtr lastChild;
				1754
				1755	for (i = 0;i < len;i++)
				1756	if (!(IS_BLANK(str[i]))) return(0);
				1757
				1758	if (CUR == 0) return(1);
				1759	if (CUR != '<') return(0);
				1760	if (ctxt->name == NULL)
				1761	return(1);
				1762	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1763	return(1);
				1764	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1765	return(1);
				1766	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1767	return(1);
				1768	if (ctxt->node == NULL) return(0);
				1769	lastChild = xmlGetLastChild(ctxt->node);
				1770	if (lastChild == NULL) {
Daniel Veillard	7db3773	2001-07-12 01:20:08 +0000	[diff] [blame]	1771	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				1772	(ctxt->node->content != NULL)) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1773	} else if (xmlNodeIsText(lastChild)) {
				1774	return(0);
				1775	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1776	return(0);
				1777	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1778	return(0);
				1779	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1780	return(0);
				1781	}
				1782	return(1);
				1783	}
				1784
				1785	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1786	* htmlNewDocNoDtD:
				1787	* @URI: URI for the dtd, or NULL
				1788	* @ExternalID: the external ID of the DTD, or NULL
				1789	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1790	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				1791	* are NULL
				1792	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1793	* Returns a new document, do not intialize the DTD if not provided
				1794	*/
				1795	htmlDocPtr
				1796	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1797	xmlDocPtr cur;
				1798
				1799	/*
				1800	* Allocate a new document and fill the fields.
				1801	*/
				1802	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1803	if (cur == NULL) {
				1804	xmlGenericError(xmlGenericErrorContext,
				1805	"xmlNewDoc : malloc failed\n");
				1806	return(NULL);
				1807	}
				1808	memset(cur, 0, sizeof(xmlDoc));
				1809
				1810	cur->type = XML_HTML_DOCUMENT_NODE;
				1811	cur->version = NULL;
				1812	cur->intSubset = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1813	cur->doc = cur;
				1814	cur->name = NULL;
				1815	cur->children = NULL;
				1816	cur->extSubset = NULL;
				1817	cur->oldNs = NULL;
				1818	cur->encoding = NULL;
				1819	cur->standalone = 1;
				1820	cur->compression = 0;
				1821	cur->ids = NULL;
				1822	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1823	cur->_private = NULL;
Daniel Veillard	b6b0fd8	2001-10-22 12:31:11 +0000	[diff] [blame]	1824	if ((ExternalID != NULL) \|\|
				1825	(URI != NULL))
Daniel Veillard	5151c06	2001-10-23 13:10:19 +0000	[diff] [blame]	1826	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1827	return(cur);
				1828	}
				1829
				1830	/**
				1831	* htmlNewDoc:
				1832	* @URI: URI for the dtd, or NULL
				1833	* @ExternalID: the external ID of the DTD, or NULL
				1834	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1835	* Creates a new HTML document
				1836	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1837	* Returns a new document
				1838	*/
				1839	htmlDocPtr
				1840	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1841	if ((URI == NULL) && (ExternalID == NULL))
				1842	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1843	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1844	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1845
				1846	return(htmlNewDocNoDtD(URI, ExternalID));
				1847	}
				1848
				1849
				1850	/************************************************************************
				1851	* *
				1852	* The parser itself *
				1853	* Relates to http://www.w3.org/TR/html40 *
				1854	* *
				1855	************************************************************************/
				1856
				1857	/************************************************************************
				1858	* *
				1859	* The parser itself *
				1860	* *
				1861	************************************************************************/
				1862
				1863	/**
				1864	* htmlParseHTMLName:
				1865	* @ctxt: an HTML parser context
				1866	*
				1867	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1868	* since HTML names are not case-sensitive.
				1869	*
				1870	* Returns the Tag Name parsed or NULL
				1871	*/
				1872
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1873	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1874	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1875	xmlChar *ret = NULL;
				1876	int i = 0;
				1877	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1878
				1879	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1880	(CUR != ':')) return(NULL);
				1881
				1882	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1883	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1884	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1885	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1886	else loc[i] = CUR;
				1887	i++;
				1888
				1889	NEXT;
				1890	}
				1891
				1892	ret = xmlStrndup(loc, i);
				1893
				1894	return(ret);
				1895	}
				1896
				1897	/**
				1898	* htmlParseName:
				1899	* @ctxt: an HTML parser context
				1900	*
				1901	* parse an HTML name, this routine is case sensistive.
				1902	*
				1903	* Returns the Name parsed or NULL
				1904	*/
				1905
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1906	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1907	htmlParseName(htmlParserCtxtPtr ctxt) {
				1908	xmlChar buf[HTML_MAX_NAMELEN];
				1909	int len = 0;
				1910
				1911	GROW;
				1912	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1913	return(NULL);
				1914	}
				1915
				1916	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1917	(CUR == '.') \|\| (CUR == '-') \|\|
				1918	(CUR == '_') \|\| (CUR == ':') \|\|
				1919	(IS_COMBINING(CUR)) \|\|
				1920	(IS_EXTENDER(CUR))) {
				1921	buf[len++] = CUR;
				1922	NEXT;
				1923	if (len >= HTML_MAX_NAMELEN) {
				1924	xmlGenericError(xmlGenericErrorContext,
				1925	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1926	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1927	(CUR == '.') \|\| (CUR == '-') \|\|
				1928	(CUR == '_') \|\| (CUR == ':') \|\|
				1929	(IS_COMBINING(CUR)) \|\|
				1930	(IS_EXTENDER(CUR)))
				1931	NEXT;
				1932	break;
				1933	}
				1934	}
				1935	return(xmlStrndup(buf, len));
				1936	}
				1937
				1938	/**
				1939	* htmlParseHTMLAttribute:
				1940	* @ctxt: an HTML parser context
				1941	* @stop: a char stop value
				1942	*
				1943	* parse an HTML attribute value till the stop (quote), if
				1944	* stop is 0 then it stops at the first space
				1945	*
				1946	* Returns the attribute parsed or NULL
				1947	*/
				1948
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1949	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1950	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1951	xmlChar *buffer = NULL;
				1952	int buffer_size = 0;
				1953	xmlChar *out = NULL;
				1954	xmlChar *name = NULL;
				1955
				1956	xmlChar *cur = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1957	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1958
				1959	/*
				1960	* allocate a translation buffer.
				1961	*/
				1962	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1963	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1964	if (buffer == NULL) {
				1965	perror("htmlParseHTMLAttribute: malloc failed");
				1966	return(NULL);
				1967	}
				1968	out = buffer;
				1969
				1970	/*
				1971	* Ok loop until we reach one of the ending chars
				1972	*/
Daniel Veillard	957fdcf	2001-11-06 22:50:19 +0000	[diff] [blame^]	1973	while ((CUR != 0) && (CUR != stop)) {
				1974	if ((stop == 0) && (CUR == '>')) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1975	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1976	if (CUR == '&') {
				1977	if (NXT(1) == '#') {
				1978	unsigned int c;
				1979	int bits;
				1980
				1981	c = htmlParseCharRef(ctxt);
				1982	if (c < 0x80)
				1983	{ *out++ = c; bits= -6; }
				1984	else if (c < 0x800)
				1985	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1986	else if (c < 0x10000)
				1987	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1988	else
				1989	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1990
				1991	for ( ; bits >= 0; bits-= 6) {
				1992	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1993	}
				1994	} else {
				1995	ent = htmlParseEntityRef(ctxt, &name);
				1996	if (name == NULL) {
				1997	*out++ = '&';
				1998	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1999	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2000
				2001	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2002	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2003	}
				2004	} else if (ent == NULL) {
				2005	*out++ = '&';
				2006	cur = name;
				2007	while (*cur != 0) {
				2008	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2009	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2010
				2011	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2012	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2013	}
				2014	out++ = cur++;
				2015	}
				2016	xmlFree(name);
				2017	} else {
				2018	unsigned int c;
				2019	int bits;
				2020
				2021	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2022	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2023
				2024	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2025	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2026	}
				2027	c = (xmlChar)ent->value;
				2028	if (c < 0x80)
				2029	{ *out++ = c; bits= -6; }
				2030	else if (c < 0x800)
				2031	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2032	else if (c < 0x10000)
				2033	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2034	else
				2035	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2036
				2037	for ( ; bits >= 0; bits-= 6) {
				2038	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2039	}
				2040	xmlFree(name);
				2041	}
				2042	}
				2043	} else {
				2044	unsigned int c;
				2045	int bits, l;
				2046
				2047	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2048	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2049
				2050	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2051	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2052	}
				2053	c = CUR_CHAR(l);
				2054	if (c < 0x80)
				2055	{ *out++ = c; bits= -6; }
				2056	else if (c < 0x800)
				2057	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2058	else if (c < 0x10000)
				2059	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2060	else
				2061	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2062
				2063	for ( ; bits >= 0; bits-= 6) {
				2064	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2065	}
				2066	NEXT;
				2067	}
				2068	}
				2069	*out++ = 0;
				2070	return(buffer);
				2071	}
				2072
				2073	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2074	* htmlParseEntityRef:
				2075	* @ctxt: an HTML parser context
				2076	* @str: location to store the entity name
				2077	*
				2078	* parse an HTML ENTITY references
				2079	*
				2080	* [68] EntityRef ::= '&' Name ';'
				2081	*
				2082	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2083	* if non-NULL *str will have to be freed by the caller.
				2084	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2085	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2086	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2087	xmlChar *name;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2088	const htmlEntityDesc * ent = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2089	*str = NULL;
				2090
				2091	if (CUR == '&') {
				2092	NEXT;
				2093	name = htmlParseName(ctxt);
				2094	if (name == NULL) {
				2095	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2096	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2097	ctxt->wellFormed = 0;
				2098	} else {
				2099	GROW;
				2100	if (CUR == ';') {
				2101	*str = name;
				2102
				2103	/*
				2104	* Lookup the entity in the table.
				2105	*/
				2106	ent = htmlEntityLookup(name);
				2107	if (ent != NULL) /* OK that's ugly !!! */
				2108	NEXT;
				2109	} else {
				2110	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2111	ctxt->sax->error(ctxt->userData,
				2112	"htmlParseEntityRef: expecting ';'\n");
				2113	*str = name;
				2114	}
				2115	}
				2116	}
				2117	return(ent);
				2118	}
				2119
				2120	/**
				2121	* htmlParseAttValue:
				2122	* @ctxt: an HTML parser context
				2123	*
				2124	* parse a value for an attribute
				2125	* Note: the parser won't do substitution of entities here, this
				2126	* will be handled later in xmlStringGetNodeList, unless it was
				2127	* asked for ctxt->replaceEntities != 0
				2128	*
				2129	* Returns the AttValue parsed or NULL.
				2130	*/
				2131
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2132	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2133	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2134	xmlChar *ret = NULL;
				2135
				2136	if (CUR == '"') {
				2137	NEXT;
				2138	ret = htmlParseHTMLAttribute(ctxt, '"');
				2139	if (CUR != '"') {
				2140	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2141	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2142	ctxt->wellFormed = 0;
				2143	} else
				2144	NEXT;
				2145	} else if (CUR == '\'') {
				2146	NEXT;
				2147	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2148	if (CUR != '\'') {
				2149	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2150	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2151	ctxt->wellFormed = 0;
				2152	} else
				2153	NEXT;
				2154	} else {
				2155	/*
				2156	* That's an HTMLism, the attribute value may not be quoted
				2157	*/
				2158	ret = htmlParseHTMLAttribute(ctxt, 0);
				2159	if (ret == NULL) {
				2160	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2161	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2162	ctxt->wellFormed = 0;
				2163	}
				2164	}
				2165	return(ret);
				2166	}
				2167
				2168	/**
				2169	* htmlParseSystemLiteral:
				2170	* @ctxt: an HTML parser context
				2171	*
				2172	* parse an HTML Literal
				2173	*
				2174	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2175	*
				2176	* Returns the SystemLiteral parsed or NULL
				2177	*/
				2178
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2179	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2180	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2181	const xmlChar *q;
				2182	xmlChar *ret = NULL;
				2183
				2184	if (CUR == '"') {
				2185	NEXT;
				2186	q = CUR_PTR;
				2187	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2188	NEXT;
				2189	if (!IS_CHAR(CUR)) {
				2190	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2191	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2192	ctxt->wellFormed = 0;
				2193	} else {
				2194	ret = xmlStrndup(q, CUR_PTR - q);
				2195	NEXT;
				2196	}
				2197	} else if (CUR == '\'') {
				2198	NEXT;
				2199	q = CUR_PTR;
				2200	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2201	NEXT;
				2202	if (!IS_CHAR(CUR)) {
				2203	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2204	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2205	ctxt->wellFormed = 0;
				2206	} else {
				2207	ret = xmlStrndup(q, CUR_PTR - q);
				2208	NEXT;
				2209	}
				2210	} else {
				2211	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2212	ctxt->sax->error(ctxt->userData,
				2213	"SystemLiteral \" or ' expected\n");
				2214	ctxt->wellFormed = 0;
				2215	}
				2216
				2217	return(ret);
				2218	}
				2219
				2220	/**
				2221	* htmlParsePubidLiteral:
				2222	* @ctxt: an HTML parser context
				2223	*
				2224	* parse an HTML public literal
				2225	*
				2226	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2227	*
				2228	* Returns the PubidLiteral parsed or NULL.
				2229	*/
				2230
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2231	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2232	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2233	const xmlChar *q;
				2234	xmlChar *ret = NULL;
				2235	/*
				2236	* Name ::= (Letter \| '_') (NameChar)*
				2237	*/
				2238	if (CUR == '"') {
				2239	NEXT;
				2240	q = CUR_PTR;
				2241	while (IS_PUBIDCHAR(CUR)) NEXT;
				2242	if (CUR != '"') {
				2243	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2244	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2245	ctxt->wellFormed = 0;
				2246	} else {
				2247	ret = xmlStrndup(q, CUR_PTR - q);
				2248	NEXT;
				2249	}
				2250	} else if (CUR == '\'') {
				2251	NEXT;
				2252	q = CUR_PTR;
				2253	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2254	NEXT;
				2255	if (!IS_LETTER(CUR)) {
				2256	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2257	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2258	ctxt->wellFormed = 0;
				2259	} else {
				2260	ret = xmlStrndup(q, CUR_PTR - q);
				2261	NEXT;
				2262	}
				2263	} else {
				2264	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2265	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2266	ctxt->wellFormed = 0;
				2267	}
				2268
				2269	return(ret);
				2270	}
				2271
				2272	/**
				2273	* htmlParseScript:
				2274	* @ctxt: an HTML parser context
				2275	*
				2276	* parse the content of an HTML SCRIPT or STYLE element
				2277	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2278	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2279	* http://www.w3.org/TR/html4/types.html#type-script
				2280	* http://www.w3.org/TR/html4/types.html#h-6.15
				2281	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2282	*
				2283	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2284	* element and the value of intrinsic event attributes. User agents must
				2285	* not evaluate script data as HTML markup but instead must pass it on as
				2286	* data to a script engine.
				2287	* NOTES:
				2288	* - The content is passed like CDATA
				2289	* - the attributes for style and scripting "onXXX" are also described
				2290	* as CDATA but SGML allows entities references in attributes so their
				2291	* processing is identical as other attributes
				2292	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2293	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2294	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2295	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2296	int nbchar = 0;
				2297	xmlChar cur;
				2298
				2299	SHRINK;
				2300	cur = CUR;
				2301	while (IS_CHAR(cur)) {
				2302	if ((cur == '<') && (NXT(1) == '/')) {
				2303	/*
				2304	* One should break here, the specification is clear:
				2305	* Authors should therefore escape "</" within the content.
				2306	* Escape mechanisms are specific to each scripting or
				2307	* style sheet language.
				2308	*/
				2309	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2310	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2311	break; /* while */
				2312	}
				2313	buf[nbchar++] = cur;
				2314	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2315	if (ctxt->sax->cdataBlock!= NULL) {
				2316	/*
				2317	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2318	*/
				2319	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2320	}
				2321	nbchar = 0;
				2322	}
				2323	NEXT;
				2324	cur = CUR;
				2325	}
				2326	if (!(IS_CHAR(cur))) {
				2327	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2328	ctxt->sax->error(ctxt->userData,
				2329	"Invalid char in CDATA 0x%X\n", cur);
				2330	ctxt->wellFormed = 0;
				2331	NEXT;
				2332	}
				2333
				2334	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2335	if (ctxt->sax->cdataBlock!= NULL) {
				2336	/*
				2337	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2338	*/
				2339	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2340	}
				2341	}
				2342	}
				2343
				2344
				2345	/**
				2346	* htmlParseCharData:
				2347	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2348	*
				2349	* parse a CharData section.
				2350	* if we are within a CDATA section ']]>' marks an end of section.
				2351	*
				2352	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2353	*/
				2354
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2355	static void
				2356	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2357	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2358	int nbchar = 0;
				2359	int cur, l;
				2360
				2361	SHRINK;
				2362	cur = CUR_CHAR(l);
				2363	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2364	((cur != '&') \|\| (ctxt->token == '&')) &&
				2365	(IS_CHAR(cur))) {
				2366	COPY_BUF(l,buf,nbchar,cur);
				2367	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2368	/*
				2369	* Ok the segment is to be consumed as chars.
				2370	*/
				2371	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2372	if (areBlanks(ctxt, buf, nbchar)) {
				2373	if (ctxt->sax->ignorableWhitespace != NULL)
				2374	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2375	buf, nbchar);
				2376	} else {
				2377	htmlCheckParagraph(ctxt);
				2378	if (ctxt->sax->characters != NULL)
				2379	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2380	}
				2381	}
				2382	nbchar = 0;
				2383	}
				2384	NEXTL(l);
				2385	cur = CUR_CHAR(l);
				2386	}
				2387	if (nbchar != 0) {
				2388	/*
				2389	* Ok the segment is to be consumed as chars.
				2390	*/
				2391	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2392	if (areBlanks(ctxt, buf, nbchar)) {
				2393	if (ctxt->sax->ignorableWhitespace != NULL)
				2394	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2395	} else {
				2396	htmlCheckParagraph(ctxt);
				2397	if (ctxt->sax->characters != NULL)
				2398	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2399	}
				2400	}
Daniel Veillard	7cc95c0	2001-10-17 15:45:12 +0000	[diff] [blame]	2401	} else {
				2402	/*
				2403	* Loop detection
				2404	*/
				2405	if (cur == 0)
				2406	ctxt->instate = XML_PARSER_EOF;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2407	}
				2408	}
				2409
				2410	/**
				2411	* htmlParseExternalID:
				2412	* @ctxt: an HTML parser context
				2413	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2414	*
				2415	* Parse an External ID or a Public ID
				2416	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2417	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2418	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2419	*
				2420	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2421	*
				2422	* Returns the function returns SystemLiteral and in the second
				2423	* case publicID receives PubidLiteral, is strict is off
				2424	* it is possible to return NULL and have publicID set.
				2425	*/
				2426
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2427	static xmlChar *
				2428	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2429	xmlChar *URI = NULL;
				2430
				2431	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2432	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2433	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2434	SKIP(6);
				2435	if (!IS_BLANK(CUR)) {
				2436	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2437	ctxt->sax->error(ctxt->userData,
				2438	"Space required after 'SYSTEM'\n");
				2439	ctxt->wellFormed = 0;
				2440	}
				2441	SKIP_BLANKS;
				2442	URI = htmlParseSystemLiteral(ctxt);
				2443	if (URI == NULL) {
				2444	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2445	ctxt->sax->error(ctxt->userData,
				2446	"htmlParseExternalID: SYSTEM, no URI\n");
				2447	ctxt->wellFormed = 0;
				2448	}
				2449	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2450	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2451	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2452	SKIP(6);
				2453	if (!IS_BLANK(CUR)) {
				2454	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2455	ctxt->sax->error(ctxt->userData,
				2456	"Space required after 'PUBLIC'\n");
				2457	ctxt->wellFormed = 0;
				2458	}
				2459	SKIP_BLANKS;
				2460	*publicID = htmlParsePubidLiteral(ctxt);
				2461	if (*publicID == NULL) {
				2462	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2463	ctxt->sax->error(ctxt->userData,
				2464	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2465	ctxt->wellFormed = 0;
				2466	}
				2467	SKIP_BLANKS;
				2468	if ((CUR == '"') \|\| (CUR == '\'')) {
				2469	URI = htmlParseSystemLiteral(ctxt);
				2470	}
				2471	}
				2472	return(URI);
				2473	}
				2474
				2475	/**
				2476	* htmlParseComment:
				2477	* @ctxt: an HTML parser context
				2478	*
				2479	* Parse an XML (SGML) comment <!-- .... -->
				2480	*
				2481	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2482	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2483	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2484	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2485	xmlChar *buf = NULL;
				2486	int len;
				2487	int size = HTML_PARSER_BUFFER_SIZE;
				2488	int q, ql;
				2489	int r, rl;
				2490	int cur, l;
				2491	xmlParserInputState state;
				2492
				2493	/*
				2494	* Check that there is a comment right here.
				2495	*/
				2496	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2497	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2498
				2499	state = ctxt->instate;
				2500	ctxt->instate = XML_PARSER_COMMENT;
				2501	SHRINK;
				2502	SKIP(4);
				2503	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2504	if (buf == NULL) {
				2505	xmlGenericError(xmlGenericErrorContext,
				2506	"malloc of %d byte failed\n", size);
				2507	ctxt->instate = state;
				2508	return;
				2509	}
				2510	q = CUR_CHAR(ql);
				2511	NEXTL(ql);
				2512	r = CUR_CHAR(rl);
				2513	NEXTL(rl);
				2514	cur = CUR_CHAR(l);
				2515	len = 0;
				2516	while (IS_CHAR(cur) &&
				2517	((cur != '>') \|\|
				2518	(r != '-') \|\| (q != '-'))) {
				2519	if (len + 5 >= size) {
				2520	size *= 2;
				2521	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2522	if (buf == NULL) {
				2523	xmlGenericError(xmlGenericErrorContext,
				2524	"realloc of %d byte failed\n", size);
				2525	ctxt->instate = state;
				2526	return;
				2527	}
				2528	}
				2529	COPY_BUF(ql,buf,len,q);
				2530	q = r;
				2531	ql = rl;
				2532	r = cur;
				2533	rl = l;
				2534	NEXTL(l);
				2535	cur = CUR_CHAR(l);
				2536	if (cur == 0) {
				2537	SHRINK;
				2538	GROW;
				2539	cur = CUR_CHAR(l);
				2540	}
				2541	}
				2542	buf[len] = 0;
				2543	if (!IS_CHAR(cur)) {
				2544	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2545	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2546	ctxt->sax->error(ctxt->userData,
				2547	"Comment not terminated \n<!--%.50s\n", buf);
				2548	ctxt->wellFormed = 0;
				2549	xmlFree(buf);
				2550	} else {
				2551	NEXT;
				2552	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2553	(!ctxt->disableSAX))
				2554	ctxt->sax->comment(ctxt->userData, buf);
				2555	xmlFree(buf);
				2556	}
				2557	ctxt->instate = state;
				2558	}
				2559
				2560	/**
				2561	* htmlParseCharRef:
				2562	* @ctxt: an HTML parser context
				2563	*
				2564	* parse Reference declarations
				2565	*
				2566	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2567	* '&#x' [0-9a-fA-F]+ ';'
				2568	*
				2569	* Returns the value parsed (as an int)
				2570	*/
				2571	int
				2572	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2573	int val = 0;
				2574
				2575	if ((CUR == '&') && (NXT(1) == '#') &&
				2576	(NXT(2) == 'x')) {
				2577	SKIP(3);
				2578	while (CUR != ';') {
				2579	if ((CUR >= '0') && (CUR <= '9'))
				2580	val = val * 16 + (CUR - '0');
				2581	else if ((CUR >= 'a') && (CUR <= 'f'))
				2582	val = val * 16 + (CUR - 'a') + 10;
				2583	else if ((CUR >= 'A') && (CUR <= 'F'))
				2584	val = val * 16 + (CUR - 'A') + 10;
				2585	else {
				2586	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2587	ctxt->sax->error(ctxt->userData,
				2588	"htmlParseCharRef: invalid hexadecimal value\n");
				2589	ctxt->wellFormed = 0;
				2590	return(0);
				2591	}
				2592	NEXT;
				2593	}
				2594	if (CUR == ';')
				2595	NEXT;
				2596	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2597	SKIP(2);
				2598	while (CUR != ';') {
				2599	if ((CUR >= '0') && (CUR <= '9'))
				2600	val = val * 10 + (CUR - '0');
				2601	else {
				2602	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2603	ctxt->sax->error(ctxt->userData,
				2604	"htmlParseCharRef: invalid decimal value\n");
				2605	ctxt->wellFormed = 0;
				2606	return(0);
				2607	}
				2608	NEXT;
				2609	}
				2610	if (CUR == ';')
				2611	NEXT;
				2612	} else {
				2613	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2614	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2615	ctxt->wellFormed = 0;
				2616	}
				2617	/*
				2618	* Check the value IS_CHAR ...
				2619	*/
				2620	if (IS_CHAR(val)) {
				2621	return(val);
				2622	} else {
				2623	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2624	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2625	val);
				2626	ctxt->wellFormed = 0;
				2627	}
				2628	return(0);
				2629	}
				2630
				2631
				2632	/**
				2633	* htmlParseDocTypeDecl :
				2634	* @ctxt: an HTML parser context
				2635	*
				2636	* parse a DOCTYPE declaration
				2637	*
				2638	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2639	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2640	*/
				2641
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2642	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2643	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2644	xmlChar *name;
				2645	xmlChar *ExternalID = NULL;
				2646	xmlChar *URI = NULL;
				2647
				2648	/*
				2649	* We know that '<!DOCTYPE' has been detected.
				2650	*/
				2651	SKIP(9);
				2652
				2653	SKIP_BLANKS;
				2654
				2655	/*
				2656	* Parse the DOCTYPE name.
				2657	*/
				2658	name = htmlParseName(ctxt);
				2659	if (name == NULL) {
				2660	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2661	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2662	ctxt->wellFormed = 0;
				2663	}
				2664	/*
				2665	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2666	*/
				2667
				2668	SKIP_BLANKS;
				2669
				2670	/*
				2671	* Check for SystemID and ExternalID
				2672	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2673	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2674	SKIP_BLANKS;
				2675
				2676	/*
				2677	* We should be at the end of the DOCTYPE declaration.
				2678	*/
				2679	if (CUR != '>') {
				2680	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard	f6ed8bc	2001-10-02 09:22:47 +0000	[diff] [blame]	2681	ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2682	ctxt->wellFormed = 0;
				2683	/* We shouldn't try to resynchronize ... */
				2684	}
				2685	NEXT;
				2686
				2687	/*
				2688	* Create or update the document accordingly to the DOCTYPE
				2689	*/
				2690	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2691	(!ctxt->disableSAX))
				2692	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2693
				2694	/*
				2695	* Cleanup, since we don't use all those identifiers
				2696	*/
				2697	if (URI != NULL) xmlFree(URI);
				2698	if (ExternalID != NULL) xmlFree(ExternalID);
				2699	if (name != NULL) xmlFree(name);
				2700	}
				2701
				2702	/**
				2703	* htmlParseAttribute:
				2704	* @ctxt: an HTML parser context
				2705	* @value: a xmlChar ** used to store the value of the attribute
				2706	*
				2707	* parse an attribute
				2708	*
				2709	* [41] Attribute ::= Name Eq AttValue
				2710	*
				2711	* [25] Eq ::= S? '=' S?
				2712	*
				2713	* With namespace:
				2714	*
				2715	* [NS 11] Attribute ::= QName Eq AttValue
				2716	*
				2717	* Also the case QName == xmlns:??? is handled independently as a namespace
				2718	* definition.
				2719	*
				2720	* Returns the attribute name, and the value in *value.
				2721	*/
				2722
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2723	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2724	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2725	xmlChar name, val = NULL;
				2726
				2727	*value = NULL;
				2728	name = htmlParseHTMLName(ctxt);
				2729	if (name == NULL) {
				2730	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2731	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2732	ctxt->wellFormed = 0;
				2733	return(NULL);
				2734	}
				2735
				2736	/*
				2737	* read the value
				2738	*/
				2739	SKIP_BLANKS;
				2740	if (CUR == '=') {
				2741	NEXT;
				2742	SKIP_BLANKS;
				2743	val = htmlParseAttValue(ctxt);
				2744	/******
				2745	} else {
				2746	* TODO : some attribute must have values, some may not
				2747	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2748	ctxt->sax->warning(ctxt->userData,
				2749	"No value for attribute %s\n", name); */
				2750	}
				2751
				2752	*value = val;
				2753	return(name);
				2754	}
				2755
				2756	/**
				2757	* htmlCheckEncoding:
				2758	* @ctxt: an HTML parser context
				2759	* @attvalue: the attribute value
				2760	*
				2761	* Checks an http-equiv attribute from a Meta tag to detect
				2762	* the encoding
				2763	* If a new encoding is detected the parser is switched to decode
				2764	* it and pass UTF8
				2765	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2766	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2767	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2768	const xmlChar *encoding;
				2769
				2770	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2771	return;
				2772
				2773	/* do not change encoding */
				2774	if (ctxt->input->encoding != NULL)
				2775	return;
				2776
				2777	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2778	if (encoding != NULL) {
				2779	encoding += 8;
				2780	} else {
				2781	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2782	if (encoding != NULL)
				2783	encoding += 9;
				2784	}
				2785	if (encoding != NULL) {
				2786	xmlCharEncoding enc;
				2787	xmlCharEncodingHandlerPtr handler;
				2788
				2789	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2790
				2791	if (ctxt->input->encoding != NULL)
				2792	xmlFree((xmlChar *) ctxt->input->encoding);
				2793	ctxt->input->encoding = xmlStrdup(encoding);
				2794
				2795	enc = xmlParseCharEncoding((const char *) encoding);
				2796	/*
				2797	* registered set of known encodings
				2798	*/
				2799	if (enc != XML_CHAR_ENCODING_ERROR) {
				2800	xmlSwitchEncoding(ctxt, enc);
				2801	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2802	} else {
				2803	/*
				2804	* fallback for unknown encodings
				2805	*/
				2806	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2807	if (handler != NULL) {
				2808	xmlSwitchToEncoding(ctxt, handler);
				2809	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2810	} else {
				2811	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2812	}
				2813	}
				2814
				2815	if ((ctxt->input->buf != NULL) &&
				2816	(ctxt->input->buf->encoder != NULL) &&
				2817	(ctxt->input->buf->raw != NULL) &&
				2818	(ctxt->input->buf->buffer != NULL)) {
				2819	int nbchars;
				2820	int processed;
				2821
				2822	/*
				2823	* convert as much as possible to the parser reading buffer.
				2824	*/
				2825	processed = ctxt->input->cur - ctxt->input->base;
				2826	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2827	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2828	ctxt->input->buf->buffer,
				2829	ctxt->input->buf->raw);
				2830	if (nbchars < 0) {
				2831	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2832	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2833	ctxt->sax->error(ctxt->userData,
				2834	"htmlCheckEncoding: encoder error\n");
				2835	}
				2836	ctxt->input->base =
				2837	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2838	}
				2839	}
				2840	}
				2841
				2842	/**
				2843	* htmlCheckMeta:
				2844	* @ctxt: an HTML parser context
				2845	* @atts: the attributes values
				2846	*
				2847	* Checks an attributes from a Meta tag
				2848	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2849	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2850	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2851	int i;
				2852	const xmlChar att, value;
				2853	int http = 0;
				2854	const xmlChar *content = NULL;
				2855
				2856	if ((ctxt == NULL) \|\| (atts == NULL))
				2857	return;
				2858
				2859	i = 0;
				2860	att = atts[i++];
				2861	while (att != NULL) {
				2862	value = atts[i++];
				2863	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2864	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2865	http = 1;
				2866	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2867	content = value;
				2868	att = atts[i++];
				2869	}
				2870	if ((http) && (content != NULL))
				2871	htmlCheckEncoding(ctxt, content);
				2872
				2873	}
				2874
				2875	/**
				2876	* htmlParseStartTag:
				2877	* @ctxt: an HTML parser context
				2878	*
				2879	* parse a start of tag either for rule element or
				2880	* EmptyElement. In both case we don't parse the tag closing chars.
				2881	*
				2882	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2883	*
				2884	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2885	*
				2886	* With namespace:
				2887	*
				2888	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2889	*
				2890	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2891	*
				2892	*/
				2893
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2894	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2895	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2896	xmlChar *name;
				2897	xmlChar *attname;
				2898	xmlChar *attvalue;
				2899	const xmlChar **atts = NULL;
				2900	int nbatts = 0;
				2901	int maxatts = 0;
				2902	int meta = 0;
				2903	int i;
				2904
				2905	if (CUR != '<') return;
				2906	NEXT;
				2907
				2908	GROW;
				2909	name = htmlParseHTMLName(ctxt);
				2910	if (name == NULL) {
				2911	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2912	ctxt->sax->error(ctxt->userData,
				2913	"htmlParseStartTag: invalid element name\n");
				2914	ctxt->wellFormed = 0;
				2915	/* Dump the bogus tag like browsers do */
				2916	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2917	NEXT;
				2918	return;
				2919	}
				2920	if (xmlStrEqual(name, BAD_CAST"meta"))
				2921	meta = 1;
				2922
				2923	/*
				2924	* Check for auto-closure of HTML elements.
				2925	*/
				2926	htmlAutoClose(ctxt, name);
				2927
				2928	/*
				2929	* Check for implied HTML elements.
				2930	*/
				2931	htmlCheckImplied(ctxt, name);
				2932
				2933	/*
				2934	* Avoid html at any level > 0, head at any level != 1
				2935	* or any attempt to recurse body
				2936	*/
				2937	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2938	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2939	ctxt->sax->error(ctxt->userData,
				2940	"htmlParseStartTag: misplaced <html> tag\n");
				2941	ctxt->wellFormed = 0;
				2942	xmlFree(name);
				2943	return;
				2944	}
				2945	if ((ctxt->nameNr != 1) &&
				2946	(xmlStrEqual(name, BAD_CAST"head"))) {
				2947	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2948	ctxt->sax->error(ctxt->userData,
				2949	"htmlParseStartTag: misplaced <head> tag\n");
				2950	ctxt->wellFormed = 0;
				2951	xmlFree(name);
				2952	return;
				2953	}
				2954	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2955	int indx;
				2956	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2957	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2958	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2959	ctxt->sax->error(ctxt->userData,
				2960	"htmlParseStartTag: misplaced <body> tag\n");
				2961	ctxt->wellFormed = 0;
				2962	xmlFree(name);
				2963	return;
				2964	}
				2965	}
				2966	}
				2967
				2968	/*
				2969	* Now parse the attributes, it ends up with the ending
				2970	*
				2971	* (S Attribute)* S?
				2972	*/
				2973	SKIP_BLANKS;
				2974	while ((IS_CHAR(CUR)) &&
				2975	(CUR != '>') &&
				2976	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2977	long cons = ctxt->nbChars;
				2978
				2979	GROW;
				2980	attname = htmlParseAttribute(ctxt, &attvalue);
				2981	if (attname != NULL) {
				2982
				2983	/*
				2984	* Well formedness requires at most one declaration of an attribute
				2985	*/
				2986	for (i = 0; i < nbatts;i += 2) {
				2987	if (xmlStrEqual(atts[i], attname)) {
				2988	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2989	ctxt->sax->error(ctxt->userData,
				2990	"Attribute %s redefined\n",
				2991	attname);
				2992	ctxt->wellFormed = 0;
				2993	xmlFree(attname);
				2994	if (attvalue != NULL)
				2995	xmlFree(attvalue);
				2996	goto failed;
				2997	}
				2998	}
				2999
				3000	/*
				3001	* Add the pair to atts
				3002	*/
				3003	if (atts == NULL) {
				3004	maxatts = 10;
				3005	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				3006	if (atts == NULL) {
				3007	xmlGenericError(xmlGenericErrorContext,
				3008	"malloc of %ld byte failed\n",
				3009	maxatts * (long)sizeof(xmlChar *));
				3010	if (name != NULL) xmlFree(name);
				3011	return;
				3012	}
				3013	} else if (nbatts + 4 > maxatts) {
				3014	maxatts *= 2;
				3015	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3016	maxatts * sizeof(xmlChar *));
				3017	if (atts == NULL) {
				3018	xmlGenericError(xmlGenericErrorContext,
				3019	"realloc of %ld byte failed\n",
				3020	maxatts * (long)sizeof(xmlChar *));
				3021	if (name != NULL) xmlFree(name);
				3022	return;
				3023	}
				3024	}
				3025	atts[nbatts++] = attname;
				3026	atts[nbatts++] = attvalue;
				3027	atts[nbatts] = NULL;
				3028	atts[nbatts + 1] = NULL;
				3029	}
				3030	else {
				3031	/* Dump the bogus attribute string up to the next blank or
				3032	* the end of the tag. */
				3033	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3034	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3035	NEXT;
				3036	}
				3037
				3038	failed:
				3039	SKIP_BLANKS;
				3040	if (cons == ctxt->nbChars) {
				3041	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3042	ctxt->sax->error(ctxt->userData,
				3043	"htmlParseStartTag: problem parsing attributes\n");
				3044	ctxt->wellFormed = 0;
				3045	break;
				3046	}
				3047	}
				3048
				3049	/*
				3050	* Handle specific association to the META tag
				3051	*/
				3052	if (meta)
				3053	htmlCheckMeta(ctxt, atts);
				3054
				3055	/*
				3056	* SAX: Start of Element !
				3057	*/
				3058	htmlnamePush(ctxt, xmlStrdup(name));
				3059	#ifdef DEBUG
				3060	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3061	#endif
				3062	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3063	ctxt->sax->startElement(ctxt->userData, name, atts);
				3064
				3065	if (atts != NULL) {
				3066	for (i = 0;i < nbatts;i++) {
				3067	if (atts[i] != NULL)
				3068	xmlFree((xmlChar *) atts[i]);
				3069	}
				3070	xmlFree((void *) atts);
				3071	}
				3072	if (name != NULL) xmlFree(name);
				3073	}
				3074
				3075	/**
				3076	* htmlParseEndTag:
				3077	* @ctxt: an HTML parser context
				3078	*
				3079	* parse an end of tag
				3080	*
				3081	* [42] ETag ::= '</' Name S? '>'
				3082	*
				3083	* With namespace
				3084	*
				3085	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3086	*
				3087	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3088	*/
				3089
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3090	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3091	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3092	xmlChar *name;
				3093	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3094	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3095
				3096	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3097	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3098	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3099	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3100	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3101	}
				3102	SKIP(2);
				3103
				3104	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3105	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3106
				3107	/*
				3108	* We should definitely be at the ending "S? '>'" part
				3109	*/
				3110	SKIP_BLANKS;
				3111	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3112	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3113	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3114	ctxt->wellFormed = 0;
				3115	} else
				3116	NEXT;
				3117
				3118	/*
				3119	* If the name read is not one of the element in the parsing stack
				3120	* then return, it's just an error.
				3121	*/
				3122	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3123	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3124	}
				3125	if (i < 0) {
				3126	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3127	ctxt->sax->error(ctxt->userData,
				3128	"Unexpected end tag : %s\n", name);
				3129	xmlFree(name);
				3130	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3131	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3132	}
				3133
				3134
				3135	/*
				3136	* Check for auto-closure of HTML elements.
				3137	*/
				3138
				3139	htmlAutoCloseOnClose(ctxt, name);
				3140
				3141	/*
				3142	* Well formedness constraints, opening and closing must match.
				3143	* With the exception that the autoclose may have popped stuff out
				3144	* of the stack.
				3145	*/
				3146	if (!xmlStrEqual(name, ctxt->name)) {
				3147	#ifdef DEBUG
				3148	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3149	#endif
				3150	if ((ctxt->name != NULL) &&
				3151	(!xmlStrEqual(ctxt->name, name))) {
				3152	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3153	ctxt->sax->error(ctxt->userData,
				3154	"Opening and ending tag mismatch: %s and %s\n",
				3155	name, ctxt->name);
				3156	ctxt->wellFormed = 0;
				3157	}
				3158	}
				3159
				3160	/*
				3161	* SAX: End of Tag
				3162	*/
				3163	oldname = ctxt->name;
				3164	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3165	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3166	ctxt->sax->endElement(ctxt->userData, name);
				3167	oldname = htmlnamePop(ctxt);
				3168	if (oldname != NULL) {
				3169	#ifdef DEBUG
				3170	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3171	#endif
				3172	xmlFree(oldname);
				3173	#ifdef DEBUG
				3174	} else {
				3175	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3176	#endif
				3177	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3178	ret = 1;
				3179	} else {
				3180	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3181	}
				3182
				3183	if (name != NULL)
				3184	xmlFree(name);
				3185
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3186	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3187	}
				3188
				3189
				3190	/**
				3191	* htmlParseReference:
				3192	* @ctxt: an HTML parser context
				3193	*
				3194	* parse and handle entity references in content,
				3195	* this will end-up in a call to character() since this is either a
				3196	* CharRef, or a predefined entity.
				3197	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3198	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3199	htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3200	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3201	xmlChar out[6];
				3202	xmlChar *name;
				3203	if (CUR != '&') return;
				3204
				3205	if (NXT(1) == '#') {
				3206	unsigned int c;
				3207	int bits, i = 0;
				3208
				3209	c = htmlParseCharRef(ctxt);
				3210	if (c == 0)
				3211	return;
				3212
				3213	if (c < 0x80) { out[i++]= c; bits= -6; }
				3214	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3215	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3216	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3217
				3218	for ( ; bits >= 0; bits-= 6) {
				3219	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3220	}
				3221	out[i] = 0;
				3222
				3223	htmlCheckParagraph(ctxt);
				3224	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3225	ctxt->sax->characters(ctxt->userData, out, i);
				3226	} else {
				3227	ent = htmlParseEntityRef(ctxt, &name);
				3228	if (name == NULL) {
				3229	htmlCheckParagraph(ctxt);
				3230	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3231	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3232	return;
				3233	}
				3234	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3235	htmlCheckParagraph(ctxt);
				3236	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3237	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3238	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3239	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3240	}
				3241	} else {
				3242	unsigned int c;
				3243	int bits, i = 0;
				3244
				3245	c = ent->value;
				3246	if (c < 0x80)
				3247	{ out[i++]= c; bits= -6; }
				3248	else if (c < 0x800)
				3249	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3250	else if (c < 0x10000)
				3251	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3252	else
				3253	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3254
				3255	for ( ; bits >= 0; bits-= 6) {
				3256	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3257	}
				3258	out[i] = 0;
				3259
				3260	htmlCheckParagraph(ctxt);
				3261	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3262	ctxt->sax->characters(ctxt->userData, out, i);
				3263	}
				3264	xmlFree(name);
				3265	}
				3266	}
				3267
				3268	/**
				3269	* htmlParseContent:
				3270	* @ctxt: an HTML parser context
				3271	* @name: the node name
				3272	*
				3273	* Parse a content: comment, sub-element, reference or text.
				3274	*
				3275	*/
				3276
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3277	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3278	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3279	xmlChar *currentNode;
				3280	int depth;
				3281
				3282	currentNode = xmlStrdup(ctxt->name);
				3283	depth = ctxt->nameNr;
				3284	while (1) {
				3285	long cons = ctxt->nbChars;
				3286
				3287	GROW;
				3288	/*
				3289	* Our tag or one of it's parent or children is ending.
				3290	*/
				3291	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3292	if (htmlParseEndTag(ctxt) &&
				3293	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3294	if (currentNode != NULL)
				3295	xmlFree(currentNode);
				3296	return;
				3297	}
				3298	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3299	}
				3300
				3301	/*
				3302	* Has this node been popped out during parsing of
				3303	* the next element
				3304	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3305	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3306	(!xmlStrEqual(currentNode, ctxt->name)))
				3307	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3308	if (currentNode != NULL) xmlFree(currentNode);
				3309	return;
				3310	}
				3311
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3312	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3313	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3314	/*
				3315	* Handle SCRIPT/STYLE separately
				3316	*/
				3317	htmlParseScript(ctxt);
				3318	} else {
				3319	/*
				3320	* Sometimes DOCTYPE arrives in the middle of the document
				3321	*/
				3322	if ((CUR == '<') && (NXT(1) == '!') &&
				3323	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3324	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3325	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3326	(UPP(8) == 'E')) {
				3327	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3328	ctxt->sax->error(ctxt->userData,
				3329	"Misplaced DOCTYPE declaration\n");
				3330	ctxt->wellFormed = 0;
				3331	htmlParseDocTypeDecl(ctxt);
				3332	}
				3333
				3334	/*
				3335	* First case : a comment
				3336	*/
				3337	if ((CUR == '<') && (NXT(1) == '!') &&
				3338	(NXT(2) == '-') && (NXT(3) == '-')) {
				3339	htmlParseComment(ctxt);
				3340	}
				3341
				3342	/*
				3343	* Second case : a sub-element.
				3344	*/
				3345	else if (CUR == '<') {
				3346	htmlParseElement(ctxt);
				3347	}
				3348
				3349	/*
				3350	* Third case : a reference. If if has not been resolved,
				3351	* parsing returns it's Name, create the node
				3352	*/
				3353	else if (CUR == '&') {
				3354	htmlParseReference(ctxt);
				3355	}
				3356
				3357	/*
				3358	* Fourth : end of the resource
				3359	*/
				3360	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3361	htmlAutoCloseOnEnd(ctxt);
				3362	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3363	}
				3364
				3365	/*
				3366	* Last case, text. Note that References are handled directly.
				3367	*/
				3368	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3369	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3370	}
				3371
				3372	if (cons == ctxt->nbChars) {
				3373	if (ctxt->node != NULL) {
				3374	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3375	ctxt->sax->error(ctxt->userData,
				3376	"detected an error in element content\n");
				3377	ctxt->wellFormed = 0;
				3378	}
				3379	break;
				3380	}
				3381	}
				3382	GROW;
				3383	}
				3384	if (currentNode != NULL) xmlFree(currentNode);
				3385	}
				3386
				3387	/**
				3388	* htmlParseElement:
				3389	* @ctxt: an HTML parser context
				3390	*
				3391	* parse an HTML element, this is highly recursive
				3392	*
				3393	* [39] element ::= EmptyElemTag \| STag content ETag
				3394	*
				3395	* [41] Attribute ::= Name Eq AttValue
				3396	*/
				3397
				3398	void
				3399	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3400	xmlChar *name;
				3401	xmlChar *currentNode = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3402	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3403	htmlParserNodeInfo node_info;
				3404	xmlChar *oldname;
				3405	int depth = ctxt->nameNr;
Daniel Veillard	3fbe8e3	2001-10-06 13:30:33 +0000	[diff] [blame]	3406	const xmlChar *oldptr;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3407
				3408	/* Capture start position */
				3409	if (ctxt->record_info) {
				3410	node_info.begin_pos = ctxt->input->consumed +
				3411	(CUR_PTR - ctxt->input->base);
				3412	node_info.begin_line = ctxt->input->line;
				3413	}
				3414
				3415	oldname = xmlStrdup(ctxt->name);
				3416	htmlParseStartTag(ctxt);
				3417	name = ctxt->name;
				3418	#ifdef DEBUG
				3419	if (oldname == NULL)
				3420	xmlGenericError(xmlGenericErrorContext,
				3421	"Start of element %s\n", name);
				3422	else if (name == NULL)
				3423	xmlGenericError(xmlGenericErrorContext,
				3424	"Start of element failed, was %s\n", oldname);
				3425	else
				3426	xmlGenericError(xmlGenericErrorContext,
				3427	"Start of element %s, was %s\n", name, oldname);
				3428	#endif
				3429	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3430	(name == NULL)) {
				3431	if (CUR == '>')
				3432	NEXT;
				3433	if (oldname != NULL)
				3434	xmlFree(oldname);
				3435	return;
				3436	}
				3437	if (oldname != NULL)
				3438	xmlFree(oldname);
				3439
				3440	/*
				3441	* Lookup the info for that element.
				3442	*/
				3443	info = htmlTagLookup(name);
				3444	if (info == NULL) {
				3445	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3446	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3447	name);
				3448	ctxt->wellFormed = 0;
				3449	} else if (info->depr) {
				3450	/***************************
				3451	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3452	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3453	name);
				3454	***************************/
				3455	}
				3456
				3457	/*
				3458	* Check for an Empty Element labelled the XML/SGML way
				3459	*/
				3460	if ((CUR == '/') && (NXT(1) == '>')) {
				3461	SKIP(2);
				3462	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3463	ctxt->sax->endElement(ctxt->userData, name);
				3464	oldname = htmlnamePop(ctxt);
				3465	#ifdef DEBUG
				3466	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3467	#endif
				3468	if (oldname != NULL)
				3469	xmlFree(oldname);
				3470	return;
				3471	}
				3472
				3473	if (CUR == '>') {
				3474	NEXT;
				3475	} else {
				3476	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3477	ctxt->sax->error(ctxt->userData,
				3478	"Couldn't find end of Start Tag %s\n",
				3479	name);
				3480	ctxt->wellFormed = 0;
				3481
				3482	/*
				3483	* end of parsing of this node.
				3484	*/
				3485	if (xmlStrEqual(name, ctxt->name)) {
				3486	nodePop(ctxt);
				3487	oldname = htmlnamePop(ctxt);
				3488	#ifdef DEBUG
				3489	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3490	#endif
				3491	if (oldname != NULL)
				3492	xmlFree(oldname);
				3493	}
				3494
				3495	/*
				3496	* Capture end position and add node
				3497	*/
				3498	if ( currentNode != NULL && ctxt->record_info ) {
				3499	node_info.end_pos = ctxt->input->consumed +
				3500	(CUR_PTR - ctxt->input->base);
				3501	node_info.end_line = ctxt->input->line;
				3502	node_info.node = ctxt->node;
				3503	xmlParserAddNodeInfo(ctxt, &node_info);
				3504	}
				3505	return;
				3506	}
				3507
				3508	/*
				3509	* Check for an Empty Element from DTD definition
				3510	*/
				3511	if ((info != NULL) && (info->empty)) {
				3512	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3513	ctxt->sax->endElement(ctxt->userData, name);
				3514	oldname = htmlnamePop(ctxt);
				3515	#ifdef DEBUG
				3516	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3517	#endif
				3518	if (oldname != NULL)
				3519	xmlFree(oldname);
				3520	return;
				3521	}
				3522
				3523	/*
				3524	* Parse the content of the element:
				3525	*/
				3526	currentNode = xmlStrdup(ctxt->name);
				3527	depth = ctxt->nameNr;
				3528	while (IS_CHAR(CUR)) {
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3529	oldptr = ctxt->input->cur;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3530	htmlParseContent(ctxt);
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3531	if (oldptr==ctxt->input->cur) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3532	if (ctxt->nameNr < depth) break;
				3533	}
				3534
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3535	/*
				3536	* Capture end position and add node
				3537	*/
				3538	if ( currentNode != NULL && ctxt->record_info ) {
				3539	node_info.end_pos = ctxt->input->consumed +
				3540	(CUR_PTR - ctxt->input->base);
				3541	node_info.end_line = ctxt->input->line;
				3542	node_info.node = ctxt->node;
				3543	xmlParserAddNodeInfo(ctxt, &node_info);
				3544	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3545	if (!IS_CHAR(CUR)) {
				3546	htmlAutoCloseOnEnd(ctxt);
				3547	}
				3548
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3549	if (currentNode != NULL)
				3550	xmlFree(currentNode);
				3551	}
				3552
				3553	/**
				3554	* htmlParseDocument :
				3555	* @ctxt: an HTML parser context
				3556	*
				3557	* parse an HTML document (and build a tree if using the standard SAX
				3558	* interface).
				3559	*
				3560	* Returns 0, -1 in case of error. the parser context is augmented
				3561	* as a result of the parsing.
				3562	*/
				3563
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3564	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3565	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3566	xmlDtdPtr dtd;
				3567
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	3568	xmlInitParser();
				3569
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3570	htmlDefaultSAXHandlerInit();
				3571	ctxt->html = 1;
				3572
				3573	GROW;
				3574	/*
				3575	* SAX: beginning of the document processing.
				3576	*/
				3577	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3578	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3579
				3580	/*
				3581	* Wipe out everything which is before the first '<'
				3582	*/
				3583	SKIP_BLANKS;
				3584	if (CUR == 0) {
				3585	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3586	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3587	ctxt->wellFormed = 0;
				3588	}
				3589
				3590	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3591	ctxt->sax->startDocument(ctxt->userData);
				3592
				3593
				3594	/*
				3595	* Parse possible comments before any content
				3596	*/
				3597	while ((CUR == '<') && (NXT(1) == '!') &&
				3598	(NXT(2) == '-') && (NXT(3) == '-')) {
				3599	htmlParseComment(ctxt);
				3600	SKIP_BLANKS;
				3601	}
				3602
				3603
				3604	/*
				3605	* Then possibly doc type declaration(s) and more Misc
				3606	* (doctypedecl Misc*)?
				3607	*/
				3608	if ((CUR == '<') && (NXT(1) == '!') &&
				3609	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3610	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3611	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3612	(UPP(8) == 'E')) {
				3613	htmlParseDocTypeDecl(ctxt);
				3614	}
				3615	SKIP_BLANKS;
				3616
				3617	/*
				3618	* Parse possible comments before any content
				3619	*/
				3620	while ((CUR == '<') && (NXT(1) == '!') &&
				3621	(NXT(2) == '-') && (NXT(3) == '-')) {
				3622	htmlParseComment(ctxt);
				3623	SKIP_BLANKS;
				3624	}
				3625
				3626	/*
				3627	* Time to start parsing the tree itself
				3628	*/
				3629	htmlParseContent(ctxt);
				3630
				3631	/*
				3632	* autoclose
				3633	*/
				3634	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3635	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3636
				3637
				3638	/*
				3639	* SAX: end of the document processing.
				3640	*/
				3641	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3642	ctxt->sax->endDocument(ctxt->userData);
				3643
				3644	if (ctxt->myDoc != NULL) {
				3645	dtd = xmlGetIntSubset(ctxt->myDoc);
				3646	if (dtd == NULL)
				3647	ctxt->myDoc->intSubset =
				3648	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3649	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3650	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3651	}
				3652	if (! ctxt->wellFormed) return(-1);
				3653	return(0);
				3654	}
				3655
				3656
				3657	/************************************************************************
				3658	* *
				3659	* Parser contexts handling *
				3660	* *
				3661	************************************************************************/
				3662
				3663	/**
				3664	* xmlInitParserCtxt:
				3665	* @ctxt: an HTML parser context
				3666	*
				3667	* Initialize a parser context
				3668	*/
				3669
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3670	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3671	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3672	{
				3673	htmlSAXHandler *sax;
				3674
				3675	if (ctxt == NULL) return;
				3676	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3677
				3678	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3679	if (sax == NULL) {
				3680	xmlGenericError(xmlGenericErrorContext,
				3681	"htmlInitParserCtxt: out of memory\n");
				3682	}
				3683	else
				3684	memset(sax, 0, sizeof(htmlSAXHandler));
				3685
				3686	/* Allocate the Input stack */
				3687	ctxt->inputTab = (htmlParserInputPtr *)
				3688	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3689	if (ctxt->inputTab == NULL) {
				3690	xmlGenericError(xmlGenericErrorContext,
				3691	"htmlInitParserCtxt: out of memory\n");
				3692	ctxt->inputNr = 0;
				3693	ctxt->inputMax = 0;
				3694	ctxt->input = NULL;
				3695	return;
				3696	}
				3697	ctxt->inputNr = 0;
				3698	ctxt->inputMax = 5;
				3699	ctxt->input = NULL;
				3700	ctxt->version = NULL;
				3701	ctxt->encoding = NULL;
				3702	ctxt->standalone = -1;
				3703	ctxt->instate = XML_PARSER_START;
				3704
				3705	/* Allocate the Node stack */
				3706	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3707	if (ctxt->nodeTab == NULL) {
				3708	xmlGenericError(xmlGenericErrorContext,
				3709	"htmlInitParserCtxt: out of memory\n");
				3710	ctxt->nodeNr = 0;
				3711	ctxt->nodeMax = 0;
				3712	ctxt->node = NULL;
				3713	ctxt->inputNr = 0;
				3714	ctxt->inputMax = 0;
				3715	ctxt->input = NULL;
				3716	return;
				3717	}
				3718	ctxt->nodeNr = 0;
				3719	ctxt->nodeMax = 10;
				3720	ctxt->node = NULL;
				3721
				3722	/* Allocate the Name stack */
				3723	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3724	if (ctxt->nameTab == NULL) {
				3725	xmlGenericError(xmlGenericErrorContext,
				3726	"htmlInitParserCtxt: out of memory\n");
				3727	ctxt->nameNr = 0;
				3728	ctxt->nameMax = 10;
				3729	ctxt->name = NULL;
				3730	ctxt->nodeNr = 0;
				3731	ctxt->nodeMax = 0;
				3732	ctxt->node = NULL;
				3733	ctxt->inputNr = 0;
				3734	ctxt->inputMax = 0;
				3735	ctxt->input = NULL;
				3736	return;
				3737	}
				3738	ctxt->nameNr = 0;
				3739	ctxt->nameMax = 10;
				3740	ctxt->name = NULL;
				3741
				3742	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3743	else {
				3744	ctxt->sax = sax;
				3745	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3746	}
				3747	ctxt->userData = ctxt;
				3748	ctxt->myDoc = NULL;
				3749	ctxt->wellFormed = 1;
				3750	ctxt->replaceEntities = 0;
Daniel Veillard	635ef72	2001-10-29 11:48:19 +0000	[diff] [blame]	3751	ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3752	ctxt->html = 1;
				3753	ctxt->record_info = 0;
				3754	ctxt->validate = 0;
				3755	ctxt->nbChars = 0;
				3756	ctxt->checkIndex = 0;
Daniel Veillard	dc2cee2	2001-08-22 16:30:37 +0000	[diff] [blame]	3757	ctxt->catalogs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3758	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3759	}
				3760
				3761	/**
				3762	* htmlFreeParserCtxt:
				3763	* @ctxt: an HTML parser context
				3764	*
				3765	* Free all the memory used by a parser context. However the parsed
				3766	* document in ctxt->myDoc is not freed.
				3767	*/
				3768
				3769	void
				3770	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3771	{
				3772	xmlFreeParserCtxt(ctxt);
				3773	}
				3774
				3775	/**
				3776	* htmlCreateDocParserCtxt :
				3777	* @cur: a pointer to an array of xmlChar
				3778	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3779	*
				3780	* Create a parser context for an HTML document.
				3781	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3782	* TODO: check the need to add encoding handling there
				3783	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3784	* Returns the new parser context or NULL
				3785	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3786	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3787	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3788	htmlParserCtxtPtr ctxt;
				3789	htmlParserInputPtr input;
				3790	/* htmlCharEncoding enc; */
				3791
				3792	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3793	if (ctxt == NULL) {
				3794	perror("malloc");
				3795	return(NULL);
				3796	}
				3797	htmlInitParserCtxt(ctxt);
				3798	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3799	if (input == NULL) {
				3800	perror("malloc");
				3801	xmlFree(ctxt);
				3802	return(NULL);
				3803	}
				3804	memset(input, 0, sizeof(htmlParserInput));
				3805
				3806	input->line = 1;
				3807	input->col = 1;
				3808	input->base = cur;
				3809	input->cur = cur;
				3810
				3811	inputPush(ctxt, input);
				3812	return(ctxt);
				3813	}
				3814
				3815	/************************************************************************
				3816	* *
				3817	* Progressive parsing interfaces *
				3818	* *
				3819	************************************************************************/
				3820
				3821	/**
				3822	* htmlParseLookupSequence:
				3823	* @ctxt: an HTML parser context
				3824	* @first: the first char to lookup
				3825	* @next: the next char to lookup or zero
				3826	* @third: the next char to lookup or zero
				3827	*
				3828	* Try to find if a sequence (first, next, third) or just (first next) or
				3829	* (first) is available in the input stream.
				3830	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3831	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3832	* parser, do not use liberally.
				3833	* This is basically similar to xmlParseLookupSequence()
				3834	*
				3835	* Returns the index to the current parsing point if the full sequence
				3836	* is available, -1 otherwise.
				3837	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3838	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3839	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3840	xmlChar next, xmlChar third) {
				3841	int base, len;
				3842	htmlParserInputPtr in;
				3843	const xmlChar *buf;
				3844
				3845	in = ctxt->input;
				3846	if (in == NULL) return(-1);
				3847	base = in->cur - in->base;
				3848	if (base < 0) return(-1);
				3849	if (ctxt->checkIndex > base)
				3850	base = ctxt->checkIndex;
				3851	if (in->buf == NULL) {
				3852	buf = in->base;
				3853	len = in->length;
				3854	} else {
				3855	buf = in->buf->buffer->content;
				3856	len = in->buf->buffer->use;
				3857	}
				3858	/* take into account the sequence length */
				3859	if (third) len -= 2;
				3860	else if (next) len --;
				3861	for (;base < len;base++) {
				3862	if (buf[base] == first) {
				3863	if (third != 0) {
				3864	if ((buf[base + 1] != next) \|\|
				3865	(buf[base + 2] != third)) continue;
				3866	} else if (next != 0) {
				3867	if (buf[base + 1] != next) continue;
				3868	}
				3869	ctxt->checkIndex = 0;
				3870	#ifdef DEBUG_PUSH
				3871	if (next == 0)
				3872	xmlGenericError(xmlGenericErrorContext,
				3873	"HPP: lookup '%c' found at %d\n",
				3874	first, base);
				3875	else if (third == 0)
				3876	xmlGenericError(xmlGenericErrorContext,
				3877	"HPP: lookup '%c%c' found at %d\n",
				3878	first, next, base);
				3879	else
				3880	xmlGenericError(xmlGenericErrorContext,
				3881	"HPP: lookup '%c%c%c' found at %d\n",
				3882	first, next, third, base);
				3883	#endif
				3884	return(base - (in->cur - in->base));
				3885	}
				3886	}
				3887	ctxt->checkIndex = base;
				3888	#ifdef DEBUG_PUSH
				3889	if (next == 0)
				3890	xmlGenericError(xmlGenericErrorContext,
				3891	"HPP: lookup '%c' failed\n", first);
				3892	else if (third == 0)
				3893	xmlGenericError(xmlGenericErrorContext,
				3894	"HPP: lookup '%c%c' failed\n", first, next);
				3895	else
				3896	xmlGenericError(xmlGenericErrorContext,
				3897	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3898	#endif
				3899	return(-1);
				3900	}
				3901
				3902	/**
				3903	* htmlParseTryOrFinish:
				3904	* @ctxt: an HTML parser context
				3905	* @terminate: last chunk indicator
				3906	*
				3907	* Try to progress on parsing
				3908	*
				3909	* Returns zero if no parsing was possible
				3910	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3911	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3912	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3913	int ret = 0;
				3914	htmlParserInputPtr in;
				3915	int avail = 0;
				3916	xmlChar cur, next;
				3917
				3918	#ifdef DEBUG_PUSH
				3919	switch (ctxt->instate) {
				3920	case XML_PARSER_EOF:
				3921	xmlGenericError(xmlGenericErrorContext,
				3922	"HPP: try EOF\n"); break;
				3923	case XML_PARSER_START:
				3924	xmlGenericError(xmlGenericErrorContext,
				3925	"HPP: try START\n"); break;
				3926	case XML_PARSER_MISC:
				3927	xmlGenericError(xmlGenericErrorContext,
				3928	"HPP: try MISC\n");break;
				3929	case XML_PARSER_COMMENT:
				3930	xmlGenericError(xmlGenericErrorContext,
				3931	"HPP: try COMMENT\n");break;
				3932	case XML_PARSER_PROLOG:
				3933	xmlGenericError(xmlGenericErrorContext,
				3934	"HPP: try PROLOG\n");break;
				3935	case XML_PARSER_START_TAG:
				3936	xmlGenericError(xmlGenericErrorContext,
				3937	"HPP: try START_TAG\n");break;
				3938	case XML_PARSER_CONTENT:
				3939	xmlGenericError(xmlGenericErrorContext,
				3940	"HPP: try CONTENT\n");break;
				3941	case XML_PARSER_CDATA_SECTION:
				3942	xmlGenericError(xmlGenericErrorContext,
				3943	"HPP: try CDATA_SECTION\n");break;
				3944	case XML_PARSER_END_TAG:
				3945	xmlGenericError(xmlGenericErrorContext,
				3946	"HPP: try END_TAG\n");break;
				3947	case XML_PARSER_ENTITY_DECL:
				3948	xmlGenericError(xmlGenericErrorContext,
				3949	"HPP: try ENTITY_DECL\n");break;
				3950	case XML_PARSER_ENTITY_VALUE:
				3951	xmlGenericError(xmlGenericErrorContext,
				3952	"HPP: try ENTITY_VALUE\n");break;
				3953	case XML_PARSER_ATTRIBUTE_VALUE:
				3954	xmlGenericError(xmlGenericErrorContext,
				3955	"HPP: try ATTRIBUTE_VALUE\n");break;
				3956	case XML_PARSER_DTD:
				3957	xmlGenericError(xmlGenericErrorContext,
				3958	"HPP: try DTD\n");break;
				3959	case XML_PARSER_EPILOG:
				3960	xmlGenericError(xmlGenericErrorContext,
				3961	"HPP: try EPILOG\n");break;
				3962	case XML_PARSER_PI:
				3963	xmlGenericError(xmlGenericErrorContext,
				3964	"HPP: try PI\n");break;
				3965	case XML_PARSER_SYSTEM_LITERAL:
				3966	xmlGenericError(xmlGenericErrorContext,
				3967	"HPP: try SYSTEM_LITERAL\n");break;
				3968	}
				3969	#endif
				3970
				3971	while (1) {
				3972
				3973	in = ctxt->input;
				3974	if (in == NULL) break;
				3975	if (in->buf == NULL)
				3976	avail = in->length - (in->cur - in->base);
				3977	else
				3978	avail = in->buf->buffer->use - (in->cur - in->base);
				3979	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3980	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3981	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3982	/*
				3983	* SAX: end of the document processing.
				3984	*/
				3985	ctxt->instate = XML_PARSER_EOF;
				3986	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3987	ctxt->sax->endDocument(ctxt->userData);
				3988	}
				3989	}
				3990	if (avail < 1)
				3991	goto done;
				3992	switch (ctxt->instate) {
				3993	case XML_PARSER_EOF:
				3994	/*
				3995	* Document parsing is done !
				3996	*/
				3997	goto done;
				3998	case XML_PARSER_START:
				3999	/*
				4000	* Very first chars read from the document flow.
				4001	*/
				4002	cur = in->cur[0];
				4003	if (IS_BLANK(cur)) {
				4004	SKIP_BLANKS;
				4005	if (in->buf == NULL)
				4006	avail = in->length - (in->cur - in->base);
				4007	else
				4008	avail = in->buf->buffer->use - (in->cur - in->base);
				4009	}
				4010	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4011	ctxt->sax->setDocumentLocator(ctxt->userData,
				4012	&xmlDefaultSAXLocator);
				4013	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4014	(!ctxt->disableSAX))
				4015	ctxt->sax->startDocument(ctxt->userData);
				4016
				4017	cur = in->cur[0];
				4018	next = in->cur[1];
				4019	if ((cur == '<') && (next == '!') &&
				4020	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4021	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4022	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4023	(UPP(8) == 'E')) {
				4024	if ((!terminate) &&
				4025	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4026	goto done;
				4027	#ifdef DEBUG_PUSH
				4028	xmlGenericError(xmlGenericErrorContext,
				4029	"HPP: Parsing internal subset\n");
				4030	#endif
				4031	htmlParseDocTypeDecl(ctxt);
				4032	ctxt->instate = XML_PARSER_PROLOG;
				4033	#ifdef DEBUG_PUSH
				4034	xmlGenericError(xmlGenericErrorContext,
				4035	"HPP: entering PROLOG\n");
				4036	#endif
				4037	} else {
				4038	ctxt->instate = XML_PARSER_MISC;
				4039	}
				4040	#ifdef DEBUG_PUSH
				4041	xmlGenericError(xmlGenericErrorContext,
				4042	"HPP: entering MISC\n");
				4043	#endif
				4044	break;
				4045	case XML_PARSER_MISC:
				4046	SKIP_BLANKS;
				4047	if (in->buf == NULL)
				4048	avail = in->length - (in->cur - in->base);
				4049	else
				4050	avail = in->buf->buffer->use - (in->cur - in->base);
				4051	if (avail < 2)
				4052	goto done;
				4053	cur = in->cur[0];
				4054	next = in->cur[1];
				4055	if ((cur == '<') && (next == '!') &&
				4056	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4057	if ((!terminate) &&
				4058	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4059	goto done;
				4060	#ifdef DEBUG_PUSH
				4061	xmlGenericError(xmlGenericErrorContext,
				4062	"HPP: Parsing Comment\n");
				4063	#endif
				4064	htmlParseComment(ctxt);
				4065	ctxt->instate = XML_PARSER_MISC;
				4066	} else if ((cur == '<') && (next == '!') &&
				4067	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4068	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4069	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4070	(UPP(8) == 'E')) {
				4071	if ((!terminate) &&
				4072	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4073	goto done;
				4074	#ifdef DEBUG_PUSH
				4075	xmlGenericError(xmlGenericErrorContext,
				4076	"HPP: Parsing internal subset\n");
				4077	#endif
				4078	htmlParseDocTypeDecl(ctxt);
				4079	ctxt->instate = XML_PARSER_PROLOG;
				4080	#ifdef DEBUG_PUSH
				4081	xmlGenericError(xmlGenericErrorContext,
				4082	"HPP: entering PROLOG\n");
				4083	#endif
				4084	} else if ((cur == '<') && (next == '!') &&
				4085	(avail < 9)) {
				4086	goto done;
				4087	} else {
				4088	ctxt->instate = XML_PARSER_START_TAG;
				4089	#ifdef DEBUG_PUSH
				4090	xmlGenericError(xmlGenericErrorContext,
				4091	"HPP: entering START_TAG\n");
				4092	#endif
				4093	}
				4094	break;
				4095	case XML_PARSER_PROLOG:
				4096	SKIP_BLANKS;
				4097	if (in->buf == NULL)
				4098	avail = in->length - (in->cur - in->base);
				4099	else
				4100	avail = in->buf->buffer->use - (in->cur - in->base);
				4101	if (avail < 2)
				4102	goto done;
				4103	cur = in->cur[0];
				4104	next = in->cur[1];
				4105	if ((cur == '<') && (next == '!') &&
				4106	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4107	if ((!terminate) &&
				4108	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4109	goto done;
				4110	#ifdef DEBUG_PUSH
				4111	xmlGenericError(xmlGenericErrorContext,
				4112	"HPP: Parsing Comment\n");
				4113	#endif
				4114	htmlParseComment(ctxt);
				4115	ctxt->instate = XML_PARSER_PROLOG;
				4116	} else if ((cur == '<') && (next == '!') &&
				4117	(avail < 4)) {
				4118	goto done;
				4119	} else {
				4120	ctxt->instate = XML_PARSER_START_TAG;
				4121	#ifdef DEBUG_PUSH
				4122	xmlGenericError(xmlGenericErrorContext,
				4123	"HPP: entering START_TAG\n");
				4124	#endif
				4125	}
				4126	break;
				4127	case XML_PARSER_EPILOG:
				4128	if (in->buf == NULL)
				4129	avail = in->length - (in->cur - in->base);
				4130	else
				4131	avail = in->buf->buffer->use - (in->cur - in->base);
				4132	if (avail < 1)
				4133	goto done;
				4134	cur = in->cur[0];
				4135	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4136	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4137	goto done;
				4138	}
				4139	if (avail < 2)
				4140	goto done;
				4141	next = in->cur[1];
				4142	if ((cur == '<') && (next == '!') &&
				4143	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4144	if ((!terminate) &&
				4145	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4146	goto done;
				4147	#ifdef DEBUG_PUSH
				4148	xmlGenericError(xmlGenericErrorContext,
				4149	"HPP: Parsing Comment\n");
				4150	#endif
				4151	htmlParseComment(ctxt);
				4152	ctxt->instate = XML_PARSER_EPILOG;
				4153	} else if ((cur == '<') && (next == '!') &&
				4154	(avail < 4)) {
				4155	goto done;
				4156	} else {
				4157	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4158	ctxt->wellFormed = 0;
				4159	ctxt->instate = XML_PARSER_EOF;
				4160	#ifdef DEBUG_PUSH
				4161	xmlGenericError(xmlGenericErrorContext,
				4162	"HPP: entering EOF\n");
				4163	#endif
				4164	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4165	ctxt->sax->endDocument(ctxt->userData);
				4166	goto done;
				4167	}
				4168	break;
				4169	case XML_PARSER_START_TAG: {
				4170	xmlChar name, oldname;
				4171	int depth = ctxt->nameNr;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	4172	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4173
				4174	if (avail < 2)
				4175	goto done;
				4176	cur = in->cur[0];
				4177	if (cur != '<') {
				4178	ctxt->instate = XML_PARSER_CONTENT;
				4179	#ifdef DEBUG_PUSH
				4180	xmlGenericError(xmlGenericErrorContext,
				4181	"HPP: entering CONTENT\n");
				4182	#endif
				4183	break;
				4184	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4185	if (in->cur[1] == '/') {
				4186	ctxt->instate = XML_PARSER_END_TAG;
				4187	ctxt->checkIndex = 0;
				4188	#ifdef DEBUG_PUSH
				4189	xmlGenericError(xmlGenericErrorContext,
				4190	"HPP: entering END_TAG\n");
				4191	#endif
				4192	break;
				4193	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4194	if ((!terminate) &&
				4195	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4196	goto done;
				4197
				4198	oldname = xmlStrdup(ctxt->name);
				4199	htmlParseStartTag(ctxt);
				4200	name = ctxt->name;
				4201	#ifdef DEBUG
				4202	if (oldname == NULL)
				4203	xmlGenericError(xmlGenericErrorContext,
				4204	"Start of element %s\n", name);
				4205	else if (name == NULL)
				4206	xmlGenericError(xmlGenericErrorContext,
				4207	"Start of element failed, was %s\n",
				4208	oldname);
				4209	else
				4210	xmlGenericError(xmlGenericErrorContext,
				4211	"Start of element %s, was %s\n",
				4212	name, oldname);
				4213	#endif
				4214	if (((depth == ctxt->nameNr) &&
				4215	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4216	(name == NULL)) {
				4217	if (CUR == '>')
				4218	NEXT;
				4219	if (oldname != NULL)
				4220	xmlFree(oldname);
				4221	break;
				4222	}
				4223	if (oldname != NULL)
				4224	xmlFree(oldname);
				4225
				4226	/*
				4227	* Lookup the info for that element.
				4228	*/
				4229	info = htmlTagLookup(name);
				4230	if (info == NULL) {
				4231	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4232	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4233	name);
				4234	ctxt->wellFormed = 0;
				4235	} else if (info->depr) {
				4236	/***************************
				4237	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4238	ctxt->sax->warning(ctxt->userData,
				4239	"Tag %s is deprecated\n",
				4240	name);
				4241	***************************/
				4242	}
				4243
				4244	/*
				4245	* Check for an Empty Element labelled the XML/SGML way
				4246	*/
				4247	if ((CUR == '/') && (NXT(1) == '>')) {
				4248	SKIP(2);
				4249	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4250	ctxt->sax->endElement(ctxt->userData, name);
				4251	oldname = htmlnamePop(ctxt);
				4252	#ifdef DEBUG
				4253	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4254	oldname);
				4255	#endif
				4256	if (oldname != NULL)
				4257	xmlFree(oldname);
				4258	ctxt->instate = XML_PARSER_CONTENT;
				4259	#ifdef DEBUG_PUSH
				4260	xmlGenericError(xmlGenericErrorContext,
				4261	"HPP: entering CONTENT\n");
				4262	#endif
				4263	break;
				4264	}
				4265
				4266	if (CUR == '>') {
				4267	NEXT;
				4268	} else {
				4269	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4270	ctxt->sax->error(ctxt->userData,
				4271	"Couldn't find end of Start Tag %s\n",
				4272	name);
				4273	ctxt->wellFormed = 0;
				4274
				4275	/*
				4276	* end of parsing of this node.
				4277	*/
				4278	if (xmlStrEqual(name, ctxt->name)) {
				4279	nodePop(ctxt);
				4280	oldname = htmlnamePop(ctxt);
				4281	#ifdef DEBUG
				4282	xmlGenericError(xmlGenericErrorContext,
				4283	"End of start tag problem: popping out %s\n", oldname);
				4284	#endif
				4285	if (oldname != NULL)
				4286	xmlFree(oldname);
				4287	}
				4288
				4289	ctxt->instate = XML_PARSER_CONTENT;
				4290	#ifdef DEBUG_PUSH
				4291	xmlGenericError(xmlGenericErrorContext,
				4292	"HPP: entering CONTENT\n");
				4293	#endif
				4294	break;
				4295	}
				4296
				4297	/*
				4298	* Check for an Empty Element from DTD definition
				4299	*/
				4300	if ((info != NULL) && (info->empty)) {
				4301	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4302	ctxt->sax->endElement(ctxt->userData, name);
				4303	oldname = htmlnamePop(ctxt);
				4304	#ifdef DEBUG
				4305	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4306	#endif
				4307	if (oldname != NULL)
				4308	xmlFree(oldname);
				4309	}
				4310	ctxt->instate = XML_PARSER_CONTENT;
				4311	#ifdef DEBUG_PUSH
				4312	xmlGenericError(xmlGenericErrorContext,
				4313	"HPP: entering CONTENT\n");
				4314	#endif
				4315	break;
				4316	}
				4317	case XML_PARSER_CONTENT: {
				4318	long cons;
				4319	/*
				4320	* Handle preparsed entities and charRef
				4321	*/
				4322	if (ctxt->token != 0) {
				4323	xmlChar chr[2] = { 0 , 0 } ;
				4324
				4325	chr[0] = (xmlChar) ctxt->token;
				4326	htmlCheckParagraph(ctxt);
				4327	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4328	ctxt->sax->characters(ctxt->userData, chr, 1);
				4329	ctxt->token = 0;
				4330	ctxt->checkIndex = 0;
				4331	}
				4332	if ((avail == 1) && (terminate)) {
				4333	cur = in->cur[0];
				4334	if ((cur != '<') && (cur != '&')) {
				4335	if (ctxt->sax != NULL) {
				4336	if (IS_BLANK(cur)) {
				4337	if (ctxt->sax->ignorableWhitespace != NULL)
				4338	ctxt->sax->ignorableWhitespace(
				4339	ctxt->userData, &cur, 1);
				4340	} else {
				4341	htmlCheckParagraph(ctxt);
				4342	if (ctxt->sax->characters != NULL)
				4343	ctxt->sax->characters(
				4344	ctxt->userData, &cur, 1);
				4345	}
				4346	}
				4347	ctxt->token = 0;
				4348	ctxt->checkIndex = 0;
				4349	NEXT;
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	4350	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4351	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4352	}
				4353	if (avail < 2)
				4354	goto done;
				4355	cur = in->cur[0];
				4356	next = in->cur[1];
				4357	cons = ctxt->nbChars;
				4358	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4359	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4360	/*
				4361	* Handle SCRIPT/STYLE separately
				4362	*/
				4363	if ((!terminate) &&
				4364	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4365	goto done;
				4366	htmlParseScript(ctxt);
				4367	if ((cur == '<') && (next == '/')) {
				4368	ctxt->instate = XML_PARSER_END_TAG;
				4369	ctxt->checkIndex = 0;
				4370	#ifdef DEBUG_PUSH
				4371	xmlGenericError(xmlGenericErrorContext,
				4372	"HPP: entering END_TAG\n");
				4373	#endif
				4374	break;
				4375	}
				4376	} else {
				4377	/*
				4378	* Sometimes DOCTYPE arrives in the middle of the document
				4379	*/
				4380	if ((cur == '<') && (next == '!') &&
				4381	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4382	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4383	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4384	(UPP(8) == 'E')) {
				4385	if ((!terminate) &&
				4386	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4387	goto done;
				4388	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4389	ctxt->sax->error(ctxt->userData,
				4390	"Misplaced DOCTYPE declaration\n");
				4391	ctxt->wellFormed = 0;
				4392	htmlParseDocTypeDecl(ctxt);
				4393	} else if ((cur == '<') && (next == '!') &&
				4394	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4395	if ((!terminate) &&
				4396	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4397	goto done;
				4398	#ifdef DEBUG_PUSH
				4399	xmlGenericError(xmlGenericErrorContext,
				4400	"HPP: Parsing Comment\n");
				4401	#endif
				4402	htmlParseComment(ctxt);
				4403	ctxt->instate = XML_PARSER_CONTENT;
				4404	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4405	goto done;
				4406	} else if ((cur == '<') && (next == '/')) {
				4407	ctxt->instate = XML_PARSER_END_TAG;
				4408	ctxt->checkIndex = 0;
				4409	#ifdef DEBUG_PUSH
				4410	xmlGenericError(xmlGenericErrorContext,
				4411	"HPP: entering END_TAG\n");
				4412	#endif
				4413	break;
				4414	} else if (cur == '<') {
				4415	ctxt->instate = XML_PARSER_START_TAG;
				4416	ctxt->checkIndex = 0;
				4417	#ifdef DEBUG_PUSH
				4418	xmlGenericError(xmlGenericErrorContext,
				4419	"HPP: entering START_TAG\n");
				4420	#endif
				4421	break;
				4422	} else if (cur == '&') {
				4423	if ((!terminate) &&
				4424	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4425	goto done;
				4426	#ifdef DEBUG_PUSH
				4427	xmlGenericError(xmlGenericErrorContext,
				4428	"HPP: Parsing Reference\n");
				4429	#endif
				4430	/* TODO: check generation of subtrees if noent !!! */
				4431	htmlParseReference(ctxt);
				4432	} else {
				4433	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4434	/*
				4435	* Goal of the following test is :
				4436	* - minimize calls to the SAX 'character' callback
				4437	* when they are mergeable
				4438	*/
				4439	if ((ctxt->inputNr == 1) &&
				4440	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4441	if ((!terminate) &&
				4442	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4443	goto done;
				4444	}
				4445	ctxt->checkIndex = 0;
				4446	#ifdef DEBUG_PUSH
				4447	xmlGenericError(xmlGenericErrorContext,
				4448	"HPP: Parsing char data\n");
				4449	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4450	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4451	}
				4452	}
				4453	if (cons == ctxt->nbChars) {
				4454	if (ctxt->node != NULL) {
				4455	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4456	ctxt->sax->error(ctxt->userData,
				4457	"detected an error in element content\n");
				4458	ctxt->wellFormed = 0;
				4459	}
				4460	NEXT;
				4461	break;
				4462	}
				4463
				4464	break;
				4465	}
				4466	case XML_PARSER_END_TAG:
				4467	if (avail < 2)
				4468	goto done;
				4469	if ((!terminate) &&
				4470	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4471	goto done;
				4472	htmlParseEndTag(ctxt);
				4473	if (ctxt->nameNr == 0) {
				4474	ctxt->instate = XML_PARSER_EPILOG;
				4475	} else {
				4476	ctxt->instate = XML_PARSER_CONTENT;
				4477	}
				4478	ctxt->checkIndex = 0;
				4479	#ifdef DEBUG_PUSH
				4480	xmlGenericError(xmlGenericErrorContext,
				4481	"HPP: entering CONTENT\n");
				4482	#endif
				4483	break;
				4484	case XML_PARSER_CDATA_SECTION:
				4485	xmlGenericError(xmlGenericErrorContext,
				4486	"HPP: internal error, state == CDATA\n");
				4487	ctxt->instate = XML_PARSER_CONTENT;
				4488	ctxt->checkIndex = 0;
				4489	#ifdef DEBUG_PUSH
				4490	xmlGenericError(xmlGenericErrorContext,
				4491	"HPP: entering CONTENT\n");
				4492	#endif
				4493	break;
				4494	case XML_PARSER_DTD:
				4495	xmlGenericError(xmlGenericErrorContext,
				4496	"HPP: internal error, state == DTD\n");
				4497	ctxt->instate = XML_PARSER_CONTENT;
				4498	ctxt->checkIndex = 0;
				4499	#ifdef DEBUG_PUSH
				4500	xmlGenericError(xmlGenericErrorContext,
				4501	"HPP: entering CONTENT\n");
				4502	#endif
				4503	break;
				4504	case XML_PARSER_COMMENT:
				4505	xmlGenericError(xmlGenericErrorContext,
				4506	"HPP: internal error, state == COMMENT\n");
				4507	ctxt->instate = XML_PARSER_CONTENT;
				4508	ctxt->checkIndex = 0;
				4509	#ifdef DEBUG_PUSH
				4510	xmlGenericError(xmlGenericErrorContext,
				4511	"HPP: entering CONTENT\n");
				4512	#endif
				4513	break;
				4514	case XML_PARSER_PI:
				4515	xmlGenericError(xmlGenericErrorContext,
				4516	"HPP: internal error, state == PI\n");
				4517	ctxt->instate = XML_PARSER_CONTENT;
				4518	ctxt->checkIndex = 0;
				4519	#ifdef DEBUG_PUSH
				4520	xmlGenericError(xmlGenericErrorContext,
				4521	"HPP: entering CONTENT\n");
				4522	#endif
				4523	break;
				4524	case XML_PARSER_ENTITY_DECL:
				4525	xmlGenericError(xmlGenericErrorContext,
				4526	"HPP: internal error, state == ENTITY_DECL\n");
				4527	ctxt->instate = XML_PARSER_CONTENT;
				4528	ctxt->checkIndex = 0;
				4529	#ifdef DEBUG_PUSH
				4530	xmlGenericError(xmlGenericErrorContext,
				4531	"HPP: entering CONTENT\n");
				4532	#endif
				4533	break;
				4534	case XML_PARSER_ENTITY_VALUE:
				4535	xmlGenericError(xmlGenericErrorContext,
				4536	"HPP: internal error, state == ENTITY_VALUE\n");
				4537	ctxt->instate = XML_PARSER_CONTENT;
				4538	ctxt->checkIndex = 0;
				4539	#ifdef DEBUG_PUSH
				4540	xmlGenericError(xmlGenericErrorContext,
				4541	"HPP: entering DTD\n");
				4542	#endif
				4543	break;
				4544	case XML_PARSER_ATTRIBUTE_VALUE:
				4545	xmlGenericError(xmlGenericErrorContext,
				4546	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4547	ctxt->instate = XML_PARSER_START_TAG;
				4548	ctxt->checkIndex = 0;
				4549	#ifdef DEBUG_PUSH
				4550	xmlGenericError(xmlGenericErrorContext,
				4551	"HPP: entering START_TAG\n");
				4552	#endif
				4553	break;
				4554	case XML_PARSER_SYSTEM_LITERAL:
				4555	xmlGenericError(xmlGenericErrorContext,
				4556	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4557	ctxt->instate = XML_PARSER_CONTENT;
				4558	ctxt->checkIndex = 0;
				4559	#ifdef DEBUG_PUSH
				4560	xmlGenericError(xmlGenericErrorContext,
				4561	"HPP: entering CONTENT\n");
				4562	#endif
				4563	break;
				4564	case XML_PARSER_IGNORE:
				4565	xmlGenericError(xmlGenericErrorContext,
				4566	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4567	ctxt->instate = XML_PARSER_CONTENT;
				4568	ctxt->checkIndex = 0;
				4569	#ifdef DEBUG_PUSH
				4570	xmlGenericError(xmlGenericErrorContext,
				4571	"HPP: entering CONTENT\n");
				4572	#endif
				4573	break;
				4574	}
				4575	}
				4576	done:
				4577	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4578	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4579	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4580	/*
				4581	* SAX: end of the document processing.
				4582	*/
				4583	ctxt->instate = XML_PARSER_EOF;
				4584	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4585	ctxt->sax->endDocument(ctxt->userData);
				4586	}
				4587	}
				4588	if ((ctxt->myDoc != NULL) &&
				4589	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4590	(ctxt->instate == XML_PARSER_EPILOG))) {
				4591	xmlDtdPtr dtd;
				4592	dtd = xmlGetIntSubset(ctxt->myDoc);
				4593	if (dtd == NULL)
				4594	ctxt->myDoc->intSubset =
				4595	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4596	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4597	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4598	}
				4599	#ifdef DEBUG_PUSH
				4600	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4601	#endif
				4602	return(ret);
				4603	}
				4604
				4605	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4606	* htmlParseChunk:
				4607	* @ctxt: an XML parser context
				4608	* @chunk: an char array
				4609	* @size: the size in byte of the chunk
				4610	* @terminate: last chunk indicator
				4611	*
				4612	* Parse a Chunk of memory
				4613	*
				4614	* Returns zero if no error, the xmlParserErrors otherwise.
				4615	*/
				4616	int
				4617	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4618	int terminate) {
				4619	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4620	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4621	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4622	int cur = ctxt->input->cur - ctxt->input->base;
				4623
				4624	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4625	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4626	ctxt->input->cur = ctxt->input->base + cur;
				4627	#ifdef DEBUG_PUSH
				4628	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4629	#endif
				4630
				4631	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4632	htmlParseTryOrFinish(ctxt, terminate);
				4633	} else if (ctxt->instate != XML_PARSER_EOF) {
				4634	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4635	htmlParseTryOrFinish(ctxt, terminate);
				4636	}
				4637	if (terminate) {
				4638	if ((ctxt->instate != XML_PARSER_EOF) &&
				4639	(ctxt->instate != XML_PARSER_EPILOG) &&
				4640	(ctxt->instate != XML_PARSER_MISC)) {
				4641	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4642	ctxt->wellFormed = 0;
				4643	}
				4644	if (ctxt->instate != XML_PARSER_EOF) {
				4645	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4646	ctxt->sax->endDocument(ctxt->userData);
				4647	}
				4648	ctxt->instate = XML_PARSER_EOF;
				4649	}
				4650	return((xmlParserErrors) ctxt->errNo);
				4651	}
				4652
				4653	/************************************************************************
				4654	* *
				4655	* User entry points *
				4656	* *
				4657	************************************************************************/
				4658
				4659	/**
				4660	* htmlCreatePushParserCtxt :
				4661	* @sax: a SAX handler
				4662	* @user_data: The user data returned on SAX callbacks
				4663	* @chunk: a pointer to an array of chars
				4664	* @size: number of chars in the array
				4665	* @filename: an optional file name or URI
				4666	* @enc: an optional encoding
				4667	*
				4668	* Create a parser context for using the HTML parser in push mode
				4669	* To allow content encoding detection, @size should be >= 4
				4670	* The value of @filename is used for fetching external entities
				4671	* and error/warning reports.
				4672	*
				4673	* Returns the new parser context or NULL
				4674	*/
				4675	htmlParserCtxtPtr
				4676	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4677	const char chunk, int size, const char filename,
				4678	xmlCharEncoding enc) {
				4679	htmlParserCtxtPtr ctxt;
				4680	htmlParserInputPtr inputStream;
				4681	xmlParserInputBufferPtr buf;
				4682
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4683	xmlInitParser();
				4684
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4685	buf = xmlAllocParserInputBuffer(enc);
				4686	if (buf == NULL) return(NULL);
				4687
				4688	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4689	if (ctxt == NULL) {
				4690	xmlFree(buf);
				4691	return(NULL);
				4692	}
				4693	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4694	htmlInitParserCtxt(ctxt);
				4695	if (sax != NULL) {
				4696	if (ctxt->sax != &htmlDefaultSAXHandler)
				4697	xmlFree(ctxt->sax);
				4698	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4699	if (ctxt->sax == NULL) {
				4700	xmlFree(buf);
				4701	xmlFree(ctxt);
				4702	return(NULL);
				4703	}
				4704	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4705	if (user_data != NULL)
				4706	ctxt->userData = user_data;
				4707	}
				4708	if (filename == NULL) {
				4709	ctxt->directory = NULL;
				4710	} else {
				4711	ctxt->directory = xmlParserGetDirectory(filename);
				4712	}
				4713
				4714	inputStream = htmlNewInputStream(ctxt);
				4715	if (inputStream == NULL) {
				4716	xmlFreeParserCtxt(ctxt);
				4717	return(NULL);
				4718	}
				4719
				4720	if (filename == NULL)
				4721	inputStream->filename = NULL;
				4722	else
				4723	inputStream->filename = xmlMemStrdup(filename);
				4724	inputStream->buf = buf;
				4725	inputStream->base = inputStream->buf->buffer->content;
				4726	inputStream->cur = inputStream->buf->buffer->content;
				4727
				4728	inputPush(ctxt, inputStream);
				4729
				4730	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4731	(ctxt->input->buf != NULL)) {
				4732	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4733	#ifdef DEBUG_PUSH
				4734	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4735	#endif
				4736	}
				4737
				4738	return(ctxt);
				4739	}
				4740
				4741	/**
				4742	* htmlSAXParseDoc :
				4743	* @cur: a pointer to an array of xmlChar
				4744	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4745	* @sax: the SAX handler block
				4746	* @userData: if using SAX, this pointer will be provided on callbacks.
				4747	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4748	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				4749	* to handle parse events. If sax is NULL, fallback to the default DOM
				4750	* behavior and return a tree.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4751	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4752	* Returns the resulting document tree unless SAX is NULL or the document is
				4753	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4754	*/
				4755
				4756	htmlDocPtr
				4757	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4758	htmlDocPtr ret;
				4759	htmlParserCtxtPtr ctxt;
				4760
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4761	xmlInitParser();
				4762
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4763	if (cur == NULL) return(NULL);
				4764
				4765
				4766	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4767	if (ctxt == NULL) return(NULL);
				4768	if (sax != NULL) {
				4769	ctxt->sax = sax;
				4770	ctxt->userData = userData;
				4771	}
				4772
				4773	htmlParseDocument(ctxt);
				4774	ret = ctxt->myDoc;
				4775	if (sax != NULL) {
				4776	ctxt->sax = NULL;
				4777	ctxt->userData = NULL;
				4778	}
				4779	htmlFreeParserCtxt(ctxt);
				4780
				4781	return(ret);
				4782	}
				4783
				4784	/**
				4785	* htmlParseDoc :
				4786	* @cur: a pointer to an array of xmlChar
				4787	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4788	*
				4789	* parse an HTML in-memory document and build a tree.
				4790	*
				4791	* Returns the resulting document tree
				4792	*/
				4793
				4794	htmlDocPtr
				4795	htmlParseDoc(xmlChar cur, const char encoding) {
				4796	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4797	}
				4798
				4799
				4800	/**
				4801	* htmlCreateFileParserCtxt :
				4802	* @filename: the filename
				4803	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4804	*
				4805	* Create a parser context for a file content.
				4806	* Automatic support for ZLIB/Compress compressed document is provided
				4807	* by default if found at compile-time.
				4808	*
				4809	* Returns the new parser context or NULL
				4810	*/
				4811	htmlParserCtxtPtr
				4812	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4813	{
				4814	htmlParserCtxtPtr ctxt;
				4815	htmlParserInputPtr inputStream;
				4816	xmlParserInputBufferPtr buf;
				4817	/* htmlCharEncoding enc; */
				4818	xmlChar content, content_line = (xmlChar *) "charset=";
				4819
				4820	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4821	if (buf == NULL) return(NULL);
				4822
				4823	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4824	if (ctxt == NULL) {
				4825	perror("malloc");
				4826	return(NULL);
				4827	}
				4828	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4829	htmlInitParserCtxt(ctxt);
				4830	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4831	if (inputStream == NULL) {
				4832	perror("malloc");
				4833	xmlFree(ctxt);
				4834	return(NULL);
				4835	}
				4836	memset(inputStream, 0, sizeof(htmlParserInput));
				4837
				4838	inputStream->filename = xmlMemStrdup(filename);
				4839	inputStream->line = 1;
				4840	inputStream->col = 1;
				4841	inputStream->buf = buf;
				4842	inputStream->directory = NULL;
				4843
				4844	inputStream->base = inputStream->buf->buffer->content;
				4845	inputStream->cur = inputStream->buf->buffer->content;
				4846	inputStream->free = NULL;
				4847
				4848	inputPush(ctxt, inputStream);
				4849
				4850	/* set encoding */
				4851	if (encoding) {
				4852	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4853	if (content) {
				4854	strcpy ((char )content, (char )content_line);
				4855	strcat ((char )content, (char )encoding);
				4856	htmlCheckEncoding (ctxt, content);
				4857	xmlFree (content);
				4858	}
				4859	}
				4860
				4861	return(ctxt);
				4862	}
				4863
				4864	/**
				4865	* htmlSAXParseFile :
				4866	* @filename: the filename
				4867	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4868	* @sax: the SAX handler block
				4869	* @userData: if using SAX, this pointer will be provided on callbacks.
				4870	*
				4871	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4872	* compressed document is provided by default if found at compile-time.
				4873	* It use the given SAX function block to handle the parsing callback.
				4874	* If sax is NULL, fallback to the default DOM tree building routines.
				4875	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4876	* Returns the resulting document tree unless SAX is NULL or the document is
				4877	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4878	*/
				4879
				4880	htmlDocPtr
				4881	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4882	void *userData) {
				4883	htmlDocPtr ret;
				4884	htmlParserCtxtPtr ctxt;
				4885	htmlSAXHandlerPtr oldsax = NULL;
				4886
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4887	xmlInitParser();
				4888
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4889	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4890	if (ctxt == NULL) return(NULL);
				4891	if (sax != NULL) {
				4892	oldsax = ctxt->sax;
				4893	ctxt->sax = sax;
				4894	ctxt->userData = userData;
				4895	}
				4896
				4897	htmlParseDocument(ctxt);
				4898
				4899	ret = ctxt->myDoc;
				4900	if (sax != NULL) {
				4901	ctxt->sax = oldsax;
				4902	ctxt->userData = NULL;
				4903	}
				4904	htmlFreeParserCtxt(ctxt);
				4905
				4906	return(ret);
				4907	}
				4908
				4909	/**
				4910	* htmlParseFile :
				4911	* @filename: the filename
				4912	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4913	*
				4914	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4915	* compressed document is provided by default if found at compile-time.
				4916	*
				4917	* Returns the resulting document tree
				4918	*/
				4919
				4920	htmlDocPtr
				4921	htmlParseFile(const char filename, const char encoding) {
				4922	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4923	}
				4924
				4925	/**
				4926	* htmlHandleOmittedElem:
				4927	* @val: int 0 or 1
				4928	*
				4929	* Set and return the previous value for handling HTML omitted tags.
				4930	*
				4931	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4932	*/
				4933
				4934	int
				4935	htmlHandleOmittedElem(int val) {
				4936	int old = htmlOmittedDefaultValue;
				4937
				4938	htmlOmittedDefaultValue = val;
				4939	return(old);
				4940	}
				4941
				4942	#endif /* LIBXML_HTML_ENABLED */