Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: 428248edd71737e12bd2c6f6b23a4e0aa9001e16 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
Daniel Veillard	3c01b1d	2001-10-17 15:58:35 +0000	[diff] [blame]	43	#include <libxml/globals.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	44
				45	#define HTML_MAX_NAMELEN 1000
				46	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				47	#define HTML_PARSER_BUFFER_SIZE 100
				48
				49	/* #define DEBUG */
				50	/* #define DEBUG_PUSH */
				51
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	52	static int htmlOmittedDefaultValue = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	53
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	54	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				55	xmlChar end, xmlChar end2, xmlChar end3);
				56
				57	/************************************************************************
				58	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	59	* Parser stacks related functions and macros *
				60	* *
				61	************************************************************************/
				62
				63	/*
				64	* Generic function for accessing stacks in the Parser Context
				65	*/
				66
				67	#define PUSH_AND_POP(scope, type, name) \
				68	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				69	if (ctxt->name##Nr >= ctxt->name##Max) { \
				70	ctxt->name##Max *= 2; \
				71	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				72	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				73	if (ctxt->name##Tab == NULL) { \
				74	xmlGenericError(xmlGenericErrorContext, \
				75	"realloc failed !\n"); \
				76	return(0); \
				77	} \
				78	} \
				79	ctxt->name##Tab[ctxt->name##Nr] = value; \
				80	ctxt->name = value; \
				81	return(ctxt->name##Nr++); \
				82	} \
				83	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				84	type ret; \
				85	if (ctxt->name##Nr < 0) return(0); \
				86	ctxt->name##Nr--; \
				87	if (ctxt->name##Nr < 0) return(0); \
				88	if (ctxt->name##Nr > 0) \
				89	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				90	else \
				91	ctxt->name = NULL; \
				92	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				93	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				94	return(ret); \
				95	} \
				96
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	97	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				98	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	99
				100	/*
				101	* Macros for accessing the content. Those should be used only by the parser,
				102	* and not exported.
				103	*
				104	* Dirty macros, i.e. one needs to make assumptions about the context to use them
				105	*
				106	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				107	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				108	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				109	* in UNICODE mode. This should be used internally by the parser
				110	* only to compare to ASCII values otherwise it would break when
				111	* running with UTF-8 encoding.
				112	* NXT(n) returns the n'th next xmlChar. Same as CUR, it should be used only
				113	* to compare against ASCII based substrings.
				114	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				115	* it should be used only to compare on ASCII based substring.
				116	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				117	* strings within the parser.
				118	*
				119	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				120	*
				121	* CURRENT Returns the current char value, with the full decoding of
				122	* UTF-8 if we are using this mode. It returns an int.
				123	* NEXT Skip to the next character, this does the proper decoding
				124	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				125	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				126	*/
				127
				128	#define UPPER (toupper(*ctxt->input->cur))
				129
				130	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				131
				132	#define NXT(val) ctxt->input->cur[(val)]
				133
				134	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				135
				136	#define CUR_PTR ctxt->input->cur
				137
				138	#define SHRINK xmlParserInputShrink(ctxt->input)
				139
				140	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				141
				142	#define CURRENT ((int) (*ctxt->input->cur))
				143
				144	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				145
				146	/* Inported from XML */
				147
				148	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				149	#define CUR ((int) (*ctxt->input->cur))
				150	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				151
				152	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				153	#define NXT(val) ctxt->input->cur[(val)]
				154	#define CUR_PTR ctxt->input->cur
				155
				156
				157	#define NEXTL(l) do { \
				158	if (*(ctxt->input->cur) == '\n') { \
				159	ctxt->input->line++; ctxt->input->col = 1; \
				160	} else ctxt->input->col++; \
				161	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				162	} while (0)
				163
				164	/************
				165	\
				166	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				167	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				168	************/
				169
				170	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				171	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				172
				173	#define COPY_BUF(l,b,i,v) \
				174	if (l == 1) b[i++] = (xmlChar) v; \
				175	else i += xmlCopyChar(l,&b[i],v)
				176
				177	/**
				178	* htmlCurrentChar:
				179	* @ctxt: the HTML parser context
				180	* @len: pointer to the length of the char read
				181	*
				182	* The current char value, if using UTF-8 this may actaully span multiple
				183	* bytes in the input buffer. Implement the end of line normalization:
				184	* 2.11 End-of-Line Handling
				185	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				186	* char, then the encoding converter is plugged in automatically.
				187	*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	188	* Returns the current char value and its length
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	189	*/
				190
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	191	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	192	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				193	if (ctxt->instate == XML_PARSER_EOF)
				194	return(0);
				195
				196	if (ctxt->token != 0) {
				197	*len = 0;
				198	return(ctxt->token);
				199	}
				200	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				201	/*
				202	* We are supposed to handle UTF8, check it's valid
				203	* From rfc2044: encoding of the Unicode values on UTF-8:
				204	*
				205	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				206	* 0000 0000-0000 007F 0xxxxxxx
				207	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				208	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				209	*
				210	* Check for the 0x110000 limit too
				211	*/
				212	const unsigned char *cur = ctxt->input->cur;
				213	unsigned char c;
				214	unsigned int val;
				215
				216	c = *cur;
				217	if (c & 0x80) {
				218	if (cur[1] == 0)
				219	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				220	if ((cur[1] & 0xc0) != 0x80)
				221	goto encoding_error;
				222	if ((c & 0xe0) == 0xe0) {
				223
				224	if (cur[2] == 0)
				225	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				226	if ((cur[2] & 0xc0) != 0x80)
				227	goto encoding_error;
				228	if ((c & 0xf0) == 0xf0) {
				229	if (cur[3] == 0)
				230	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				231	if (((c & 0xf8) != 0xf0) \|\|
				232	((cur[3] & 0xc0) != 0x80))
				233	goto encoding_error;
				234	/* 4-byte code */
				235	*len = 4;
				236	val = (cur[0] & 0x7) << 18;
				237	val \|= (cur[1] & 0x3f) << 12;
				238	val \|= (cur[2] & 0x3f) << 6;
				239	val \|= cur[3] & 0x3f;
				240	} else {
				241	/* 3-byte code */
				242	*len = 3;
				243	val = (cur[0] & 0xf) << 12;
				244	val \|= (cur[1] & 0x3f) << 6;
				245	val \|= cur[2] & 0x3f;
				246	}
				247	} else {
				248	/* 2-byte code */
				249	*len = 2;
				250	val = (cur[0] & 0x1f) << 6;
				251	val \|= cur[1] & 0x3f;
				252	}
				253	if (!IS_CHAR(val)) {
				254	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				255	if ((ctxt->sax != NULL) &&
				256	(ctxt->sax->error != NULL))
				257	ctxt->sax->error(ctxt->userData,
				258	"Char 0x%X out of allowed range\n", val);
				259	ctxt->wellFormed = 0;
				260	ctxt->disableSAX = 1;
				261	}
				262	return(val);
				263	} else {
				264	/* 1-byte code */
				265	*len = 1;
				266	return((int) *ctxt->input->cur);
				267	}
				268	}
				269	/*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	270	* Assume it's a fixed length encoding (1) with
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	271	* a compatibke encoding for the ASCII set, since
				272	* XML constructs only use < 128 chars
				273	*/
				274	*len = 1;
				275	if ((int) *ctxt->input->cur < 0x80)
				276	return((int) *ctxt->input->cur);
				277
				278	/*
				279	* Humm this is bad, do an automatic flow conversion
				280	*/
				281	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				282	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				283	return(xmlCurrentChar(ctxt, len));
				284
				285	encoding_error:
				286	/*
				287	* If we detect an UTF8 error that probably mean that the
				288	* input encoding didn't get properly advertized in the
				289	* declaration header. Report the error and switch the encoding
				290	* to ISO-Latin-1 (if you don't like this policy, just declare the
				291	* encoding !)
				292	*/
				293	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				294	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				295	ctxt->sax->error(ctxt->userData,
				296	"Input is not proper UTF-8, indicate encoding !\n");
				297	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				298	ctxt->input->cur[0], ctxt->input->cur[1],
				299	ctxt->input->cur[2], ctxt->input->cur[3]);
				300	}
				301
				302	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				303	*len = 1;
				304	return((int) *ctxt->input->cur);
				305	}
				306
				307	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	308	* htmlSkipBlankChars:
				309	* @ctxt: the HTML parser context
				310	*
				311	* skip all blanks character found at that point in the input streams.
				312	*
				313	* Returns the number of space chars skipped
				314	*/
				315
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	316	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	317	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				318	int res = 0;
				319
				320	while (IS_BLANK(*(ctxt->input->cur))) {
				321	if ((*ctxt->input->cur == 0) &&
				322	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				323	xmlPopInput(ctxt);
				324	} else {
				325	if (*(ctxt->input->cur) == '\n') {
				326	ctxt->input->line++; ctxt->input->col = 1;
				327	} else ctxt->input->col++;
				328	ctxt->input->cur++;
				329	ctxt->nbChars++;
				330	if (*ctxt->input->cur == 0)
				331	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				332	}
				333	res++;
				334	}
				335	return(res);
				336	}
				337
				338
				339
				340	/************************************************************************
				341	* *
				342	* The list of HTML elements and their properties *
				343	* *
				344	************************************************************************/
				345
				346	/*
				347	* Start Tag: 1 means the start tag can be ommited
				348	* End Tag: 1 means the end tag can be ommited
				349	* 2 means it's forbidden (empty elements)
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	350	* 3 means the tag is stylistic and should be closed easilly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	351	* Depr: this element is deprecated
				352	* DTD: 1 means that this element is valid only in the Loose DTD
				353	* 2 means that this element is valid only in the Frameset DTD
				354	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	355	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	356	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	357	static const htmlElemDesc
				358	html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	359	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				360	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				361	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				362	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				363	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				364	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				365	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				366	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				367	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				368	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				369	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				370	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				371	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				372	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				373	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				374	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				375	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				376	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				377	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				378	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				379	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				380	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				381	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				382	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				383	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				384	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				385	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				386	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				387	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				388	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				389	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				390	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				391	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				392	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				393	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				394	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				395	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				396	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				398	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				399	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				400	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				401	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				402	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				403	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				404	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				405	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				406	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				407	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				408	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				409	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				410	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				411	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				412	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				413	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				414	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				415	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				416	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				417	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				418	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				419	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				420	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				421	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
				422	{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
				423	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				424	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				425	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				426	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				427	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				428	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				429	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				430	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				431	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				432	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				433	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				434	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				435	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				436	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				437	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				438	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				439	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				440	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				441	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				442	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				443	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				444	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				445	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				446	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				447	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				448	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				449	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	450	};
				451
				452	/*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	453	* start tags that imply the end of current element
				454	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	455	static const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	456	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				457	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				458	"listing", "xmp", "head", NULL,
				459	"head", "p", NULL,
				460	"title", "p", NULL,
				461	"body", "head", "style", "link", "title", "p", NULL,
				462	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				463	"pre", "listing", "xmp", "head", "li", NULL,
				464	"hr", "p", "head", NULL,
				465	"h1", "p", "head", NULL,
				466	"h2", "p", "head", NULL,
				467	"h3", "p", "head", NULL,
				468	"h4", "p", "head", NULL,
				469	"h5", "p", "head", NULL,
				470	"h6", "p", "head", NULL,
				471	"dir", "p", "head", NULL,
				472	"address", "p", "head", "ul", NULL,
				473	"pre", "p", "head", "ul", NULL,
				474	"listing", "p", "head", NULL,
				475	"xmp", "p", "head", NULL,
				476	"blockquote", "p", "head", NULL,
				477	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				478	"xmp", "head", NULL,
				479	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				480	"head", "dd", NULL,
				481	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				482	"head", "dt", NULL,
				483	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				484	"listing", "xmp", NULL,
				485	"ol", "p", "head", "ul", NULL,
				486	"menu", "p", "head", "ul", NULL,
				487	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				488	"div", "p", "head", NULL,
				489	"noscript", "p", "head", NULL,
				490	"center", "font", "b", "i", "p", "head", NULL,
				491	"a", "a", NULL,
				492	"caption", "p", NULL,
				493	"colgroup", "caption", "colgroup", "col", "p", NULL,
				494	"col", "caption", "col", "p", NULL,
				495	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				496	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	497	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				498	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	499	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				500	"thead", "caption", "col", "colgroup", NULL,
				501	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				502	"tbody", "p", NULL,
				503	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				504	"tfoot", "tbody", "p", NULL,
				505	"optgroup", "option", NULL,
				506	"option", "option", NULL,
				507	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				508	"pre", "listing", "xmp", "a", NULL,
				509	NULL
				510	};
				511
				512	/*
				513	* The list of HTML elements which are supposed not to have
				514	* CDATA content and where a p element will be implied
				515	*
				516	* TODO: extend that list by reading the HTML SGML DtD on
				517	* implied paragraph
				518	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	519	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	520	"html",
				521	"head",
				522	"body",
				523	NULL
				524	};
				525
				526	/*
				527	* The list of HTML attributes which are of content %Script;
				528	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				529	* it assumes the name starts with 'on'
				530	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	531	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	532	"onclick",
				533	"ondblclick",
				534	"onmousedown",
				535	"onmouseup",
				536	"onmouseover",
				537	"onmousemove",
				538	"onmouseout",
				539	"onkeypress",
				540	"onkeydown",
				541	"onkeyup",
				542	"onload",
				543	"onunload",
				544	"onfocus",
				545	"onblur",
				546	"onsubmit",
				547	"onrest",
				548	"onchange",
				549	"onselect"
				550	};
				551
/*
 * End tag priorities, used by the HTML parser to cope with broken pages:
 * an end tag is only allowed to close open elements whose priority is
 * lower than or equal to its own, which lets the parser decide what to
 * do with extra end tags.
 */

/* Association of an element name with its end tag priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Elements with a specific priority.  The entry with a NULL name ends
 * the table and carries the priority of every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	579
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	580	static const char** htmlStartCloseIndex[100];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	581	static int htmlStartCloseIndexinitialized = 0;
				582
				583	/************************************************************************
				584	* *
				585	* functions to handle HTML specific data *
				586	* *
				587	************************************************************************/
				588
				589	/**
				590	* htmlInitAutoClose:
				591	*
				592	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				593	* This is not reentrant. Call xmlInitParser() once before processing in
				594	* case of use in multithreaded programs.
				595	*/
				596	void
				597	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	598	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	599
				600	if (htmlStartCloseIndexinitialized) return;
				601
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	602	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				603	indx = 0;
				604	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				605	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	606	while (htmlStartClose[i] != NULL) i++;
				607	i++;
				608	}
				609	htmlStartCloseIndexinitialized = 1;
				610	}
				611
				612	/**
				613	* htmlTagLookup:
				614	* @tag: The tag name in lowercase
				615	*
				616	* Lookup the HTML tag in the ElementTable
				617	*
				618	* Returns the related htmlElemDescPtr or NULL if not found.
				619	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	620	const htmlElemDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	621	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	622	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	623
				624	for (i = 0; i < (sizeof(html40ElementTable) /
				625	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	626	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	627	return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	628	}
				629	return(NULL);
				630	}
				631
				632	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	633	* htmlGetEndPriority:
				634	* @name: The name of the element to look up the priority for.
				635	*
				636	* Return value: The "endtag" priority.
				637	**/
				638	static int
				639	htmlGetEndPriority (const xmlChar *name) {
				640	int i = 0;
				641
				642	while ((htmlEndPriority[i].name != NULL) &&
				643	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				644	i++;
				645
				646	return(htmlEndPriority[i].priority);
				647	}
				648
				649	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	650	* htmlCheckAutoClose:
				651	* @newtag: The new tag name
				652	* @oldtag: The old tag name
				653	*
				654	* Checks wether the new tag is one of the registered valid tags for closing old.
				655	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				656	*
				657	* Returns 0 if no, 1 if yes.
				658	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	659	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	660	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	661	int i, indx;
				662	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	663
				664	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				665
				666	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	667	for (indx = 0; indx < 100;indx++) {
				668	closed = htmlStartCloseIndex[indx];
				669	if (closed == NULL) return(0);
				670	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	671	}
				672
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	673	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	674	i++;
				675	while (htmlStartClose[i] != NULL) {
				676	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				677	return(1);
				678	}
				679	i++;
				680	}
				681	return(0);
				682	}
				683
				684	/**
				685	* htmlAutoCloseOnClose:
				686	* @ctxt: an HTML parser context
				687	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	688	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	689	*
				690	* The HTmL DtD allows an ending tag to implicitely close other tags.
				691	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	692	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	693	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	694	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	695	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	696	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	697
				698	#ifdef DEBUG
				699	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				700	for (i = 0;i < ctxt->nameNr;i++)
				701	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				702	#endif
				703
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	704	priority = htmlGetEndPriority (newtag);
				705
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	706	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	707
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	708	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	709	/*
				710	* A missplaced endtagad can only close elements with lower
				711	* or equal priority, so if we find an element with higher
				712	* priority before we find an element with
				713	* matching name, we just ignore this endtag
				714	*/
				715	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	716	}
				717	if (i < 0) return;
				718
				719	while (!xmlStrEqual(newtag, ctxt->name)) {
				720	info = htmlTagLookup(ctxt->name);
				721	if ((info == NULL) \|\| (info->endTag == 1)) {
				722	#ifdef DEBUG
				723	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				724	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	725	} else if (info->endTag == 3) {
				726	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	727	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	728
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	729	#endif
				730	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				731	ctxt->sax->error(ctxt->userData,
				732	"Opening and ending tag mismatch: %s and %s\n",
				733	newtag, ctxt->name);
				734	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	735	}
				736	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				737	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				738	oldname = htmlnamePop(ctxt);
				739	if (oldname != NULL) {
				740	#ifdef DEBUG
				741	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				742	#endif
				743	xmlFree(oldname);
				744	}
				745	}
				746	}
				747
				748	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	749	* htmlAutoCloseOnEnd:
				750	* @ctxt: an HTML parser context
				751	*
				752	* Close all remaining tags at the end of the stream
				753	*/
				754	static void
				755	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				756	xmlChar *oldname;
				757	int i;
				758
				759	if (ctxt->nameNr == 0)
				760	return;
				761	#ifdef DEBUG
				762	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				763	#endif
				764
				765	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				766	#ifdef DEBUG
				767	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				768	#endif
				769	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				770	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				771	oldname = htmlnamePop(ctxt);
				772	if (oldname != NULL) {
				773	#ifdef DEBUG
				774	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				775	#endif
				776	xmlFree(oldname);
				777	}
				778	}
				779	}
				780
				781	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	782	* htmlAutoClose:
				783	* @ctxt: an HTML parser context
				784	* @newtag: The new tag name or NULL
				785	*
				786	* The HTmL DtD allows a tag to implicitely close other tags.
				787	* The list is kept in htmlStartClose array. This function is
				788	* called when a new tag has been detected and generates the
				789	* appropriates closes if possible/needed.
				790	* If newtag is NULL this mean we are at the end of the resource
				791	* and we should check
				792	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	793	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	794	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				795	xmlChar *oldname;
				796	while ((newtag != NULL) && (ctxt->name != NULL) &&
				797	(htmlCheckAutoClose(newtag, ctxt->name))) {
				798	#ifdef DEBUG
				799	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				800	#endif
				801	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				802	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				803	oldname = htmlnamePop(ctxt);
				804	if (oldname != NULL) {
				805	#ifdef DEBUG
				806	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				807	#endif
				808	xmlFree(oldname);
				809	}
				810	}
				811	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	812	htmlAutoCloseOnEnd(ctxt);
				813	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	814	}
				815	while ((newtag == NULL) && (ctxt->name != NULL) &&
				816	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				817	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				818	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				819	#ifdef DEBUG
				820	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				821	#endif
				822	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				823	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				824	oldname = htmlnamePop(ctxt);
				825	if (oldname != NULL) {
				826	#ifdef DEBUG
				827	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				828	#endif
				829	xmlFree(oldname);
				830	}
				831	}
				832
				833	}
				834
				835	/**
				836	* htmlAutoCloseTag:
				837	* @doc: the HTML document
				838	* @name: The tag name
				839	* @elem: the HTML element
				840	*
				841	* The HTmL DtD allows a tag to implicitely close other tags.
				842	* The list is kept in htmlStartClose array. This function checks
				843	* if the element or one of it's children would autoclose the
				844	* given tag.
				845	*
				846	* Returns 1 if autoclose, 0 otherwise
				847	*/
				848	int
				849	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				850	htmlNodePtr child;
				851
				852	if (elem == NULL) return(1);
				853	if (xmlStrEqual(name, elem->name)) return(0);
				854	if (htmlCheckAutoClose(elem->name, name)) return(1);
				855	child = elem->children;
				856	while (child != NULL) {
				857	if (htmlAutoCloseTag(doc, name, child)) return(1);
				858	child = child->next;
				859	}
				860	return(0);
				861	}
				862
				863	/**
				864	* htmlIsAutoClosed:
				865	* @doc: the HTML document
				866	* @elem: the HTML element
				867	*
				868	* The HTmL DtD allows a tag to implicitely close other tags.
				869	* The list is kept in htmlStartClose array. This function checks
				870	* if a tag is autoclosed by one of it's child
				871	*
				872	* Returns 1 if autoclosed, 0 otherwise
				873	*/
				874	int
				875	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				876	htmlNodePtr child;
				877
				878	if (elem == NULL) return(1);
				879	child = elem->children;
				880	while (child != NULL) {
				881	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				882	child = child->next;
				883	}
				884	return(0);
				885	}
				886
				887	/**
				888	* htmlCheckImplied:
				889	* @ctxt: an HTML parser context
				890	* @newtag: The new tag name
				891	*
				892	* The HTML DtD allows a tag to exists only implicitely
				893	* called when a new tag has been detected and generates the
				894	* appropriates implicit tags if missing
				895	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	896	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	897	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				898	if (!htmlOmittedDefaultValue)
				899	return;
				900	if (xmlStrEqual(newtag, BAD_CAST"html"))
				901	return;
				902	if (ctxt->nameNr <= 0) {
				903	#ifdef DEBUG
				904	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				905	#endif
				906	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				907	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				908	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				909	}
				910	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				911	return;
				912	if ((ctxt->nameNr <= 1) &&
				913	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				914	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				915	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				916	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				917	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				918	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				919	/*
				920	* dropped OBJECT ... i you put it first BODY will be
				921	* assumed !
				922	*/
				923	#ifdef DEBUG
				924	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				925	#endif
				926	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				927	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				928	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				929	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				930	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				931	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				932	int i;
				933	for (i = 0;i < ctxt->nameNr;i++) {
				934	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				935	return;
				936	}
				937	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				938	return;
				939	}
				940	}
				941
				942	#ifdef DEBUG
				943	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				944	#endif
				945	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				946	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				947	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				948	}
				949	}
				950
				951	/**
				952	* htmlCheckParagraph
				953	* @ctxt: an HTML parser context
				954	*
				955	* Check whether a p element need to be implied before inserting
				956	* characters in the current element.
				957	*
				958	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				959	* in case of error.
				960	*/
				961
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	962	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	963	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				964	const xmlChar *tag;
				965	int i;
				966
				967	if (ctxt == NULL)
				968	return(-1);
				969	tag = ctxt->name;
				970	if (tag == NULL) {
				971	htmlAutoClose(ctxt, BAD_CAST"p");
				972	htmlCheckImplied(ctxt, BAD_CAST"p");
				973	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				974	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				975	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				976	return(1);
				977	}
				978	if (!htmlOmittedDefaultValue)
				979	return(0);
				980	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				981	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				982	#ifdef DEBUG
				983	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				984	#endif
				985	htmlAutoClose(ctxt, BAD_CAST"p");
				986	htmlCheckImplied(ctxt, BAD_CAST"p");
				987	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				988	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				989	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				990	return(1);
				991	}
				992	}
				993	return(0);
				994	}
				995
				996	/**
				997	* htmlIsScriptAttribute:
				998	* @name: an attribute name
				999	*
				1000	* Check if an attribute is of content type Script
				1001	*
				1002	* Returns 1 is the attribute is a script 0 otherwise
				1003	*/
				1004	int
				1005	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1006	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1007
				1008	if (name == NULL)
				1009	return(0);
				1010	/*
				1011	* all script attributes start with 'on'
				1012	*/
				1013	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1014	return(0);
				1015	for (i = 0;
				1016	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1017	i++) {
				1018	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1019	return(1);
				1020	}
				1021	return(0);
				1022	}
				1023
/************************************************************************
 *									*
 *		The list of HTML predefined entities			*
 *									*
 ************************************************************************/
				1029
				1030
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1031	static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1032	/*
				1033	* the 4 absolute ones, plus apostrophe.
				1034	*/
				1035	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1036	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1037	{ 39, "apos", "single quote" },
				1038	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1039	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1040
				1041	/*
				1042	* A bunch still in the 128-255 range
				1043	* Replacing them depend really on the charset used.
				1044	*/
				1045	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1046	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1047	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1048	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1049	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1050	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1051	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1052	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1053	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1054	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1055	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1056	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1057	{ 172, "not", "not sign, U+00AC ISOnum" },
				1058	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1059	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1060	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1061	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1062	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1063	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1064	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1065	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1066	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1067	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1068	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1069	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1070	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1071	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1072	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1073	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1074	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1075	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1076	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1077	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1078	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1079	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1080	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1081	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1082	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1083	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1084	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1085	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1086	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1087	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1088	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1089	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1090	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1091	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1092	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1093	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1094	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1095	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1096	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1097	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1098	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1099	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1100	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1101	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1102	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1103	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1104	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1105	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1106	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1107	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1108	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1109	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1110	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1111	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1112	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1113	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1114	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1115	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1116	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1117	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1118	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1119	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1120	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1121	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1122	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1123	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1124	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1125	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1126	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1127	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1128	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1129	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1130	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1131	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1132	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1133	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1134	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1135	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1136	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1137	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1138	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1139	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1140	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1141
				1142	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1143	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1144	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1145	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1146	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1147
				1148	/*
				1149	* Anything below should really be kept as entities references
				1150	*/
				1151	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1152
				1153	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1154	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1155
				1156	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1157	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1158	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1159	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1160	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1161	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1162	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1163	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1164	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1165	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1166	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1167	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1168	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1169	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1170	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1171	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1172	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1173	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1174	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1175	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1176	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1177	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1178	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1179	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1180
				1181	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1182	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1183	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1184	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1185	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1186	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1187	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1188	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1189	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1190	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1191	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1192	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1193	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1194	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1195	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1196	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1197	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1198	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1199	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1200	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1201	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1202	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1203	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1204	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1205	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1206	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1207	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1208	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1209
				1210	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1211	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1212	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1213	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1214	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1215	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1216	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1217	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1218	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1219	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1220	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1221	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1222	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1223	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1224	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1225	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1226	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1227
				1228	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1229	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1230
				1231	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1232
				1233	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1234	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1235
				1236	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1237	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1238
				1239	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1240	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1241
				1242	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1243
				1244	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1245	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1246	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1247	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1248	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1249	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1250	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1251	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1252	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1253	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1254	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1255	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1256	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1257	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1258	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1259	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1260
				1261	{ 8704, "forall","for all, U+2200 ISOtech" },
				1262	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1263	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1264	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1265	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1266	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1267	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1268	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1269	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1270	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1271	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1272	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1273	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1274	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1275	{ 8734, "infin","infinity, U+221E ISOtech" },
				1276	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1277	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1278	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1279	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1280	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1281	{ 8747, "int", "integral, U+222B ISOtech" },
				1282	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1283	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1284	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1285	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1286	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1287	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1288	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1289	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1290	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1291	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1292	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1293	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1294	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1295	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1296	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1297	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1298	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1299	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1300	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1301	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1302	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1303	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1304	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1305	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1306
				1307	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1308	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1309	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1310	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1311
				1312	};
				1313
/************************************************************************
 *									*
 *		Commodity functions to handle entities			*
 *									*
 ************************************************************************/
				1319
				1320	/*
				1321	* Macro used to grow the current buffer.
				1322	*/
				1323	#define growBuffer(buffer) { \
				1324	buffer##_size *= 2; \
				1325	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1326	if (buffer == NULL) { \
				1327	perror("realloc failed"); \
				1328	return(NULL); \
				1329	} \
				1330	}
				1331
				1332	/**
				1333	* htmlEntityLookup:
				1334	* @name: the entity name
				1335	*
				1336	* Lookup the given entity in EntitiesTable
				1337	*
				1338	* TODO: the linear scan is really ugly, an hash table is really needed.
				1339	*
				1340	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1341	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1342	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1343	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1344	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1345
				1346	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1347	sizeof(html40EntitiesTable[0]));i++) {
				1348	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1349	#ifdef DEBUG
				1350	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1351	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1352	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1353	}
				1354	}
				1355	return(NULL);
				1356	}
				1357
				1358	/**
				1359	* htmlEntityValueLookup:
				1360	* @value: the entity's unicode value
				1361	*
				1362	* Lookup the given entity in EntitiesTable
				1363	*
				1364	* TODO: the linear scan is really ugly, an hash table is really needed.
				1365	*
				1366	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1367	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1368	const htmlEntityDesc *
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1369	htmlEntityValueLookup(unsigned int value) {
				1370	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1371	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1372	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1373	#endif
				1374
				1375	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1376	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1377	if (html40EntitiesTable[i].value >= value) {
				1378	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1379	break;
				1380	#ifdef DEBUG
				1381	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1382	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1383	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1384	}
				1385	#ifdef DEBUG
				1386	if (lv > html40EntitiesTable[i].value) {
				1387	xmlGenericError(xmlGenericErrorContext,
				1388	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1389	lv, html40EntitiesTable[i].value);
				1390	}
				1391	lv = html40EntitiesTable[i].value;
				1392	#endif
				1393	}
				1394	return(NULL);
				1395	}
				1396
				1397	/**
				1398	* UTF8ToHtml:
				1399	* @out: a pointer to an array of bytes to store the result
				1400	* @outlen: the length of @out
				1401	* @in: a pointer to an array of UTF-8 chars
				1402	* @inlen: the length of @in
				1403	*
				1404	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1405	* plus HTML entities block of chars out.
				1406	*
				1407	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1408	* The value of @inlen after return is the number of octets consumed
				1409	* as the return value is positive, else unpredictiable.
				1410	* The value of @outlen after return is the number of octets consumed.
				1411	*/
				1412	int
				1413	UTF8ToHtml(unsigned char* out, int *outlen,
				1414	const unsigned char* in, int *inlen) {
				1415	const unsigned char* processed = in;
				1416	const unsigned char* outend;
				1417	const unsigned char* outstart = out;
				1418	const unsigned char* instart = in;
				1419	const unsigned char* inend;
				1420	unsigned int c, d;
				1421	int trailing;
				1422
				1423	if (in == NULL) {
				1424	/*
				1425	* initialization nothing to do
				1426	*/
				1427	*outlen = 0;
				1428	*inlen = 0;
				1429	return(0);
				1430	}
				1431	inend = in + (*inlen);
				1432	outend = out + (*outlen);
				1433	while (in < inend) {
				1434	d = *in++;
				1435	if (d < 0x80) { c= d; trailing= 0; }
				1436	else if (d < 0xC0) {
				1437	/* trailing byte in leading position */
				1438	*outlen = out - outstart;
				1439	*inlen = processed - instart;
				1440	return(-2);
				1441	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1442	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1443	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1444	else {
				1445	/* no chance for this in Ascii */
				1446	*outlen = out - outstart;
				1447	*inlen = processed - instart;
				1448	return(-2);
				1449	}
				1450
				1451	if (inend - in < trailing) {
				1452	break;
				1453	}
				1454
				1455	for ( ; trailing; trailing--) {
				1456	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1457	break;
				1458	c <<= 6;
				1459	c \|= d & 0x3F;
				1460	}
				1461
				1462	/* assertion: c is a single UTF-4 value */
				1463	if (c < 0x80) {
				1464	if (out + 1 >= outend)
				1465	break;
				1466	*out++ = c;
				1467	} else {
				1468	int len;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1469	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1470
				1471	/*
				1472	* Try to lookup a predefined HTML entity for it
				1473	*/
				1474
				1475	ent = htmlEntityValueLookup(c);
				1476	if (ent == NULL) {
				1477	/* no chance for this in Ascii */
				1478	*outlen = out - outstart;
				1479	*inlen = processed - instart;
				1480	return(-2);
				1481	}
				1482	len = strlen(ent->name);
				1483	if (out + 2 + len >= outend)
				1484	break;
				1485	*out++ = '&';
				1486	memcpy(out, ent->name, len);
				1487	out += len;
				1488	*out++ = ';';
				1489	}
				1490	processed = in;
				1491	}
				1492	*outlen = out - outstart;
				1493	*inlen = processed - instart;
				1494	return(0);
				1495	}
				1496
				1497	/**
				1498	* htmlEncodeEntities:
				1499	* @out: a pointer to an array of bytes to store the result
				1500	* @outlen: the length of @out
				1501	* @in: a pointer to an array of UTF-8 chars
				1502	* @inlen: the length of @in
				1503	* @quoteChar: the quote character to escape (' or ") or zero.
				1504	*
				1505	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1506	* plus HTML entities block of chars out.
				1507	*
				1508	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1509	* The value of @inlen after return is the number of octets consumed
				1510	* as the return value is positive, else unpredictiable.
				1511	* The value of @outlen after return is the number of octets consumed.
				1512	*/
				1513	int
				1514	htmlEncodeEntities(unsigned char* out, int *outlen,
				1515	const unsigned char* in, int *inlen, int quoteChar) {
				1516	const unsigned char* processed = in;
				1517	const unsigned char* outend = out + (*outlen);
				1518	const unsigned char* outstart = out;
				1519	const unsigned char* instart = in;
				1520	const unsigned char* inend = in + (*inlen);
				1521	unsigned int c, d;
				1522	int trailing;
				1523
				1524	while (in < inend) {
				1525	d = *in++;
				1526	if (d < 0x80) { c= d; trailing= 0; }
				1527	else if (d < 0xC0) {
				1528	/* trailing byte in leading position */
				1529	*outlen = out - outstart;
				1530	*inlen = processed - instart;
				1531	return(-2);
				1532	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1533	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1534	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1535	else {
				1536	/* no chance for this in Ascii */
				1537	*outlen = out - outstart;
				1538	*inlen = processed - instart;
				1539	return(-2);
				1540	}
				1541
				1542	if (inend - in < trailing)
				1543	break;
				1544
				1545	while (trailing--) {
				1546	if (((d= *in++) & 0xC0) != 0x80) {
				1547	*outlen = out - outstart;
				1548	*inlen = processed - instart;
				1549	return(-2);
				1550	}
				1551	c <<= 6;
				1552	c \|= d & 0x3F;
				1553	}
				1554
				1555	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1556	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1557	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1558	if (out >= outend)
				1559	break;
				1560	*out++ = c;
				1561	} else {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1562	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1563	const char *cp;
				1564	char nbuf[16];
				1565	int len;
				1566
				1567	/*
				1568	* Try to lookup a predefined HTML entity for it
				1569	*/
				1570	ent = htmlEntityValueLookup(c);
				1571	if (ent == NULL) {
				1572	sprintf(nbuf, "#%u", c);
				1573	cp = nbuf;
				1574	}
				1575	else
				1576	cp = ent->name;
				1577	len = strlen(cp);
				1578	if (out + 2 + len > outend)
				1579	break;
				1580	*out++ = '&';
				1581	memcpy(out, cp, len);
				1582	out += len;
				1583	*out++ = ';';
				1584	}
				1585	processed = in;
				1586	}
				1587	*outlen = out - outstart;
				1588	*inlen = processed - instart;
				1589	return(0);
				1590	}
				1591
				1592	/**
				1593	* htmlDecodeEntities:
				1594	* @ctxt: the parser context
				1595	* @len: the len to decode (in bytes !), -1 for no size limit
				1596	* @end: an end marker xmlChar, 0 if none
				1597	* @end2: an end marker xmlChar, 0 if none
				1598	* @end3: an end marker xmlChar, 0 if none
				1599	*
				1600	* Subtitute the HTML entities by their value
				1601	*
				1602	* DEPRECATED !!!!
				1603	*
				1604	* Returns A newly allocated string with the substitution done. The caller
				1605	* must deallocate it !
				1606	*/
				1607	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1608	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1609	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1610	static int deprecated = 0;
				1611	if (!deprecated) {
				1612	xmlGenericError(xmlGenericErrorContext,
				1613	"htmlDecodeEntities() deprecated function reached\n");
				1614	deprecated = 1;
				1615	}
				1616	return(NULL);
				1617	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1618	xmlChar *name = NULL;
				1619	xmlChar *buffer = NULL;
				1620	unsigned int buffer_size = 0;
				1621	unsigned int nbchars = 0;
				1622	htmlEntityDescPtr ent;
				1623	unsigned int max = (unsigned int) len;
				1624	int c,l;
				1625
				1626	if (ctxt->depth > 40) {
				1627	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1628	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1629	ctxt->sax->error(ctxt->userData,
				1630	"Detected entity reference loop\n");
				1631	ctxt->wellFormed = 0;
				1632	ctxt->disableSAX = 1;
				1633	return(NULL);
				1634	}
				1635
				1636	/*
				1637	* allocate a translation buffer.
				1638	*/
				1639	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1640	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1641	if (buffer == NULL) {
				1642	perror("xmlDecodeEntities: malloc failed");
				1643	return(NULL);
				1644	}
				1645
				1646	/*
				1647	* Ok loop until we reach one of the ending char or a size limit.
				1648	*/
				1649	c = CUR_CHAR(l);
				1650	while ((nbchars < max) && (c != end) &&
				1651	(c != end2) && (c != end3)) {
				1652
				1653	if (c == 0) break;
				1654	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1655	int val = htmlParseCharRef(ctxt);
				1656	COPY_BUF(0,buffer,nbchars,val);
				1657	NEXTL(l);
				1658	} else if ((c == '&') && (ctxt->token != '&')) {
				1659	ent = htmlParseEntityRef(ctxt, &name);
				1660	if (name != NULL) {
				1661	if (ent != NULL) {
				1662	int val = ent->value;
				1663	COPY_BUF(0,buffer,nbchars,val);
				1664	NEXTL(l);
				1665	} else {
				1666	const xmlChar *cur = name;
				1667
				1668	buffer[nbchars++] = '&';
				1669	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1670	growBuffer(buffer);
				1671	}
				1672	while (*cur != 0) {
				1673	buffer[nbchars++] = *cur++;
				1674	}
				1675	buffer[nbchars++] = ';';
				1676	}
				1677	}
				1678	} else {
				1679	COPY_BUF(l,buffer,nbchars,c);
				1680	NEXTL(l);
				1681	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1682	growBuffer(buffer);
				1683	}
				1684	}
				1685	c = CUR_CHAR(l);
				1686	}
				1687	buffer[nbchars++] = 0;
				1688	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1689	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1690	}
				1691
				1692	/************************************************************************
				1693	* *
				1694	* Commodity functions to handle streams *
				1695	* *
				1696	************************************************************************/
				1697
				1698	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1699	* htmlNewInputStream:
				1700	* @ctxt: an HTML parser context
				1701	*
				1702	* Create a new input stream structure
				1703	* Returns the new input stream or NULL
				1704	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1705	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1706	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1707	htmlParserInputPtr input;
				1708
				1709	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1710	if (input == NULL) {
				1711	ctxt->errNo = XML_ERR_NO_MEMORY;
				1712	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1713	ctxt->sax->error(ctxt->userData,
				1714	"malloc: couldn't allocate a new input stream\n");
				1715	return(NULL);
				1716	}
				1717	memset(input, 0, sizeof(htmlParserInput));
				1718	input->filename = NULL;
				1719	input->directory = NULL;
				1720	input->base = NULL;
				1721	input->cur = NULL;
				1722	input->buf = NULL;
				1723	input->line = 1;
				1724	input->col = 1;
				1725	input->buf = NULL;
				1726	input->free = NULL;
				1727	input->version = NULL;
				1728	input->consumed = 0;
				1729	input->length = 0;
				1730	return(input);
				1731	}
				1732
				1733
				1734	/************************************************************************
				1735	* *
				1736	* Commodity functions, cleanup needed ? *
				1737	* *
				1738	************************************************************************/
				1739
				1740	/**
				1741	* areBlanks:
				1742	* @ctxt: an HTML parser context
				1743	* @str: a xmlChar *
				1744	* @len: the size of @str
				1745	*
				1746	* Is this a sequence of blank chars that one can ignore ?
				1747	*
				1748	* Returns 1 if ignorable 0 otherwise.
				1749	*/
				1750
				1751	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1752	int i;
				1753	xmlNodePtr lastChild;
				1754
				1755	for (i = 0;i < len;i++)
				1756	if (!(IS_BLANK(str[i]))) return(0);
				1757
				1758	if (CUR == 0) return(1);
				1759	if (CUR != '<') return(0);
				1760	if (ctxt->name == NULL)
				1761	return(1);
				1762	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1763	return(1);
				1764	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1765	return(1);
				1766	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1767	return(1);
				1768	if (ctxt->node == NULL) return(0);
				1769	lastChild = xmlGetLastChild(ctxt->node);
				1770	if (lastChild == NULL) {
Daniel Veillard	7db3773	2001-07-12 01:20:08 +0000	[diff] [blame]	1771	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				1772	(ctxt->node->content != NULL)) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1773	} else if (xmlNodeIsText(lastChild)) {
				1774	return(0);
				1775	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1776	return(0);
				1777	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1778	return(0);
				1779	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1780	return(0);
				1781	}
				1782	return(1);
				1783	}
				1784
				1785	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1786	* htmlNewDocNoDtD:
				1787	* @URI: URI for the dtd, or NULL
				1788	* @ExternalID: the external ID of the DTD, or NULL
				1789	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1790	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				1791	* are NULL
				1792	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1793	* Returns a new document, do not intialize the DTD if not provided
				1794	*/
				1795	htmlDocPtr
				1796	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1797	xmlDocPtr cur;
				1798
				1799	/*
				1800	* Allocate a new document and fill the fields.
				1801	*/
				1802	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1803	if (cur == NULL) {
				1804	xmlGenericError(xmlGenericErrorContext,
				1805	"xmlNewDoc : malloc failed\n");
				1806	return(NULL);
				1807	}
				1808	memset(cur, 0, sizeof(xmlDoc));
				1809
				1810	cur->type = XML_HTML_DOCUMENT_NODE;
				1811	cur->version = NULL;
				1812	cur->intSubset = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1813	cur->doc = cur;
				1814	cur->name = NULL;
				1815	cur->children = NULL;
				1816	cur->extSubset = NULL;
				1817	cur->oldNs = NULL;
				1818	cur->encoding = NULL;
				1819	cur->standalone = 1;
				1820	cur->compression = 0;
				1821	cur->ids = NULL;
				1822	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1823	cur->_private = NULL;
Daniel Veillard	b6b0fd8	2001-10-22 12:31:11 +0000	[diff] [blame^]	1824	if ((ExternalID != NULL) \|\|
				1825	(URI != NULL))
				1826	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1827	return(cur);
				1828	}
				1829
				1830	/**
				1831	* htmlNewDoc:
				1832	* @URI: URI for the dtd, or NULL
				1833	* @ExternalID: the external ID of the DTD, or NULL
				1834	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1835	* Creates a new HTML document
				1836	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1837	* Returns a new document
				1838	*/
				1839	htmlDocPtr
				1840	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1841	if ((URI == NULL) && (ExternalID == NULL))
				1842	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1843	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1844	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1845
				1846	return(htmlNewDocNoDtD(URI, ExternalID));
				1847	}
				1848
				1849
				1850	/************************************************************************
				1851	* *
				1852	* The parser itself *
				1853	* Relates to http://www.w3.org/TR/html40 *
				1854	* *
				1855	************************************************************************/
				1856
				1857	/************************************************************************
				1858	* *
				1859	* The parser itself *
				1860	* *
				1861	************************************************************************/
				1862
				1863	/**
				1864	* htmlParseHTMLName:
				1865	* @ctxt: an HTML parser context
				1866	*
				1867	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1868	* since HTML names are not case-sensitive.
				1869	*
				1870	* Returns the Tag Name parsed or NULL
				1871	*/
				1872
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1873	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1874	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1875	xmlChar *ret = NULL;
				1876	int i = 0;
				1877	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1878
				1879	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1880	(CUR != ':')) return(NULL);
				1881
				1882	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1883	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1884	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1885	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1886	else loc[i] = CUR;
				1887	i++;
				1888
				1889	NEXT;
				1890	}
				1891
				1892	ret = xmlStrndup(loc, i);
				1893
				1894	return(ret);
				1895	}
				1896
				1897	/**
				1898	* htmlParseName:
				1899	* @ctxt: an HTML parser context
				1900	*
				1901	* parse an HTML name, this routine is case sensistive.
				1902	*
				1903	* Returns the Name parsed or NULL
				1904	*/
				1905
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1906	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1907	htmlParseName(htmlParserCtxtPtr ctxt) {
				1908	xmlChar buf[HTML_MAX_NAMELEN];
				1909	int len = 0;
				1910
				1911	GROW;
				1912	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1913	return(NULL);
				1914	}
				1915
				1916	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1917	(CUR == '.') \|\| (CUR == '-') \|\|
				1918	(CUR == '_') \|\| (CUR == ':') \|\|
				1919	(IS_COMBINING(CUR)) \|\|
				1920	(IS_EXTENDER(CUR))) {
				1921	buf[len++] = CUR;
				1922	NEXT;
				1923	if (len >= HTML_MAX_NAMELEN) {
				1924	xmlGenericError(xmlGenericErrorContext,
				1925	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1926	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1927	(CUR == '.') \|\| (CUR == '-') \|\|
				1928	(CUR == '_') \|\| (CUR == ':') \|\|
				1929	(IS_COMBINING(CUR)) \|\|
				1930	(IS_EXTENDER(CUR)))
				1931	NEXT;
				1932	break;
				1933	}
				1934	}
				1935	return(xmlStrndup(buf, len));
				1936	}
				1937
				1938	/**
				1939	* htmlParseHTMLAttribute:
				1940	* @ctxt: an HTML parser context
				1941	* @stop: a char stop value
				1942	*
				1943	* parse an HTML attribute value till the stop (quote), if
				1944	* stop is 0 then it stops at the first space
				1945	*
				1946	* Returns the attribute parsed or NULL
				1947	*/
				1948
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1949	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1950	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1951	xmlChar *buffer = NULL;
				1952	int buffer_size = 0;
				1953	xmlChar *out = NULL;
				1954	xmlChar *name = NULL;
				1955
				1956	xmlChar *cur = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1957	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1958
				1959	/*
				1960	* allocate a translation buffer.
				1961	*/
				1962	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1963	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1964	if (buffer == NULL) {
				1965	perror("htmlParseHTMLAttribute: malloc failed");
				1966	return(NULL);
				1967	}
				1968	out = buffer;
				1969
				1970	/*
				1971	* Ok loop until we reach one of the ending chars
				1972	*/
				1973	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1974	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1975	if (CUR == '&') {
				1976	if (NXT(1) == '#') {
				1977	unsigned int c;
				1978	int bits;
				1979
				1980	c = htmlParseCharRef(ctxt);
				1981	if (c < 0x80)
				1982	{ *out++ = c; bits= -6; }
				1983	else if (c < 0x800)
				1984	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1985	else if (c < 0x10000)
				1986	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1987	else
				1988	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1989
				1990	for ( ; bits >= 0; bits-= 6) {
				1991	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1992	}
				1993	} else {
				1994	ent = htmlParseEntityRef(ctxt, &name);
				1995	if (name == NULL) {
				1996	*out++ = '&';
				1997	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1998	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1999
				2000	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2001	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2002	}
				2003	} else if (ent == NULL) {
				2004	*out++ = '&';
				2005	cur = name;
				2006	while (*cur != 0) {
				2007	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2008	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2009
				2010	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2011	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2012	}
				2013	out++ = cur++;
				2014	}
				2015	xmlFree(name);
				2016	} else {
				2017	unsigned int c;
				2018	int bits;
				2019
				2020	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2021	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2022
				2023	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2024	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2025	}
				2026	c = (xmlChar)ent->value;
				2027	if (c < 0x80)
				2028	{ *out++ = c; bits= -6; }
				2029	else if (c < 0x800)
				2030	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2031	else if (c < 0x10000)
				2032	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2033	else
				2034	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2035
				2036	for ( ; bits >= 0; bits-= 6) {
				2037	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2038	}
				2039	xmlFree(name);
				2040	}
				2041	}
				2042	} else {
				2043	unsigned int c;
				2044	int bits, l;
				2045
				2046	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2047	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2048
				2049	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2050	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2051	}
				2052	c = CUR_CHAR(l);
				2053	if (c < 0x80)
				2054	{ *out++ = c; bits= -6; }
				2055	else if (c < 0x800)
				2056	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2057	else if (c < 0x10000)
				2058	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2059	else
				2060	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2061
				2062	for ( ; bits >= 0; bits-= 6) {
				2063	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2064	}
				2065	NEXT;
				2066	}
				2067	}
				2068	*out++ = 0;
				2069	return(buffer);
				2070	}
				2071
				2072	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2073	* htmlParseEntityRef:
				2074	* @ctxt: an HTML parser context
				2075	* @str: location to store the entity name
				2076	*
				2077	* parse an HTML ENTITY references
				2078	*
				2079	* [68] EntityRef ::= '&' Name ';'
				2080	*
				2081	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2082	* if non-NULL *str will have to be freed by the caller.
				2083	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2084	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2085	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2086	xmlChar *name;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2087	const htmlEntityDesc * ent = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2088	*str = NULL;
				2089
				2090	if (CUR == '&') {
				2091	NEXT;
				2092	name = htmlParseName(ctxt);
				2093	if (name == NULL) {
				2094	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2095	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2096	ctxt->wellFormed = 0;
				2097	} else {
				2098	GROW;
				2099	if (CUR == ';') {
				2100	*str = name;
				2101
				2102	/*
				2103	* Lookup the entity in the table.
				2104	*/
				2105	ent = htmlEntityLookup(name);
				2106	if (ent != NULL) /* OK that's ugly !!! */
				2107	NEXT;
				2108	} else {
				2109	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2110	ctxt->sax->error(ctxt->userData,
				2111	"htmlParseEntityRef: expecting ';'\n");
				2112	*str = name;
				2113	}
				2114	}
				2115	}
				2116	return(ent);
				2117	}
				2118
				2119	/**
				2120	* htmlParseAttValue:
				2121	* @ctxt: an HTML parser context
				2122	*
				2123	* parse a value for an attribute
				2124	* Note: the parser won't do substitution of entities here, this
				2125	* will be handled later in xmlStringGetNodeList, unless it was
				2126	* asked for ctxt->replaceEntities != 0
				2127	*
				2128	* Returns the AttValue parsed or NULL.
				2129	*/
				2130
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2131	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2132	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2133	xmlChar *ret = NULL;
				2134
				2135	if (CUR == '"') {
				2136	NEXT;
				2137	ret = htmlParseHTMLAttribute(ctxt, '"');
				2138	if (CUR != '"') {
				2139	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2140	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2141	ctxt->wellFormed = 0;
				2142	} else
				2143	NEXT;
				2144	} else if (CUR == '\'') {
				2145	NEXT;
				2146	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2147	if (CUR != '\'') {
				2148	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2149	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2150	ctxt->wellFormed = 0;
				2151	} else
				2152	NEXT;
				2153	} else {
				2154	/*
				2155	* That's an HTMLism, the attribute value may not be quoted
				2156	*/
				2157	ret = htmlParseHTMLAttribute(ctxt, 0);
				2158	if (ret == NULL) {
				2159	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2160	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2161	ctxt->wellFormed = 0;
				2162	}
				2163	}
				2164	return(ret);
				2165	}
				2166
				2167	/**
				2168	* htmlParseSystemLiteral:
				2169	* @ctxt: an HTML parser context
				2170	*
				2171	* parse an HTML Literal
				2172	*
				2173	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2174	*
				2175	* Returns the SystemLiteral parsed or NULL
				2176	*/
				2177
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2178	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2179	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2180	const xmlChar *q;
				2181	xmlChar *ret = NULL;
				2182
				2183	if (CUR == '"') {
				2184	NEXT;
				2185	q = CUR_PTR;
				2186	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2187	NEXT;
				2188	if (!IS_CHAR(CUR)) {
				2189	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2190	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2191	ctxt->wellFormed = 0;
				2192	} else {
				2193	ret = xmlStrndup(q, CUR_PTR - q);
				2194	NEXT;
				2195	}
				2196	} else if (CUR == '\'') {
				2197	NEXT;
				2198	q = CUR_PTR;
				2199	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2200	NEXT;
				2201	if (!IS_CHAR(CUR)) {
				2202	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2203	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2204	ctxt->wellFormed = 0;
				2205	} else {
				2206	ret = xmlStrndup(q, CUR_PTR - q);
				2207	NEXT;
				2208	}
				2209	} else {
				2210	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2211	ctxt->sax->error(ctxt->userData,
				2212	"SystemLiteral \" or ' expected\n");
				2213	ctxt->wellFormed = 0;
				2214	}
				2215
				2216	return(ret);
				2217	}
				2218
				2219	/**
				2220	* htmlParsePubidLiteral:
				2221	* @ctxt: an HTML parser context
				2222	*
				2223	* parse an HTML public literal
				2224	*
				2225	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2226	*
				2227	* Returns the PubidLiteral parsed or NULL.
				2228	*/
				2229
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2230	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2231	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2232	const xmlChar *q;
				2233	xmlChar *ret = NULL;
				2234	/*
				2235	* Name ::= (Letter \| '_') (NameChar)*
				2236	*/
				2237	if (CUR == '"') {
				2238	NEXT;
				2239	q = CUR_PTR;
				2240	while (IS_PUBIDCHAR(CUR)) NEXT;
				2241	if (CUR != '"') {
				2242	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2243	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2244	ctxt->wellFormed = 0;
				2245	} else {
				2246	ret = xmlStrndup(q, CUR_PTR - q);
				2247	NEXT;
				2248	}
				2249	} else if (CUR == '\'') {
				2250	NEXT;
				2251	q = CUR_PTR;
				2252	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2253	NEXT;
				2254	if (!IS_LETTER(CUR)) {
				2255	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2256	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2257	ctxt->wellFormed = 0;
				2258	} else {
				2259	ret = xmlStrndup(q, CUR_PTR - q);
				2260	NEXT;
				2261	}
				2262	} else {
				2263	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2264	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2265	ctxt->wellFormed = 0;
				2266	}
				2267
				2268	return(ret);
				2269	}
				2270
				2271	/**
				2272	* htmlParseScript:
				2273	* @ctxt: an HTML parser context
				2274	*
				2275	* parse the content of an HTML SCRIPT or STYLE element
				2276	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2277	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2278	* http://www.w3.org/TR/html4/types.html#type-script
				2279	* http://www.w3.org/TR/html4/types.html#h-6.15
				2280	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2281	*
				2282	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2283	* element and the value of intrinsic event attributes. User agents must
				2284	* not evaluate script data as HTML markup but instead must pass it on as
				2285	* data to a script engine.
				2286	* NOTES:
				2287	* - The content is passed like CDATA
				2288	* - the attributes for style and scripting "onXXX" are also described
				2289	* as CDATA but SGML allows entities references in attributes so their
				2290	* processing is identical as other attributes
				2291	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2292	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2293	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2294	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2295	int nbchar = 0;
				2296	xmlChar cur;
				2297
				2298	SHRINK;
				2299	cur = CUR;
				2300	while (IS_CHAR(cur)) {
				2301	if ((cur == '<') && (NXT(1) == '/')) {
				2302	/*
				2303	* One should break here, the specification is clear:
				2304	* Authors should therefore escape "</" within the content.
				2305	* Escape mechanisms are specific to each scripting or
				2306	* style sheet language.
				2307	*/
				2308	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2309	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2310	break; /* while */
				2311	}
				2312	buf[nbchar++] = cur;
				2313	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2314	if (ctxt->sax->cdataBlock!= NULL) {
				2315	/*
				2316	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2317	*/
				2318	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2319	}
				2320	nbchar = 0;
				2321	}
				2322	NEXT;
				2323	cur = CUR;
				2324	}
				2325	if (!(IS_CHAR(cur))) {
				2326	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2327	ctxt->sax->error(ctxt->userData,
				2328	"Invalid char in CDATA 0x%X\n", cur);
				2329	ctxt->wellFormed = 0;
				2330	NEXT;
				2331	}
				2332
				2333	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2334	if (ctxt->sax->cdataBlock!= NULL) {
				2335	/*
				2336	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2337	*/
				2338	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2339	}
				2340	}
				2341	}
				2342
				2343
				2344	/**
				2345	* htmlParseCharData:
				2346	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2347	*
				2348	* parse a CharData section.
				2349	* if we are within a CDATA section ']]>' marks an end of section.
				2350	*
				2351	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2352	*/
				2353
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2354	static void
				2355	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2356	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2357	int nbchar = 0;
				2358	int cur, l;
				2359
				2360	SHRINK;
				2361	cur = CUR_CHAR(l);
				2362	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2363	((cur != '&') \|\| (ctxt->token == '&')) &&
				2364	(IS_CHAR(cur))) {
				2365	COPY_BUF(l,buf,nbchar,cur);
				2366	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2367	/*
				2368	* Ok the segment is to be consumed as chars.
				2369	*/
				2370	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2371	if (areBlanks(ctxt, buf, nbchar)) {
				2372	if (ctxt->sax->ignorableWhitespace != NULL)
				2373	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2374	buf, nbchar);
				2375	} else {
				2376	htmlCheckParagraph(ctxt);
				2377	if (ctxt->sax->characters != NULL)
				2378	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2379	}
				2380	}
				2381	nbchar = 0;
				2382	}
				2383	NEXTL(l);
				2384	cur = CUR_CHAR(l);
				2385	}
				2386	if (nbchar != 0) {
				2387	/*
				2388	* Ok the segment is to be consumed as chars.
				2389	*/
				2390	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2391	if (areBlanks(ctxt, buf, nbchar)) {
				2392	if (ctxt->sax->ignorableWhitespace != NULL)
				2393	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2394	} else {
				2395	htmlCheckParagraph(ctxt);
				2396	if (ctxt->sax->characters != NULL)
				2397	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2398	}
				2399	}
Daniel Veillard	7cc95c0	2001-10-17 15:45:12 +0000	[diff] [blame]	2400	} else {
				2401	/*
				2402	* Loop detection
				2403	*/
				2404	if (cur == 0)
				2405	ctxt->instate = XML_PARSER_EOF;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2406	}
				2407	}
				2408
				2409	/**
				2410	* htmlParseExternalID:
				2411	* @ctxt: an HTML parser context
				2412	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2413	*
				2414	* Parse an External ID or a Public ID
				2415	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2416	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2417	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2418	*
				2419	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2420	*
				2421	* Returns the SystemLiteral; in the second (PUBLIC) case publicID
				2422	* also receives the PubidLiteral. Since the SystemLiteral is optional
				2423	* after PUBLIC here, it is possible to return NULL and have publicID set.
				2424	*/
				2425
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2426	static xmlChar *
				2427	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2428	xmlChar *URI = NULL;
				2429
				2430	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2431	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2432	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2433	SKIP(6);
				2434	if (!IS_BLANK(CUR)) {
				2435	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2436	ctxt->sax->error(ctxt->userData,
				2437	"Space required after 'SYSTEM'\n");
				2438	ctxt->wellFormed = 0;
				2439	}
				2440	SKIP_BLANKS;
				2441	URI = htmlParseSystemLiteral(ctxt);
				2442	if (URI == NULL) {
				2443	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2444	ctxt->sax->error(ctxt->userData,
				2445	"htmlParseExternalID: SYSTEM, no URI\n");
				2446	ctxt->wellFormed = 0;
				2447	}
				2448	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2449	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2450	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2451	SKIP(6);
				2452	if (!IS_BLANK(CUR)) {
				2453	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2454	ctxt->sax->error(ctxt->userData,
				2455	"Space required after 'PUBLIC'\n");
				2456	ctxt->wellFormed = 0;
				2457	}
				2458	SKIP_BLANKS;
				2459	*publicID = htmlParsePubidLiteral(ctxt);
				2460	if (*publicID == NULL) {
				2461	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2462	ctxt->sax->error(ctxt->userData,
				2463	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2464	ctxt->wellFormed = 0;
				2465	}
				2466	SKIP_BLANKS;
				2467	if ((CUR == '"') \|\| (CUR == '\'')) {
				2468	URI = htmlParseSystemLiteral(ctxt);
				2469	}
				2470	}
				2471	return(URI);
				2472	}
				2473
				2474	/**
				2475	* htmlParseComment:
				2476	* @ctxt: an HTML parser context
				2477	*
				2478	* Parse an XML (SGML) comment <!-- .... -->
				2479	*
				2480	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2481	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2482	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2483	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2484	xmlChar *buf = NULL;
				2485	int len;
				2486	int size = HTML_PARSER_BUFFER_SIZE;
				2487	int q, ql;
				2488	int r, rl;
				2489	int cur, l;
				2490	xmlParserInputState state;
				2491
				2492	/*
				2493	* Check that there is a comment right here.
				2494	*/
				2495	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2496	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2497
				2498	state = ctxt->instate;
				2499	ctxt->instate = XML_PARSER_COMMENT;
				2500	SHRINK;
				2501	SKIP(4);
				2502	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2503	if (buf == NULL) {
				2504	xmlGenericError(xmlGenericErrorContext,
				2505	"malloc of %d byte failed\n", size);
				2506	ctxt->instate = state;
				2507	return;
				2508	}
				2509	q = CUR_CHAR(ql);
				2510	NEXTL(ql);
				2511	r = CUR_CHAR(rl);
				2512	NEXTL(rl);
				2513	cur = CUR_CHAR(l);
				2514	len = 0;
				2515	while (IS_CHAR(cur) &&
				2516	((cur != '>') \|\|
				2517	(r != '-') \|\| (q != '-'))) {
				2518	if (len + 5 >= size) {
				2519	size *= 2;
				2520	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2521	if (buf == NULL) {
				2522	xmlGenericError(xmlGenericErrorContext,
				2523	"realloc of %d byte failed\n", size);
				2524	ctxt->instate = state;
				2525	return;
				2526	}
				2527	}
				2528	COPY_BUF(ql,buf,len,q);
				2529	q = r;
				2530	ql = rl;
				2531	r = cur;
				2532	rl = l;
				2533	NEXTL(l);
				2534	cur = CUR_CHAR(l);
				2535	if (cur == 0) {
				2536	SHRINK;
				2537	GROW;
				2538	cur = CUR_CHAR(l);
				2539	}
				2540	}
				2541	buf[len] = 0;
				2542	if (!IS_CHAR(cur)) {
				2543	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2544	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2545	ctxt->sax->error(ctxt->userData,
				2546	"Comment not terminated \n<!--%.50s\n", buf);
				2547	ctxt->wellFormed = 0;
				2548	xmlFree(buf);
				2549	} else {
				2550	NEXT;
				2551	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2552	(!ctxt->disableSAX))
				2553	ctxt->sax->comment(ctxt->userData, buf);
				2554	xmlFree(buf);
				2555	}
				2556	ctxt->instate = state;
				2557	}
				2558
				2559	/**
				2560	* htmlParseCharRef:
				2561	* @ctxt: an HTML parser context
				2562	*
				2563	* parse Reference declarations
				2564	*
				2565	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2566	* '&#x' [0-9a-fA-F]+ ';'
				2567	*
				2568	* Returns the value parsed (as an int)
				2569	*/
				2570	int
				2571	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2572	int val = 0;
				2573
				2574	if ((CUR == '&') && (NXT(1) == '#') &&
				2575	(NXT(2) == 'x')) {
				2576	SKIP(3);
				2577	while (CUR != ';') {
				2578	if ((CUR >= '0') && (CUR <= '9'))
				2579	val = val * 16 + (CUR - '0');
				2580	else if ((CUR >= 'a') && (CUR <= 'f'))
				2581	val = val * 16 + (CUR - 'a') + 10;
				2582	else if ((CUR >= 'A') && (CUR <= 'F'))
				2583	val = val * 16 + (CUR - 'A') + 10;
				2584	else {
				2585	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2586	ctxt->sax->error(ctxt->userData,
				2587	"htmlParseCharRef: invalid hexadecimal value\n");
				2588	ctxt->wellFormed = 0;
				2589	return(0);
				2590	}
				2591	NEXT;
				2592	}
				2593	if (CUR == ';')
				2594	NEXT;
				2595	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2596	SKIP(2);
				2597	while (CUR != ';') {
				2598	if ((CUR >= '0') && (CUR <= '9'))
				2599	val = val * 10 + (CUR - '0');
				2600	else {
				2601	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2602	ctxt->sax->error(ctxt->userData,
				2603	"htmlParseCharRef: invalid decimal value\n");
				2604	ctxt->wellFormed = 0;
				2605	return(0);
				2606	}
				2607	NEXT;
				2608	}
				2609	if (CUR == ';')
				2610	NEXT;
				2611	} else {
				2612	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2613	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2614	ctxt->wellFormed = 0;
				2615	}
				2616	/*
				2617	* Check the value IS_CHAR ...
				2618	*/
				2619	if (IS_CHAR(val)) {
				2620	return(val);
				2621	} else {
				2622	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2623	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2624	val);
				2625	ctxt->wellFormed = 0;
				2626	}
				2627	return(0);
				2628	}
				2629
				2630
				2631	/**
				2632	* htmlParseDocTypeDecl :
				2633	* @ctxt: an HTML parser context
				2634	*
				2635	* parse a DOCTYPE declaration
				2636	*
				2637	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2638	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2639	*/
				2640
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2641	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2642	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2643	xmlChar *name;
				2644	xmlChar *ExternalID = NULL;
				2645	xmlChar *URI = NULL;
				2646
				2647	/*
				2648	* We know that '<!DOCTYPE' has been detected.
				2649	*/
				2650	SKIP(9);
				2651
				2652	SKIP_BLANKS;
				2653
				2654	/*
				2655	* Parse the DOCTYPE name.
				2656	*/
				2657	name = htmlParseName(ctxt);
				2658	if (name == NULL) {
				2659	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2660	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2661	ctxt->wellFormed = 0;
				2662	}
				2663	/*
				2664	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2665	*/
				2666
				2667	SKIP_BLANKS;
				2668
				2669	/*
				2670	* Check for SystemID and ExternalID
				2671	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2672	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2673	SKIP_BLANKS;
				2674
				2675	/*
				2676	* We should be at the end of the DOCTYPE declaration.
				2677	*/
				2678	if (CUR != '>') {
				2679	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard	f6ed8bc	2001-10-02 09:22:47 +0000	[diff] [blame]	2680	ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2681	ctxt->wellFormed = 0;
				2682	/* We shouldn't try to resynchronize ... */
				2683	}
				2684	NEXT;
				2685
				2686	/*
				2687	* Create or update the document accordingly to the DOCTYPE
				2688	*/
				2689	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2690	(!ctxt->disableSAX))
				2691	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2692
				2693	/*
				2694	* Cleanup, since we don't use all those identifiers
				2695	*/
				2696	if (URI != NULL) xmlFree(URI);
				2697	if (ExternalID != NULL) xmlFree(ExternalID);
				2698	if (name != NULL) xmlFree(name);
				2699	}
				2700
				2701	/**
				2702	* htmlParseAttribute:
				2703	* @ctxt: an HTML parser context
				2704	* @value: a xmlChar ** used to store the value of the attribute
				2705	*
				2706	* parse an attribute
				2707	*
				2708	* [41] Attribute ::= Name Eq AttValue
				2709	*
				2710	* [25] Eq ::= S? '=' S?
				2711	*
				2712	* With namespace:
				2713	*
				2714	* [NS 11] Attribute ::= QName Eq AttValue
				2715	*
				2716	* Also the case QName == xmlns:??? is handled independently as a namespace
				2717	* definition.
				2718	*
				2719	* Returns the attribute name, and the value in *value.
				2720	*/
				2721
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2722	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2723	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2724	xmlChar name, val = NULL;
				2725
				2726	*value = NULL;
				2727	name = htmlParseHTMLName(ctxt);
				2728	if (name == NULL) {
				2729	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2730	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2731	ctxt->wellFormed = 0;
				2732	return(NULL);
				2733	}
				2734
				2735	/*
				2736	* read the value
				2737	*/
				2738	SKIP_BLANKS;
				2739	if (CUR == '=') {
				2740	NEXT;
				2741	SKIP_BLANKS;
				2742	val = htmlParseAttValue(ctxt);
				2743	/******
				2744	} else {
				2745	* TODO : some attribute must have values, some may not
				2746	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2747	ctxt->sax->warning(ctxt->userData,
				2748	"No value for attribute %s\n", name); */
				2749	}
				2750
				2751	*value = val;
				2752	return(name);
				2753	}
				2754
				2755	/**
				2756	* htmlCheckEncoding:
				2757	* @ctxt: an HTML parser context
				2758	* @attvalue: the attribute value
				2759	*
				2760	* Checks an http-equiv attribute from a Meta tag to detect
				2761	* the encoding
				2762	* If a new encoding is detected the parser is switched to decode
				2763	* it and pass UTF8
				2764	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2765	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2766	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2767	const xmlChar *encoding;
				2768
				2769	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2770	return;
				2771
				2772	/* do not change encoding */
				2773	if (ctxt->input->encoding != NULL)
				2774	return;
				2775
				2776	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2777	if (encoding != NULL) {
				2778	encoding += 8;
				2779	} else {
				2780	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2781	if (encoding != NULL)
				2782	encoding += 9;
				2783	}
				2784	if (encoding != NULL) {
				2785	xmlCharEncoding enc;
				2786	xmlCharEncodingHandlerPtr handler;
				2787
				2788	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2789
				2790	if (ctxt->input->encoding != NULL)
				2791	xmlFree((xmlChar *) ctxt->input->encoding);
				2792	ctxt->input->encoding = xmlStrdup(encoding);
				2793
				2794	enc = xmlParseCharEncoding((const char *) encoding);
				2795	/*
				2796	* registered set of known encodings
				2797	*/
				2798	if (enc != XML_CHAR_ENCODING_ERROR) {
				2799	xmlSwitchEncoding(ctxt, enc);
				2800	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2801	} else {
				2802	/*
				2803	* fallback for unknown encodings
				2804	*/
				2805	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2806	if (handler != NULL) {
				2807	xmlSwitchToEncoding(ctxt, handler);
				2808	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2809	} else {
				2810	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2811	}
				2812	}
				2813
				2814	if ((ctxt->input->buf != NULL) &&
				2815	(ctxt->input->buf->encoder != NULL) &&
				2816	(ctxt->input->buf->raw != NULL) &&
				2817	(ctxt->input->buf->buffer != NULL)) {
				2818	int nbchars;
				2819	int processed;
				2820
				2821	/*
				2822	* convert as much as possible to the parser reading buffer.
				2823	*/
				2824	processed = ctxt->input->cur - ctxt->input->base;
				2825	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2826	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2827	ctxt->input->buf->buffer,
				2828	ctxt->input->buf->raw);
				2829	if (nbchars < 0) {
				2830	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2831	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2832	ctxt->sax->error(ctxt->userData,
				2833	"htmlCheckEncoding: encoder error\n");
				2834	}
				2835	ctxt->input->base =
				2836	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2837	}
				2838	}
				2839	}
				2840
				2841	/**
				2842	* htmlCheckMeta:
				2843	* @ctxt: an HTML parser context
				2844	* @atts: the attributes values
				2845	*
				2846	* Checks the attributes of a Meta tag
				2847	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2848	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2849	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2850	int i;
				2851	const xmlChar att, value;
				2852	int http = 0;
				2853	const xmlChar *content = NULL;
				2854
				2855	if ((ctxt == NULL) \|\| (atts == NULL))
				2856	return;
				2857
				2858	i = 0;
				2859	att = atts[i++];
				2860	while (att != NULL) {
				2861	value = atts[i++];
				2862	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2863	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2864	http = 1;
				2865	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2866	content = value;
				2867	att = atts[i++];
				2868	}
				2869	if ((http) && (content != NULL))
				2870	htmlCheckEncoding(ctxt, content);
				2871
				2872	}
				2873
				2874	/**
				2875	* htmlParseStartTag:
				2876	* @ctxt: an HTML parser context
				2877	*
				2878	* parse a start of tag either for rule element or
				2879	* EmptyElement. In both case we don't parse the tag closing chars.
				2880	*
				2881	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2882	*
				2883	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2884	*
				2885	* With namespace:
				2886	*
				2887	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2888	*
				2889	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2890	*
				2891	*/
				2892
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2893	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2894	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2895	xmlChar *name;
				2896	xmlChar *attname;
				2897	xmlChar *attvalue;
				2898	const xmlChar **atts = NULL;
				2899	int nbatts = 0;
				2900	int maxatts = 0;
				2901	int meta = 0;
				2902	int i;
				2903
				2904	if (CUR != '<') return;
				2905	NEXT;
				2906
				2907	GROW;
				2908	name = htmlParseHTMLName(ctxt);
				2909	if (name == NULL) {
				2910	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2911	ctxt->sax->error(ctxt->userData,
				2912	"htmlParseStartTag: invalid element name\n");
				2913	ctxt->wellFormed = 0;
				2914	/* Dump the bogus tag like browsers do */
				2915	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2916	NEXT;
				2917	return;
				2918	}
				2919	if (xmlStrEqual(name, BAD_CAST"meta"))
				2920	meta = 1;
				2921
				2922	/*
				2923	* Check for auto-closure of HTML elements.
				2924	*/
				2925	htmlAutoClose(ctxt, name);
				2926
				2927	/*
				2928	* Check for implied HTML elements.
				2929	*/
				2930	htmlCheckImplied(ctxt, name);
				2931
				2932	/*
				2933	* Avoid html at any level > 0, head at any level != 1
				2934	* or any attempt to recurse body
				2935	*/
				2936	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2937	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2938	ctxt->sax->error(ctxt->userData,
				2939	"htmlParseStartTag: misplaced <html> tag\n");
				2940	ctxt->wellFormed = 0;
				2941	xmlFree(name);
				2942	return;
				2943	}
				2944	if ((ctxt->nameNr != 1) &&
				2945	(xmlStrEqual(name, BAD_CAST"head"))) {
				2946	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2947	ctxt->sax->error(ctxt->userData,
				2948	"htmlParseStartTag: misplaced <head> tag\n");
				2949	ctxt->wellFormed = 0;
				2950	xmlFree(name);
				2951	return;
				2952	}
				2953	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2954	int indx;
				2955	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2956	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2957	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2958	ctxt->sax->error(ctxt->userData,
				2959	"htmlParseStartTag: misplaced <body> tag\n");
				2960	ctxt->wellFormed = 0;
				2961	xmlFree(name);
				2962	return;
				2963	}
				2964	}
				2965	}
				2966
				2967	/*
				2968	* Now parse the attributes, it ends up with the ending
				2969	*
				2970	* (S Attribute)* S?
				2971	*/
				2972	SKIP_BLANKS;
				2973	while ((IS_CHAR(CUR)) &&
				2974	(CUR != '>') &&
				2975	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2976	long cons = ctxt->nbChars;
				2977
				2978	GROW;
				2979	attname = htmlParseAttribute(ctxt, &attvalue);
				2980	if (attname != NULL) {
				2981
				2982	/*
				2983	* Well formedness requires at most one declaration of an attribute
				2984	*/
				2985	for (i = 0; i < nbatts;i += 2) {
				2986	if (xmlStrEqual(atts[i], attname)) {
				2987	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2988	ctxt->sax->error(ctxt->userData,
				2989	"Attribute %s redefined\n",
				2990	attname);
				2991	ctxt->wellFormed = 0;
				2992	xmlFree(attname);
				2993	if (attvalue != NULL)
				2994	xmlFree(attvalue);
				2995	goto failed;
				2996	}
				2997	}
				2998
				2999	/*
				3000	* Add the pair to atts
				3001	*/
				3002	if (atts == NULL) {
				3003	maxatts = 10;
				3004	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				3005	if (atts == NULL) {
				3006	xmlGenericError(xmlGenericErrorContext,
				3007	"malloc of %ld byte failed\n",
				3008	maxatts * (long)sizeof(xmlChar *));
				3009	if (name != NULL) xmlFree(name);
				3010	return;
				3011	}
				3012	} else if (nbatts + 4 > maxatts) {
				3013	maxatts *= 2;
				3014	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3015	maxatts * sizeof(xmlChar *));
				3016	if (atts == NULL) {
				3017	xmlGenericError(xmlGenericErrorContext,
				3018	"realloc of %ld byte failed\n",
				3019	maxatts * (long)sizeof(xmlChar *));
				3020	if (name != NULL) xmlFree(name);
				3021	return;
				3022	}
				3023	}
				3024	atts[nbatts++] = attname;
				3025	atts[nbatts++] = attvalue;
				3026	atts[nbatts] = NULL;
				3027	atts[nbatts + 1] = NULL;
				3028	}
				3029	else {
				3030	/* Dump the bogus attribute string up to the next blank or
				3031	* the end of the tag. */
				3032	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3033	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3034	NEXT;
				3035	}
				3036
				3037	failed:
				3038	SKIP_BLANKS;
				3039	if (cons == ctxt->nbChars) {
				3040	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3041	ctxt->sax->error(ctxt->userData,
				3042	"htmlParseStartTag: problem parsing attributes\n");
				3043	ctxt->wellFormed = 0;
				3044	break;
				3045	}
				3046	}
				3047
				3048	/*
				3049	* Handle specific association to the META tag
				3050	*/
				3051	if (meta)
				3052	htmlCheckMeta(ctxt, atts);
				3053
				3054	/*
				3055	* SAX: Start of Element !
				3056	*/
				3057	htmlnamePush(ctxt, xmlStrdup(name));
				3058	#ifdef DEBUG
				3059	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3060	#endif
				3061	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3062	ctxt->sax->startElement(ctxt->userData, name, atts);
				3063
				3064	if (atts != NULL) {
				3065	for (i = 0;i < nbatts;i++) {
				3066	if (atts[i] != NULL)
				3067	xmlFree((xmlChar *) atts[i]);
				3068	}
				3069	xmlFree((void *) atts);
				3070	}
				3071	if (name != NULL) xmlFree(name);
				3072	}
				3073
				3074	/**
				3075	* htmlParseEndTag:
				3076	* @ctxt: an HTML parser context
				3077	*
				3078	* parse an end of tag
				3079	*
				3080	* [42] ETag ::= '</' Name S? '>'
				3081	*
				3082	* With namespace
				3083	*
				3084	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3085	*
				3086	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3087	*/
				3088
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3089	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3090	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3091	xmlChar *name;
				3092	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3093	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3094
				3095	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3096	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3097	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3098	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3099	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3100	}
				3101	SKIP(2);
				3102
				3103	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3104	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3105
				3106	/*
				3107	* We should definitely be at the ending "S? '>'" part
				3108	*/
				3109	SKIP_BLANKS;
				3110	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3111	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3112	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3113	ctxt->wellFormed = 0;
				3114	} else
				3115	NEXT;
				3116
				3117	/*
				3118	* If the name read is not one of the element in the parsing stack
				3119	* then return, it's just an error.
				3120	*/
				3121	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3122	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3123	}
				3124	if (i < 0) {
				3125	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3126	ctxt->sax->error(ctxt->userData,
				3127	"Unexpected end tag : %s\n", name);
				3128	xmlFree(name);
				3129	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3130	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3131	}
				3132
				3133
				3134	/*
				3135	* Check for auto-closure of HTML elements.
				3136	*/
				3137
				3138	htmlAutoCloseOnClose(ctxt, name);
				3139
				3140	/*
				3141	* Well formedness constraints, opening and closing must match.
				3142	* With the exception that the autoclose may have popped stuff out
				3143	* of the stack.
				3144	*/
				3145	if (!xmlStrEqual(name, ctxt->name)) {
				3146	#ifdef DEBUG
				3147	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3148	#endif
				3149	if ((ctxt->name != NULL) &&
				3150	(!xmlStrEqual(ctxt->name, name))) {
				3151	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3152	ctxt->sax->error(ctxt->userData,
				3153	"Opening and ending tag mismatch: %s and %s\n",
				3154	name, ctxt->name);
				3155	ctxt->wellFormed = 0;
				3156	}
				3157	}
				3158
				3159	/*
				3160	* SAX: End of Tag
				3161	*/
				3162	oldname = ctxt->name;
				3163	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3164	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3165	ctxt->sax->endElement(ctxt->userData, name);
				3166	oldname = htmlnamePop(ctxt);
				3167	if (oldname != NULL) {
				3168	#ifdef DEBUG
				3169	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3170	#endif
				3171	xmlFree(oldname);
				3172	#ifdef DEBUG
				3173	} else {
				3174	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3175	#endif
				3176	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3177	ret = 1;
				3178	} else {
				3179	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3180	}
				3181
				3182	if (name != NULL)
				3183	xmlFree(name);
				3184
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3185	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3186	}
				3187
				3188
				3189	/**
				3190	* htmlParseReference:
				3191	* @ctxt: an HTML parser context
				3192	*
				3193	* parse and handle entity references in content,
				3194	* this will end-up in a call to character() since this is either a
				3195	* CharRef, or a predefined entity.
				3196	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3197	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3198	htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3199	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3200	xmlChar out[6];
				3201	xmlChar *name;
				3202	if (CUR != '&') return;
				3203
				3204	if (NXT(1) == '#') {
				3205	unsigned int c;
				3206	int bits, i = 0;
				3207
				3208	c = htmlParseCharRef(ctxt);
				3209	if (c == 0)
				3210	return;
				3211
				3212	if (c < 0x80) { out[i++]= c; bits= -6; }
				3213	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3214	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3215	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3216
				3217	for ( ; bits >= 0; bits-= 6) {
				3218	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3219	}
				3220	out[i] = 0;
				3221
				3222	htmlCheckParagraph(ctxt);
				3223	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3224	ctxt->sax->characters(ctxt->userData, out, i);
				3225	} else {
				3226	ent = htmlParseEntityRef(ctxt, &name);
				3227	if (name == NULL) {
				3228	htmlCheckParagraph(ctxt);
				3229	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3230	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3231	return;
				3232	}
				3233	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3234	htmlCheckParagraph(ctxt);
				3235	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3236	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3237	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3238	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3239	}
				3240	} else {
				3241	unsigned int c;
				3242	int bits, i = 0;
				3243
				3244	c = ent->value;
				3245	if (c < 0x80)
				3246	{ out[i++]= c; bits= -6; }
				3247	else if (c < 0x800)
				3248	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3249	else if (c < 0x10000)
				3250	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3251	else
				3252	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3253
				3254	for ( ; bits >= 0; bits-= 6) {
				3255	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3256	}
				3257	out[i] = 0;
				3258
				3259	htmlCheckParagraph(ctxt);
				3260	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3261	ctxt->sax->characters(ctxt->userData, out, i);
				3262	}
				3263	xmlFree(name);
				3264	}
				3265	}
				3266
				3267	/**
				3268	* htmlParseContent:
				3269	* @ctxt: an HTML parser context
				3270	* @name: the node name
				3271	*
				3272	* Parse a content: comment, sub-element, reference or text.
				3273	*
				3274	*/
				3275
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3276	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3277	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3278	xmlChar *currentNode;
				3279	int depth;
				3280
				3281	currentNode = xmlStrdup(ctxt->name);
				3282	depth = ctxt->nameNr;
				3283	while (1) {
				3284	long cons = ctxt->nbChars;
				3285
				3286	GROW;
				3287	/*
				3288	* Our tag or one of it's parent or children is ending.
				3289	*/
				3290	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3291	if (htmlParseEndTag(ctxt) &&
				3292	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3293	if (currentNode != NULL)
				3294	xmlFree(currentNode);
				3295	return;
				3296	}
				3297	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3298	}
				3299
				3300	/*
				3301	* Has this node been popped out during parsing of
				3302	* the next element
				3303	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3304	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3305	(!xmlStrEqual(currentNode, ctxt->name)))
				3306	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3307	if (currentNode != NULL) xmlFree(currentNode);
				3308	return;
				3309	}
				3310
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3311	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3312	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3313	/*
				3314	* Handle SCRIPT/STYLE separately
				3315	*/
				3316	htmlParseScript(ctxt);
				3317	} else {
				3318	/*
				3319	* Sometimes DOCTYPE arrives in the middle of the document
				3320	*/
				3321	if ((CUR == '<') && (NXT(1) == '!') &&
				3322	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3323	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3324	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3325	(UPP(8) == 'E')) {
				3326	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3327	ctxt->sax->error(ctxt->userData,
				3328	"Misplaced DOCTYPE declaration\n");
				3329	ctxt->wellFormed = 0;
				3330	htmlParseDocTypeDecl(ctxt);
				3331	}
				3332
				3333	/*
				3334	* First case : a comment
				3335	*/
				3336	if ((CUR == '<') && (NXT(1) == '!') &&
				3337	(NXT(2) == '-') && (NXT(3) == '-')) {
				3338	htmlParseComment(ctxt);
				3339	}
				3340
				3341	/*
				3342	* Second case : a sub-element.
				3343	*/
				3344	else if (CUR == '<') {
				3345	htmlParseElement(ctxt);
				3346	}
				3347
				3348	/*
				3349	* Third case : a reference. If if has not been resolved,
				3350	* parsing returns it's Name, create the node
				3351	*/
				3352	else if (CUR == '&') {
				3353	htmlParseReference(ctxt);
				3354	}
				3355
				3356	/*
				3357	* Fourth : end of the resource
				3358	*/
				3359	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3360	htmlAutoCloseOnEnd(ctxt);
				3361	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3362	}
				3363
				3364	/*
				3365	* Last case, text. Note that References are handled directly.
				3366	*/
				3367	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3368	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3369	}
				3370
				3371	if (cons == ctxt->nbChars) {
				3372	if (ctxt->node != NULL) {
				3373	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3374	ctxt->sax->error(ctxt->userData,
				3375	"detected an error in element content\n");
				3376	ctxt->wellFormed = 0;
				3377	}
				3378	break;
				3379	}
				3380	}
				3381	GROW;
				3382	}
				3383	if (currentNode != NULL) xmlFree(currentNode);
				3384	}
				3385
				3386	/**
				3387	* htmlParseElement:
				3388	* @ctxt: an HTML parser context
				3389	*
				3390	* parse an HTML element, this is highly recursive
				3391	*
				3392	* [39] element ::= EmptyElemTag \| STag content ETag
				3393	*
				3394	* [41] Attribute ::= Name Eq AttValue
				3395	*/
				3396
				3397	void
				3398	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3399	xmlChar *name;
				3400	xmlChar *currentNode = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3401	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3402	htmlParserNodeInfo node_info;
				3403	xmlChar *oldname;
				3404	int depth = ctxt->nameNr;
Daniel Veillard	3fbe8e3	2001-10-06 13:30:33 +0000	[diff] [blame]	3405	const xmlChar *oldptr;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3406
				3407	/* Capture start position */
				3408	if (ctxt->record_info) {
				3409	node_info.begin_pos = ctxt->input->consumed +
				3410	(CUR_PTR - ctxt->input->base);
				3411	node_info.begin_line = ctxt->input->line;
				3412	}
				3413
				3414	oldname = xmlStrdup(ctxt->name);
				3415	htmlParseStartTag(ctxt);
				3416	name = ctxt->name;
				3417	#ifdef DEBUG
				3418	if (oldname == NULL)
				3419	xmlGenericError(xmlGenericErrorContext,
				3420	"Start of element %s\n", name);
				3421	else if (name == NULL)
				3422	xmlGenericError(xmlGenericErrorContext,
				3423	"Start of element failed, was %s\n", oldname);
				3424	else
				3425	xmlGenericError(xmlGenericErrorContext,
				3426	"Start of element %s, was %s\n", name, oldname);
				3427	#endif
				3428	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3429	(name == NULL)) {
				3430	if (CUR == '>')
				3431	NEXT;
				3432	if (oldname != NULL)
				3433	xmlFree(oldname);
				3434	return;
				3435	}
				3436	if (oldname != NULL)
				3437	xmlFree(oldname);
				3438
				3439	/*
				3440	* Lookup the info for that element.
				3441	*/
				3442	info = htmlTagLookup(name);
				3443	if (info == NULL) {
				3444	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3445	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3446	name);
				3447	ctxt->wellFormed = 0;
				3448	} else if (info->depr) {
				3449	/***************************
				3450	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3451	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3452	name);
				3453	***************************/
				3454	}
				3455
				3456	/*
				3457	* Check for an Empty Element labelled the XML/SGML way
				3458	*/
				3459	if ((CUR == '/') && (NXT(1) == '>')) {
				3460	SKIP(2);
				3461	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3462	ctxt->sax->endElement(ctxt->userData, name);
				3463	oldname = htmlnamePop(ctxt);
				3464	#ifdef DEBUG
				3465	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3466	#endif
				3467	if (oldname != NULL)
				3468	xmlFree(oldname);
				3469	return;
				3470	}
				3471
				3472	if (CUR == '>') {
				3473	NEXT;
				3474	} else {
				3475	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3476	ctxt->sax->error(ctxt->userData,
				3477	"Couldn't find end of Start Tag %s\n",
				3478	name);
				3479	ctxt->wellFormed = 0;
				3480
				3481	/*
				3482	* end of parsing of this node.
				3483	*/
				3484	if (xmlStrEqual(name, ctxt->name)) {
				3485	nodePop(ctxt);
				3486	oldname = htmlnamePop(ctxt);
				3487	#ifdef DEBUG
				3488	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3489	#endif
				3490	if (oldname != NULL)
				3491	xmlFree(oldname);
				3492	}
				3493
				3494	/*
				3495	* Capture end position and add node
				3496	*/
				3497	if ( currentNode != NULL && ctxt->record_info ) {
				3498	node_info.end_pos = ctxt->input->consumed +
				3499	(CUR_PTR - ctxt->input->base);
				3500	node_info.end_line = ctxt->input->line;
				3501	node_info.node = ctxt->node;
				3502	xmlParserAddNodeInfo(ctxt, &node_info);
				3503	}
				3504	return;
				3505	}
				3506
				3507	/*
				3508	* Check for an Empty Element from DTD definition
				3509	*/
				3510	if ((info != NULL) && (info->empty)) {
				3511	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3512	ctxt->sax->endElement(ctxt->userData, name);
				3513	oldname = htmlnamePop(ctxt);
				3514	#ifdef DEBUG
				3515	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3516	#endif
				3517	if (oldname != NULL)
				3518	xmlFree(oldname);
				3519	return;
				3520	}
				3521
				3522	/*
				3523	* Parse the content of the element:
				3524	*/
				3525	currentNode = xmlStrdup(ctxt->name);
				3526	depth = ctxt->nameNr;
				3527	while (IS_CHAR(CUR)) {
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3528	oldptr = ctxt->input->cur;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3529	htmlParseContent(ctxt);
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3530	if (oldptr==ctxt->input->cur) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3531	if (ctxt->nameNr < depth) break;
				3532	}
				3533
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3534	/*
				3535	* Capture end position and add node
				3536	*/
				3537	if ( currentNode != NULL && ctxt->record_info ) {
				3538	node_info.end_pos = ctxt->input->consumed +
				3539	(CUR_PTR - ctxt->input->base);
				3540	node_info.end_line = ctxt->input->line;
				3541	node_info.node = ctxt->node;
				3542	xmlParserAddNodeInfo(ctxt, &node_info);
				3543	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3544	if (!IS_CHAR(CUR)) {
				3545	htmlAutoCloseOnEnd(ctxt);
				3546	}
				3547
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3548	if (currentNode != NULL)
				3549	xmlFree(currentNode);
				3550	}
				3551
				3552	/**
				3553	* htmlParseDocument :
				3554	* @ctxt: an HTML parser context
				3555	*
				3556	* parse an HTML document (and build a tree if using the standard SAX
				3557	* interface).
				3558	*
				3559	* Returns 0, -1 in case of error. the parser context is augmented
				3560	* as a result of the parsing.
				3561	*/
				3562
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3563	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3564	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3565	xmlDtdPtr dtd;
				3566
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	3567	xmlInitParser();
				3568
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3569	htmlDefaultSAXHandlerInit();
				3570	ctxt->html = 1;
				3571
				3572	GROW;
				3573	/*
				3574	* SAX: beginning of the document processing.
				3575	*/
				3576	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3577	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3578
				3579	/*
				3580	* Wipe out everything which is before the first '<'
				3581	*/
				3582	SKIP_BLANKS;
				3583	if (CUR == 0) {
				3584	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3585	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3586	ctxt->wellFormed = 0;
				3587	}
				3588
				3589	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3590	ctxt->sax->startDocument(ctxt->userData);
				3591
				3592
				3593	/*
				3594	* Parse possible comments before any content
				3595	*/
				3596	while ((CUR == '<') && (NXT(1) == '!') &&
				3597	(NXT(2) == '-') && (NXT(3) == '-')) {
				3598	htmlParseComment(ctxt);
				3599	SKIP_BLANKS;
				3600	}
				3601
				3602
				3603	/*
				3604	* Then possibly doc type declaration(s) and more Misc
				3605	* (doctypedecl Misc*)?
				3606	*/
				3607	if ((CUR == '<') && (NXT(1) == '!') &&
				3608	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3609	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3610	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3611	(UPP(8) == 'E')) {
				3612	htmlParseDocTypeDecl(ctxt);
				3613	}
				3614	SKIP_BLANKS;
				3615
				3616	/*
				3617	* Parse possible comments before any content
				3618	*/
				3619	while ((CUR == '<') && (NXT(1) == '!') &&
				3620	(NXT(2) == '-') && (NXT(3) == '-')) {
				3621	htmlParseComment(ctxt);
				3622	SKIP_BLANKS;
				3623	}
				3624
				3625	/*
				3626	* Time to start parsing the tree itself
				3627	*/
				3628	htmlParseContent(ctxt);
				3629
				3630	/*
				3631	* autoclose
				3632	*/
				3633	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3634	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3635
				3636
				3637	/*
				3638	* SAX: end of the document processing.
				3639	*/
				3640	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3641	ctxt->sax->endDocument(ctxt->userData);
				3642
				3643	if (ctxt->myDoc != NULL) {
				3644	dtd = xmlGetIntSubset(ctxt->myDoc);
				3645	if (dtd == NULL)
				3646	ctxt->myDoc->intSubset =
				3647	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3648	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3649	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3650	}
				3651	if (! ctxt->wellFormed) return(-1);
				3652	return(0);
				3653	}
				3654
				3655
				3656	/************************************************************************
				3657	* *
				3658	* Parser contexts handling *
				3659	* *
				3660	************************************************************************/
				3661
				3662	/**
				3663	* xmlInitParserCtxt:
				3664	* @ctxt: an HTML parser context
				3665	*
				3666	* Initialize a parser context
				3667	*/
				3668
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3669	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3670	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3671	{
				3672	htmlSAXHandler *sax;
				3673
				3674	if (ctxt == NULL) return;
				3675	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3676
				3677	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3678	if (sax == NULL) {
				3679	xmlGenericError(xmlGenericErrorContext,
				3680	"htmlInitParserCtxt: out of memory\n");
				3681	}
				3682	else
				3683	memset(sax, 0, sizeof(htmlSAXHandler));
				3684
				3685	/* Allocate the Input stack */
				3686	ctxt->inputTab = (htmlParserInputPtr *)
				3687	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3688	if (ctxt->inputTab == NULL) {
				3689	xmlGenericError(xmlGenericErrorContext,
				3690	"htmlInitParserCtxt: out of memory\n");
				3691	ctxt->inputNr = 0;
				3692	ctxt->inputMax = 0;
				3693	ctxt->input = NULL;
				3694	return;
				3695	}
				3696	ctxt->inputNr = 0;
				3697	ctxt->inputMax = 5;
				3698	ctxt->input = NULL;
				3699	ctxt->version = NULL;
				3700	ctxt->encoding = NULL;
				3701	ctxt->standalone = -1;
				3702	ctxt->instate = XML_PARSER_START;
				3703
				3704	/* Allocate the Node stack */
				3705	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3706	if (ctxt->nodeTab == NULL) {
				3707	xmlGenericError(xmlGenericErrorContext,
				3708	"htmlInitParserCtxt: out of memory\n");
				3709	ctxt->nodeNr = 0;
				3710	ctxt->nodeMax = 0;
				3711	ctxt->node = NULL;
				3712	ctxt->inputNr = 0;
				3713	ctxt->inputMax = 0;
				3714	ctxt->input = NULL;
				3715	return;
				3716	}
				3717	ctxt->nodeNr = 0;
				3718	ctxt->nodeMax = 10;
				3719	ctxt->node = NULL;
				3720
				3721	/* Allocate the Name stack */
				3722	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3723	if (ctxt->nameTab == NULL) {
				3724	xmlGenericError(xmlGenericErrorContext,
				3725	"htmlInitParserCtxt: out of memory\n");
				3726	ctxt->nameNr = 0;
				3727	ctxt->nameMax = 10;
				3728	ctxt->name = NULL;
				3729	ctxt->nodeNr = 0;
				3730	ctxt->nodeMax = 0;
				3731	ctxt->node = NULL;
				3732	ctxt->inputNr = 0;
				3733	ctxt->inputMax = 0;
				3734	ctxt->input = NULL;
				3735	return;
				3736	}
				3737	ctxt->nameNr = 0;
				3738	ctxt->nameMax = 10;
				3739	ctxt->name = NULL;
				3740
				3741	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3742	else {
				3743	ctxt->sax = sax;
				3744	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3745	}
				3746	ctxt->userData = ctxt;
				3747	ctxt->myDoc = NULL;
				3748	ctxt->wellFormed = 1;
				3749	ctxt->replaceEntities = 0;
				3750	ctxt->html = 1;
				3751	ctxt->record_info = 0;
				3752	ctxt->validate = 0;
				3753	ctxt->nbChars = 0;
				3754	ctxt->checkIndex = 0;
Daniel Veillard	dc2cee2	2001-08-22 16:30:37 +0000	[diff] [blame]	3755	ctxt->catalogs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3756	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3757	}
				3758
				3759	/**
				3760	* htmlFreeParserCtxt:
				3761	* @ctxt: an HTML parser context
				3762	*
				3763	* Free all the memory used by a parser context. However the parsed
				3764	* document in ctxt->myDoc is not freed.
				3765	*/
				3766
				3767	void
				3768	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3769	{
				3770	xmlFreeParserCtxt(ctxt);
				3771	}
				3772
				3773	/**
				3774	* htmlCreateDocParserCtxt :
				3775	* @cur: a pointer to an array of xmlChar
				3776	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3777	*
				3778	* Create a parser context for an HTML document.
				3779	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3780	* TODO: check the need to add encoding handling there
				3781	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3782	* Returns the new parser context or NULL
				3783	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3784	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3785	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3786	htmlParserCtxtPtr ctxt;
				3787	htmlParserInputPtr input;
				3788	/* htmlCharEncoding enc; */
				3789
				3790	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3791	if (ctxt == NULL) {
				3792	perror("malloc");
				3793	return(NULL);
				3794	}
				3795	htmlInitParserCtxt(ctxt);
				3796	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3797	if (input == NULL) {
				3798	perror("malloc");
				3799	xmlFree(ctxt);
				3800	return(NULL);
				3801	}
				3802	memset(input, 0, sizeof(htmlParserInput));
				3803
				3804	input->line = 1;
				3805	input->col = 1;
				3806	input->base = cur;
				3807	input->cur = cur;
				3808
				3809	inputPush(ctxt, input);
				3810	return(ctxt);
				3811	}
				3812
				3813	/************************************************************************
				3814	* *
				3815	* Progressive parsing interfaces *
				3816	* *
				3817	************************************************************************/
				3818
				3819	/**
				3820	* htmlParseLookupSequence:
				3821	* @ctxt: an HTML parser context
				3822	* @first: the first char to lookup
				3823	* @next: the next char to lookup or zero
				3824	* @third: the next char to lookup or zero
				3825	*
				3826	* Try to find if a sequence (first, next, third) or just (first next) or
				3827	* (first) is available in the input stream.
				3828	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3829	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3830	* parser, do not use liberally.
				3831	* This is basically similar to xmlParseLookupSequence()
				3832	*
				3833	* Returns the index to the current parsing point if the full sequence
				3834	* is available, -1 otherwise.
				3835	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3836	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3837	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3838	xmlChar next, xmlChar third) {
				3839	int base, len;
				3840	htmlParserInputPtr in;
				3841	const xmlChar *buf;
				3842
				3843	in = ctxt->input;
				3844	if (in == NULL) return(-1);
				3845	base = in->cur - in->base;
				3846	if (base < 0) return(-1);
				3847	if (ctxt->checkIndex > base)
				3848	base = ctxt->checkIndex;
				3849	if (in->buf == NULL) {
				3850	buf = in->base;
				3851	len = in->length;
				3852	} else {
				3853	buf = in->buf->buffer->content;
				3854	len = in->buf->buffer->use;
				3855	}
				3856	/* take into account the sequence length */
				3857	if (third) len -= 2;
				3858	else if (next) len --;
				3859	for (;base < len;base++) {
				3860	if (buf[base] == first) {
				3861	if (third != 0) {
				3862	if ((buf[base + 1] != next) \|\|
				3863	(buf[base + 2] != third)) continue;
				3864	} else if (next != 0) {
				3865	if (buf[base + 1] != next) continue;
				3866	}
				3867	ctxt->checkIndex = 0;
				3868	#ifdef DEBUG_PUSH
				3869	if (next == 0)
				3870	xmlGenericError(xmlGenericErrorContext,
				3871	"HPP: lookup '%c' found at %d\n",
				3872	first, base);
				3873	else if (third == 0)
				3874	xmlGenericError(xmlGenericErrorContext,
				3875	"HPP: lookup '%c%c' found at %d\n",
				3876	first, next, base);
				3877	else
				3878	xmlGenericError(xmlGenericErrorContext,
				3879	"HPP: lookup '%c%c%c' found at %d\n",
				3880	first, next, third, base);
				3881	#endif
				3882	return(base - (in->cur - in->base));
				3883	}
				3884	}
				3885	ctxt->checkIndex = base;
				3886	#ifdef DEBUG_PUSH
				3887	if (next == 0)
				3888	xmlGenericError(xmlGenericErrorContext,
				3889	"HPP: lookup '%c' failed\n", first);
				3890	else if (third == 0)
				3891	xmlGenericError(xmlGenericErrorContext,
				3892	"HPP: lookup '%c%c' failed\n", first, next);
				3893	else
				3894	xmlGenericError(xmlGenericErrorContext,
				3895	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3896	#endif
				3897	return(-1);
				3898	}
				3899
				3900	/**
				3901	* htmlParseTryOrFinish:
				3902	* @ctxt: an HTML parser context
				3903	* @terminate: last chunk indicator
				3904	*
				3905	* Try to progress on parsing
				3906	*
				3907	* Returns zero if no parsing was possible
				3908	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3909	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3910	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3911	int ret = 0;
				3912	htmlParserInputPtr in;
				3913	int avail = 0;
				3914	xmlChar cur, next;
				3915
				3916	#ifdef DEBUG_PUSH
				3917	switch (ctxt->instate) {
				3918	case XML_PARSER_EOF:
				3919	xmlGenericError(xmlGenericErrorContext,
				3920	"HPP: try EOF\n"); break;
				3921	case XML_PARSER_START:
				3922	xmlGenericError(xmlGenericErrorContext,
				3923	"HPP: try START\n"); break;
				3924	case XML_PARSER_MISC:
				3925	xmlGenericError(xmlGenericErrorContext,
				3926	"HPP: try MISC\n");break;
				3927	case XML_PARSER_COMMENT:
				3928	xmlGenericError(xmlGenericErrorContext,
				3929	"HPP: try COMMENT\n");break;
				3930	case XML_PARSER_PROLOG:
				3931	xmlGenericError(xmlGenericErrorContext,
				3932	"HPP: try PROLOG\n");break;
				3933	case XML_PARSER_START_TAG:
				3934	xmlGenericError(xmlGenericErrorContext,
				3935	"HPP: try START_TAG\n");break;
				3936	case XML_PARSER_CONTENT:
				3937	xmlGenericError(xmlGenericErrorContext,
				3938	"HPP: try CONTENT\n");break;
				3939	case XML_PARSER_CDATA_SECTION:
				3940	xmlGenericError(xmlGenericErrorContext,
				3941	"HPP: try CDATA_SECTION\n");break;
				3942	case XML_PARSER_END_TAG:
				3943	xmlGenericError(xmlGenericErrorContext,
				3944	"HPP: try END_TAG\n");break;
				3945	case XML_PARSER_ENTITY_DECL:
				3946	xmlGenericError(xmlGenericErrorContext,
				3947	"HPP: try ENTITY_DECL\n");break;
				3948	case XML_PARSER_ENTITY_VALUE:
				3949	xmlGenericError(xmlGenericErrorContext,
				3950	"HPP: try ENTITY_VALUE\n");break;
				3951	case XML_PARSER_ATTRIBUTE_VALUE:
				3952	xmlGenericError(xmlGenericErrorContext,
				3953	"HPP: try ATTRIBUTE_VALUE\n");break;
				3954	case XML_PARSER_DTD:
				3955	xmlGenericError(xmlGenericErrorContext,
				3956	"HPP: try DTD\n");break;
				3957	case XML_PARSER_EPILOG:
				3958	xmlGenericError(xmlGenericErrorContext,
				3959	"HPP: try EPILOG\n");break;
				3960	case XML_PARSER_PI:
				3961	xmlGenericError(xmlGenericErrorContext,
				3962	"HPP: try PI\n");break;
				3963	case XML_PARSER_SYSTEM_LITERAL:
				3964	xmlGenericError(xmlGenericErrorContext,
				3965	"HPP: try SYSTEM_LITERAL\n");break;
				3966	}
				3967	#endif
				3968
				3969	while (1) {
				3970
				3971	in = ctxt->input;
				3972	if (in == NULL) break;
				3973	if (in->buf == NULL)
				3974	avail = in->length - (in->cur - in->base);
				3975	else
				3976	avail = in->buf->buffer->use - (in->cur - in->base);
				3977	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3978	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3979	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3980	/*
				3981	* SAX: end of the document processing.
				3982	*/
				3983	ctxt->instate = XML_PARSER_EOF;
				3984	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3985	ctxt->sax->endDocument(ctxt->userData);
				3986	}
				3987	}
				3988	if (avail < 1)
				3989	goto done;
				3990	switch (ctxt->instate) {
				3991	case XML_PARSER_EOF:
				3992	/*
				3993	* Document parsing is done !
				3994	*/
				3995	goto done;
				3996	case XML_PARSER_START:
				3997	/*
				3998	* Very first chars read from the document flow.
				3999	*/
				4000	cur = in->cur[0];
				4001	if (IS_BLANK(cur)) {
				4002	SKIP_BLANKS;
				4003	if (in->buf == NULL)
				4004	avail = in->length - (in->cur - in->base);
				4005	else
				4006	avail = in->buf->buffer->use - (in->cur - in->base);
				4007	}
				4008	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4009	ctxt->sax->setDocumentLocator(ctxt->userData,
				4010	&xmlDefaultSAXLocator);
				4011	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4012	(!ctxt->disableSAX))
				4013	ctxt->sax->startDocument(ctxt->userData);
				4014
				4015	cur = in->cur[0];
				4016	next = in->cur[1];
				4017	if ((cur == '<') && (next == '!') &&
				4018	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4019	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4020	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4021	(UPP(8) == 'E')) {
				4022	if ((!terminate) &&
				4023	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4024	goto done;
				4025	#ifdef DEBUG_PUSH
				4026	xmlGenericError(xmlGenericErrorContext,
				4027	"HPP: Parsing internal subset\n");
				4028	#endif
				4029	htmlParseDocTypeDecl(ctxt);
				4030	ctxt->instate = XML_PARSER_PROLOG;
				4031	#ifdef DEBUG_PUSH
				4032	xmlGenericError(xmlGenericErrorContext,
				4033	"HPP: entering PROLOG\n");
				4034	#endif
				4035	} else {
				4036	ctxt->instate = XML_PARSER_MISC;
				4037	}
				4038	#ifdef DEBUG_PUSH
				4039	xmlGenericError(xmlGenericErrorContext,
				4040	"HPP: entering MISC\n");
				4041	#endif
				4042	break;
				4043	case XML_PARSER_MISC:
				4044	SKIP_BLANKS;
				4045	if (in->buf == NULL)
				4046	avail = in->length - (in->cur - in->base);
				4047	else
				4048	avail = in->buf->buffer->use - (in->cur - in->base);
				4049	if (avail < 2)
				4050	goto done;
				4051	cur = in->cur[0];
				4052	next = in->cur[1];
				4053	if ((cur == '<') && (next == '!') &&
				4054	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4055	if ((!terminate) &&
				4056	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4057	goto done;
				4058	#ifdef DEBUG_PUSH
				4059	xmlGenericError(xmlGenericErrorContext,
				4060	"HPP: Parsing Comment\n");
				4061	#endif
				4062	htmlParseComment(ctxt);
				4063	ctxt->instate = XML_PARSER_MISC;
				4064	} else if ((cur == '<') && (next == '!') &&
				4065	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4066	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4067	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4068	(UPP(8) == 'E')) {
				4069	if ((!terminate) &&
				4070	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4071	goto done;
				4072	#ifdef DEBUG_PUSH
				4073	xmlGenericError(xmlGenericErrorContext,
				4074	"HPP: Parsing internal subset\n");
				4075	#endif
				4076	htmlParseDocTypeDecl(ctxt);
				4077	ctxt->instate = XML_PARSER_PROLOG;
				4078	#ifdef DEBUG_PUSH
				4079	xmlGenericError(xmlGenericErrorContext,
				4080	"HPP: entering PROLOG\n");
				4081	#endif
				4082	} else if ((cur == '<') && (next == '!') &&
				4083	(avail < 9)) {
				4084	goto done;
				4085	} else {
				4086	ctxt->instate = XML_PARSER_START_TAG;
				4087	#ifdef DEBUG_PUSH
				4088	xmlGenericError(xmlGenericErrorContext,
				4089	"HPP: entering START_TAG\n");
				4090	#endif
				4091	}
				4092	break;
				4093	case XML_PARSER_PROLOG:
				4094	SKIP_BLANKS;
				4095	if (in->buf == NULL)
				4096	avail = in->length - (in->cur - in->base);
				4097	else
				4098	avail = in->buf->buffer->use - (in->cur - in->base);
				4099	if (avail < 2)
				4100	goto done;
				4101	cur = in->cur[0];
				4102	next = in->cur[1];
				4103	if ((cur == '<') && (next == '!') &&
				4104	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4105	if ((!terminate) &&
				4106	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4107	goto done;
				4108	#ifdef DEBUG_PUSH
				4109	xmlGenericError(xmlGenericErrorContext,
				4110	"HPP: Parsing Comment\n");
				4111	#endif
				4112	htmlParseComment(ctxt);
				4113	ctxt->instate = XML_PARSER_PROLOG;
				4114	} else if ((cur == '<') && (next == '!') &&
				4115	(avail < 4)) {
				4116	goto done;
				4117	} else {
				4118	ctxt->instate = XML_PARSER_START_TAG;
				4119	#ifdef DEBUG_PUSH
				4120	xmlGenericError(xmlGenericErrorContext,
				4121	"HPP: entering START_TAG\n");
				4122	#endif
				4123	}
				4124	break;
				4125	case XML_PARSER_EPILOG:
				4126	if (in->buf == NULL)
				4127	avail = in->length - (in->cur - in->base);
				4128	else
				4129	avail = in->buf->buffer->use - (in->cur - in->base);
				4130	if (avail < 1)
				4131	goto done;
				4132	cur = in->cur[0];
				4133	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4134	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4135	goto done;
				4136	}
				4137	if (avail < 2)
				4138	goto done;
				4139	next = in->cur[1];
				4140	if ((cur == '<') && (next == '!') &&
				4141	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4142	if ((!terminate) &&
				4143	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4144	goto done;
				4145	#ifdef DEBUG_PUSH
				4146	xmlGenericError(xmlGenericErrorContext,
				4147	"HPP: Parsing Comment\n");
				4148	#endif
				4149	htmlParseComment(ctxt);
				4150	ctxt->instate = XML_PARSER_EPILOG;
				4151	} else if ((cur == '<') && (next == '!') &&
				4152	(avail < 4)) {
				4153	goto done;
				4154	} else {
				4155	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4156	ctxt->wellFormed = 0;
				4157	ctxt->instate = XML_PARSER_EOF;
				4158	#ifdef DEBUG_PUSH
				4159	xmlGenericError(xmlGenericErrorContext,
				4160	"HPP: entering EOF\n");
				4161	#endif
				4162	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4163	ctxt->sax->endDocument(ctxt->userData);
				4164	goto done;
				4165	}
				4166	break;
				4167	case XML_PARSER_START_TAG: {
				4168	xmlChar name, oldname;
				4169	int depth = ctxt->nameNr;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	4170	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4171
				4172	if (avail < 2)
				4173	goto done;
				4174	cur = in->cur[0];
				4175	if (cur != '<') {
				4176	ctxt->instate = XML_PARSER_CONTENT;
				4177	#ifdef DEBUG_PUSH
				4178	xmlGenericError(xmlGenericErrorContext,
				4179	"HPP: entering CONTENT\n");
				4180	#endif
				4181	break;
				4182	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4183	if (in->cur[1] == '/') {
				4184	ctxt->instate = XML_PARSER_END_TAG;
				4185	ctxt->checkIndex = 0;
				4186	#ifdef DEBUG_PUSH
				4187	xmlGenericError(xmlGenericErrorContext,
				4188	"HPP: entering END_TAG\n");
				4189	#endif
				4190	break;
				4191	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4192	if ((!terminate) &&
				4193	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4194	goto done;
				4195
				4196	oldname = xmlStrdup(ctxt->name);
				4197	htmlParseStartTag(ctxt);
				4198	name = ctxt->name;
				4199	#ifdef DEBUG
				4200	if (oldname == NULL)
				4201	xmlGenericError(xmlGenericErrorContext,
				4202	"Start of element %s\n", name);
				4203	else if (name == NULL)
				4204	xmlGenericError(xmlGenericErrorContext,
				4205	"Start of element failed, was %s\n",
				4206	oldname);
				4207	else
				4208	xmlGenericError(xmlGenericErrorContext,
				4209	"Start of element %s, was %s\n",
				4210	name, oldname);
				4211	#endif
				4212	if (((depth == ctxt->nameNr) &&
				4213	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4214	(name == NULL)) {
				4215	if (CUR == '>')
				4216	NEXT;
				4217	if (oldname != NULL)
				4218	xmlFree(oldname);
				4219	break;
				4220	}
				4221	if (oldname != NULL)
				4222	xmlFree(oldname);
				4223
				4224	/*
				4225	* Lookup the info for that element.
				4226	*/
				4227	info = htmlTagLookup(name);
				4228	if (info == NULL) {
				4229	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4230	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4231	name);
				4232	ctxt->wellFormed = 0;
				4233	} else if (info->depr) {
				4234	/***************************
				4235	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4236	ctxt->sax->warning(ctxt->userData,
				4237	"Tag %s is deprecated\n",
				4238	name);
				4239	***************************/
				4240	}
				4241
				4242	/*
				4243	* Check for an Empty Element labelled the XML/SGML way
				4244	*/
				4245	if ((CUR == '/') && (NXT(1) == '>')) {
				4246	SKIP(2);
				4247	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4248	ctxt->sax->endElement(ctxt->userData, name);
				4249	oldname = htmlnamePop(ctxt);
				4250	#ifdef DEBUG
				4251	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4252	oldname);
				4253	#endif
				4254	if (oldname != NULL)
				4255	xmlFree(oldname);
				4256	ctxt->instate = XML_PARSER_CONTENT;
				4257	#ifdef DEBUG_PUSH
				4258	xmlGenericError(xmlGenericErrorContext,
				4259	"HPP: entering CONTENT\n");
				4260	#endif
				4261	break;
				4262	}
				4263
				4264	if (CUR == '>') {
				4265	NEXT;
				4266	} else {
				4267	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4268	ctxt->sax->error(ctxt->userData,
				4269	"Couldn't find end of Start Tag %s\n",
				4270	name);
				4271	ctxt->wellFormed = 0;
				4272
				4273	/*
				4274	* end of parsing of this node.
				4275	*/
				4276	if (xmlStrEqual(name, ctxt->name)) {
				4277	nodePop(ctxt);
				4278	oldname = htmlnamePop(ctxt);
				4279	#ifdef DEBUG
				4280	xmlGenericError(xmlGenericErrorContext,
				4281	"End of start tag problem: popping out %s\n", oldname);
				4282	#endif
				4283	if (oldname != NULL)
				4284	xmlFree(oldname);
				4285	}
				4286
				4287	ctxt->instate = XML_PARSER_CONTENT;
				4288	#ifdef DEBUG_PUSH
				4289	xmlGenericError(xmlGenericErrorContext,
				4290	"HPP: entering CONTENT\n");
				4291	#endif
				4292	break;
				4293	}
				4294
				4295	/*
				4296	* Check for an Empty Element from DTD definition
				4297	*/
				4298	if ((info != NULL) && (info->empty)) {
				4299	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4300	ctxt->sax->endElement(ctxt->userData, name);
				4301	oldname = htmlnamePop(ctxt);
				4302	#ifdef DEBUG
				4303	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4304	#endif
				4305	if (oldname != NULL)
				4306	xmlFree(oldname);
				4307	}
				4308	ctxt->instate = XML_PARSER_CONTENT;
				4309	#ifdef DEBUG_PUSH
				4310	xmlGenericError(xmlGenericErrorContext,
				4311	"HPP: entering CONTENT\n");
				4312	#endif
				4313	break;
				4314	}
				4315	case XML_PARSER_CONTENT: {
				4316	long cons;
				4317	/*
				4318	* Handle preparsed entities and charRef
				4319	*/
				4320	if (ctxt->token != 0) {
				4321	xmlChar chr[2] = { 0 , 0 } ;
				4322
				4323	chr[0] = (xmlChar) ctxt->token;
				4324	htmlCheckParagraph(ctxt);
				4325	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4326	ctxt->sax->characters(ctxt->userData, chr, 1);
				4327	ctxt->token = 0;
				4328	ctxt->checkIndex = 0;
				4329	}
				4330	if ((avail == 1) && (terminate)) {
				4331	cur = in->cur[0];
				4332	if ((cur != '<') && (cur != '&')) {
				4333	if (ctxt->sax != NULL) {
				4334	if (IS_BLANK(cur)) {
				4335	if (ctxt->sax->ignorableWhitespace != NULL)
				4336	ctxt->sax->ignorableWhitespace(
				4337	ctxt->userData, &cur, 1);
				4338	} else {
				4339	htmlCheckParagraph(ctxt);
				4340	if (ctxt->sax->characters != NULL)
				4341	ctxt->sax->characters(
				4342	ctxt->userData, &cur, 1);
				4343	}
				4344	}
				4345	ctxt->token = 0;
				4346	ctxt->checkIndex = 0;
				4347	NEXT;
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	4348	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4349	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4350	}
				4351	if (avail < 2)
				4352	goto done;
				4353	cur = in->cur[0];
				4354	next = in->cur[1];
				4355	cons = ctxt->nbChars;
				4356	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4357	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4358	/*
				4359	* Handle SCRIPT/STYLE separately
				4360	*/
				4361	if ((!terminate) &&
				4362	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4363	goto done;
				4364	htmlParseScript(ctxt);
				4365	if ((cur == '<') && (next == '/')) {
				4366	ctxt->instate = XML_PARSER_END_TAG;
				4367	ctxt->checkIndex = 0;
				4368	#ifdef DEBUG_PUSH
				4369	xmlGenericError(xmlGenericErrorContext,
				4370	"HPP: entering END_TAG\n");
				4371	#endif
				4372	break;
				4373	}
				4374	} else {
				4375	/*
				4376	* Sometimes DOCTYPE arrives in the middle of the document
				4377	*/
				4378	if ((cur == '<') && (next == '!') &&
				4379	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4380	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4381	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4382	(UPP(8) == 'E')) {
				4383	if ((!terminate) &&
				4384	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4385	goto done;
				4386	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4387	ctxt->sax->error(ctxt->userData,
				4388	"Misplaced DOCTYPE declaration\n");
				4389	ctxt->wellFormed = 0;
				4390	htmlParseDocTypeDecl(ctxt);
				4391	} else if ((cur == '<') && (next == '!') &&
				4392	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4393	if ((!terminate) &&
				4394	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4395	goto done;
				4396	#ifdef DEBUG_PUSH
				4397	xmlGenericError(xmlGenericErrorContext,
				4398	"HPP: Parsing Comment\n");
				4399	#endif
				4400	htmlParseComment(ctxt);
				4401	ctxt->instate = XML_PARSER_CONTENT;
				4402	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4403	goto done;
				4404	} else if ((cur == '<') && (next == '/')) {
				4405	ctxt->instate = XML_PARSER_END_TAG;
				4406	ctxt->checkIndex = 0;
				4407	#ifdef DEBUG_PUSH
				4408	xmlGenericError(xmlGenericErrorContext,
				4409	"HPP: entering END_TAG\n");
				4410	#endif
				4411	break;
				4412	} else if (cur == '<') {
				4413	ctxt->instate = XML_PARSER_START_TAG;
				4414	ctxt->checkIndex = 0;
				4415	#ifdef DEBUG_PUSH
				4416	xmlGenericError(xmlGenericErrorContext,
				4417	"HPP: entering START_TAG\n");
				4418	#endif
				4419	break;
				4420	} else if (cur == '&') {
				4421	if ((!terminate) &&
				4422	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4423	goto done;
				4424	#ifdef DEBUG_PUSH
				4425	xmlGenericError(xmlGenericErrorContext,
				4426	"HPP: Parsing Reference\n");
				4427	#endif
				4428	/* TODO: check generation of subtrees if noent !!! */
				4429	htmlParseReference(ctxt);
				4430	} else {
				4431	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4432	/*
				4433	* Goal of the following test is :
				4434	* - minimize calls to the SAX 'character' callback
				4435	* when they are mergeable
				4436	*/
				4437	if ((ctxt->inputNr == 1) &&
				4438	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4439	if ((!terminate) &&
				4440	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4441	goto done;
				4442	}
				4443	ctxt->checkIndex = 0;
				4444	#ifdef DEBUG_PUSH
				4445	xmlGenericError(xmlGenericErrorContext,
				4446	"HPP: Parsing char data\n");
				4447	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4448	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4449	}
				4450	}
				4451	if (cons == ctxt->nbChars) {
				4452	if (ctxt->node != NULL) {
				4453	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4454	ctxt->sax->error(ctxt->userData,
				4455	"detected an error in element content\n");
				4456	ctxt->wellFormed = 0;
				4457	}
				4458	NEXT;
				4459	break;
				4460	}
				4461
				4462	break;
				4463	}
				4464	case XML_PARSER_END_TAG:
				4465	if (avail < 2)
				4466	goto done;
				4467	if ((!terminate) &&
				4468	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4469	goto done;
				4470	htmlParseEndTag(ctxt);
				4471	if (ctxt->nameNr == 0) {
				4472	ctxt->instate = XML_PARSER_EPILOG;
				4473	} else {
				4474	ctxt->instate = XML_PARSER_CONTENT;
				4475	}
				4476	ctxt->checkIndex = 0;
				4477	#ifdef DEBUG_PUSH
				4478	xmlGenericError(xmlGenericErrorContext,
				4479	"HPP: entering CONTENT\n");
				4480	#endif
				4481	break;
				4482	case XML_PARSER_CDATA_SECTION:
				4483	xmlGenericError(xmlGenericErrorContext,
				4484	"HPP: internal error, state == CDATA\n");
				4485	ctxt->instate = XML_PARSER_CONTENT;
				4486	ctxt->checkIndex = 0;
				4487	#ifdef DEBUG_PUSH
				4488	xmlGenericError(xmlGenericErrorContext,
				4489	"HPP: entering CONTENT\n");
				4490	#endif
				4491	break;
				4492	case XML_PARSER_DTD:
				4493	xmlGenericError(xmlGenericErrorContext,
				4494	"HPP: internal error, state == DTD\n");
				4495	ctxt->instate = XML_PARSER_CONTENT;
				4496	ctxt->checkIndex = 0;
				4497	#ifdef DEBUG_PUSH
				4498	xmlGenericError(xmlGenericErrorContext,
				4499	"HPP: entering CONTENT\n");
				4500	#endif
				4501	break;
				4502	case XML_PARSER_COMMENT:
				4503	xmlGenericError(xmlGenericErrorContext,
				4504	"HPP: internal error, state == COMMENT\n");
				4505	ctxt->instate = XML_PARSER_CONTENT;
				4506	ctxt->checkIndex = 0;
				4507	#ifdef DEBUG_PUSH
				4508	xmlGenericError(xmlGenericErrorContext,
				4509	"HPP: entering CONTENT\n");
				4510	#endif
				4511	break;
				4512	case XML_PARSER_PI:
				4513	xmlGenericError(xmlGenericErrorContext,
				4514	"HPP: internal error, state == PI\n");
				4515	ctxt->instate = XML_PARSER_CONTENT;
				4516	ctxt->checkIndex = 0;
				4517	#ifdef DEBUG_PUSH
				4518	xmlGenericError(xmlGenericErrorContext,
				4519	"HPP: entering CONTENT\n");
				4520	#endif
				4521	break;
				4522	case XML_PARSER_ENTITY_DECL:
				4523	xmlGenericError(xmlGenericErrorContext,
				4524	"HPP: internal error, state == ENTITY_DECL\n");
				4525	ctxt->instate = XML_PARSER_CONTENT;
				4526	ctxt->checkIndex = 0;
				4527	#ifdef DEBUG_PUSH
				4528	xmlGenericError(xmlGenericErrorContext,
				4529	"HPP: entering CONTENT\n");
				4530	#endif
				4531	break;
				4532	case XML_PARSER_ENTITY_VALUE:
				4533	xmlGenericError(xmlGenericErrorContext,
				4534	"HPP: internal error, state == ENTITY_VALUE\n");
				4535	ctxt->instate = XML_PARSER_CONTENT;
				4536	ctxt->checkIndex = 0;
				4537	#ifdef DEBUG_PUSH
				4538	xmlGenericError(xmlGenericErrorContext,
				4539	"HPP: entering DTD\n");
				4540	#endif
				4541	break;
				4542	case XML_PARSER_ATTRIBUTE_VALUE:
				4543	xmlGenericError(xmlGenericErrorContext,
				4544	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4545	ctxt->instate = XML_PARSER_START_TAG;
				4546	ctxt->checkIndex = 0;
				4547	#ifdef DEBUG_PUSH
				4548	xmlGenericError(xmlGenericErrorContext,
				4549	"HPP: entering START_TAG\n");
				4550	#endif
				4551	break;
				4552	case XML_PARSER_SYSTEM_LITERAL:
				4553	xmlGenericError(xmlGenericErrorContext,
				4554	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4555	ctxt->instate = XML_PARSER_CONTENT;
				4556	ctxt->checkIndex = 0;
				4557	#ifdef DEBUG_PUSH
				4558	xmlGenericError(xmlGenericErrorContext,
				4559	"HPP: entering CONTENT\n");
				4560	#endif
				4561	break;
				4562	case XML_PARSER_IGNORE:
				4563	xmlGenericError(xmlGenericErrorContext,
				4564	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4565	ctxt->instate = XML_PARSER_CONTENT;
				4566	ctxt->checkIndex = 0;
				4567	#ifdef DEBUG_PUSH
				4568	xmlGenericError(xmlGenericErrorContext,
				4569	"HPP: entering CONTENT\n");
				4570	#endif
				4571	break;
				4572	}
				4573	}
				4574	done:
				4575	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4576	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4577	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4578	/*
				4579	* SAX: end of the document processing.
				4580	*/
				4581	ctxt->instate = XML_PARSER_EOF;
				4582	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4583	ctxt->sax->endDocument(ctxt->userData);
				4584	}
				4585	}
				4586	if ((ctxt->myDoc != NULL) &&
				4587	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4588	(ctxt->instate == XML_PARSER_EPILOG))) {
				4589	xmlDtdPtr dtd;
				4590	dtd = xmlGetIntSubset(ctxt->myDoc);
				4591	if (dtd == NULL)
				4592	ctxt->myDoc->intSubset =
				4593	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4594	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4595	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4596	}
				4597	#ifdef DEBUG_PUSH
				4598	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4599	#endif
				4600	return(ret);
				4601	}
				4602
				4603	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4604	* htmlParseChunk:
				4605	* @ctxt: an XML parser context
				4606	* @chunk: an char array
				4607	* @size: the size in byte of the chunk
				4608	* @terminate: last chunk indicator
				4609	*
				4610	* Parse a Chunk of memory
				4611	*
				4612	* Returns zero if no error, the xmlParserErrors otherwise.
				4613	*/
				4614	int
				4615	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4616	int terminate) {
				4617	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4618	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4619	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4620	int cur = ctxt->input->cur - ctxt->input->base;
				4621
				4622	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4623	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4624	ctxt->input->cur = ctxt->input->base + cur;
				4625	#ifdef DEBUG_PUSH
				4626	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4627	#endif
				4628
				4629	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4630	htmlParseTryOrFinish(ctxt, terminate);
				4631	} else if (ctxt->instate != XML_PARSER_EOF) {
				4632	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4633	htmlParseTryOrFinish(ctxt, terminate);
				4634	}
				4635	if (terminate) {
				4636	if ((ctxt->instate != XML_PARSER_EOF) &&
				4637	(ctxt->instate != XML_PARSER_EPILOG) &&
				4638	(ctxt->instate != XML_PARSER_MISC)) {
				4639	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4640	ctxt->wellFormed = 0;
				4641	}
				4642	if (ctxt->instate != XML_PARSER_EOF) {
				4643	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4644	ctxt->sax->endDocument(ctxt->userData);
				4645	}
				4646	ctxt->instate = XML_PARSER_EOF;
				4647	}
				4648	return((xmlParserErrors) ctxt->errNo);
				4649	}
				4650
				4651	/************************************************************************
				4652	* *
				4653	* User entry points *
				4654	* *
				4655	************************************************************************/
				4656
				4657	/**
				4658	* htmlCreatePushParserCtxt :
				4659	* @sax: a SAX handler
				4660	* @user_data: The user data returned on SAX callbacks
				4661	* @chunk: a pointer to an array of chars
				4662	* @size: number of chars in the array
				4663	* @filename: an optional file name or URI
				4664	* @enc: an optional encoding
				4665	*
				4666	* Create a parser context for using the HTML parser in push mode
				4667	* To allow content encoding detection, @size should be >= 4
				4668	* The value of @filename is used for fetching external entities
				4669	* and error/warning reports.
				4670	*
				4671	* Returns the new parser context or NULL
				4672	*/
				4673	htmlParserCtxtPtr
				4674	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4675	const char chunk, int size, const char filename,
				4676	xmlCharEncoding enc) {
				4677	htmlParserCtxtPtr ctxt;
				4678	htmlParserInputPtr inputStream;
				4679	xmlParserInputBufferPtr buf;
				4680
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4681	xmlInitParser();
				4682
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4683	buf = xmlAllocParserInputBuffer(enc);
				4684	if (buf == NULL) return(NULL);
				4685
				4686	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4687	if (ctxt == NULL) {
				4688	xmlFree(buf);
				4689	return(NULL);
				4690	}
				4691	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4692	htmlInitParserCtxt(ctxt);
				4693	if (sax != NULL) {
				4694	if (ctxt->sax != &htmlDefaultSAXHandler)
				4695	xmlFree(ctxt->sax);
				4696	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4697	if (ctxt->sax == NULL) {
				4698	xmlFree(buf);
				4699	xmlFree(ctxt);
				4700	return(NULL);
				4701	}
				4702	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4703	if (user_data != NULL)
				4704	ctxt->userData = user_data;
				4705	}
				4706	if (filename == NULL) {
				4707	ctxt->directory = NULL;
				4708	} else {
				4709	ctxt->directory = xmlParserGetDirectory(filename);
				4710	}
				4711
				4712	inputStream = htmlNewInputStream(ctxt);
				4713	if (inputStream == NULL) {
				4714	xmlFreeParserCtxt(ctxt);
				4715	return(NULL);
				4716	}
				4717
				4718	if (filename == NULL)
				4719	inputStream->filename = NULL;
				4720	else
				4721	inputStream->filename = xmlMemStrdup(filename);
				4722	inputStream->buf = buf;
				4723	inputStream->base = inputStream->buf->buffer->content;
				4724	inputStream->cur = inputStream->buf->buffer->content;
				4725
				4726	inputPush(ctxt, inputStream);
				4727
				4728	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4729	(ctxt->input->buf != NULL)) {
				4730	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4731	#ifdef DEBUG_PUSH
				4732	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4733	#endif
				4734	}
				4735
				4736	return(ctxt);
				4737	}
				4738
				4739	/**
				4740	* htmlSAXParseDoc :
				4741	* @cur: a pointer to an array of xmlChar
				4742	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4743	* @sax: the SAX handler block
				4744	* @userData: if using SAX, this pointer will be provided on callbacks.
				4745	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4746	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				4747	* to handle parse events. If sax is NULL, fallback to the default DOM
				4748	* behavior and return a tree.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4749	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4750	* Returns the resulting document tree unless SAX is NULL or the document is
				4751	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4752	*/
				4753
				4754	htmlDocPtr
				4755	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4756	htmlDocPtr ret;
				4757	htmlParserCtxtPtr ctxt;
				4758
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4759	xmlInitParser();
				4760
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4761	if (cur == NULL) return(NULL);
				4762
				4763
				4764	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4765	if (ctxt == NULL) return(NULL);
				4766	if (sax != NULL) {
				4767	ctxt->sax = sax;
				4768	ctxt->userData = userData;
				4769	}
				4770
				4771	htmlParseDocument(ctxt);
				4772	ret = ctxt->myDoc;
				4773	if (sax != NULL) {
				4774	ctxt->sax = NULL;
				4775	ctxt->userData = NULL;
				4776	}
				4777	htmlFreeParserCtxt(ctxt);
				4778
				4779	return(ret);
				4780	}
				4781
				4782	/**
				4783	* htmlParseDoc :
				4784	* @cur: a pointer to an array of xmlChar
				4785	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4786	*
				4787	* parse an HTML in-memory document and build a tree.
				4788	*
				4789	* Returns the resulting document tree
				4790	*/
				4791
				4792	htmlDocPtr
				4793	htmlParseDoc(xmlChar cur, const char encoding) {
				4794	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4795	}
				4796
				4797
				4798	/**
				4799	* htmlCreateFileParserCtxt :
				4800	* @filename: the filename
				4801	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4802	*
				4803	* Create a parser context for a file content.
				4804	* Automatic support for ZLIB/Compress compressed document is provided
				4805	* by default if found at compile-time.
				4806	*
				4807	* Returns the new parser context or NULL
				4808	*/
				4809	htmlParserCtxtPtr
				4810	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4811	{
				4812	htmlParserCtxtPtr ctxt;
				4813	htmlParserInputPtr inputStream;
				4814	xmlParserInputBufferPtr buf;
				4815	/* htmlCharEncoding enc; */
				4816	xmlChar content, content_line = (xmlChar *) "charset=";
				4817
				4818	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4819	if (buf == NULL) return(NULL);
				4820
				4821	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4822	if (ctxt == NULL) {
				4823	perror("malloc");
				4824	return(NULL);
				4825	}
				4826	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4827	htmlInitParserCtxt(ctxt);
				4828	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4829	if (inputStream == NULL) {
				4830	perror("malloc");
				4831	xmlFree(ctxt);
				4832	return(NULL);
				4833	}
				4834	memset(inputStream, 0, sizeof(htmlParserInput));
				4835
				4836	inputStream->filename = xmlMemStrdup(filename);
				4837	inputStream->line = 1;
				4838	inputStream->col = 1;
				4839	inputStream->buf = buf;
				4840	inputStream->directory = NULL;
				4841
				4842	inputStream->base = inputStream->buf->buffer->content;
				4843	inputStream->cur = inputStream->buf->buffer->content;
				4844	inputStream->free = NULL;
				4845
				4846	inputPush(ctxt, inputStream);
				4847
				4848	/* set encoding */
				4849	if (encoding) {
				4850	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4851	if (content) {
				4852	strcpy ((char )content, (char )content_line);
				4853	strcat ((char )content, (char )encoding);
				4854	htmlCheckEncoding (ctxt, content);
				4855	xmlFree (content);
				4856	}
				4857	}
				4858
				4859	return(ctxt);
				4860	}
				4861
				4862	/**
				4863	* htmlSAXParseFile :
				4864	* @filename: the filename
				4865	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4866	* @sax: the SAX handler block
				4867	* @userData: if using SAX, this pointer will be provided on callbacks.
				4868	*
				4869	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4870	* compressed document is provided by default if found at compile-time.
				4871	* It use the given SAX function block to handle the parsing callback.
				4872	* If sax is NULL, fallback to the default DOM tree building routines.
				4873	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4874	* Returns the resulting document tree unless SAX is NULL or the document is
				4875	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4876	*/
				4877
				4878	htmlDocPtr
				4879	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4880	void *userData) {
				4881	htmlDocPtr ret;
				4882	htmlParserCtxtPtr ctxt;
				4883	htmlSAXHandlerPtr oldsax = NULL;
				4884
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4885	xmlInitParser();
				4886
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4887	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4888	if (ctxt == NULL) return(NULL);
				4889	if (sax != NULL) {
				4890	oldsax = ctxt->sax;
				4891	ctxt->sax = sax;
				4892	ctxt->userData = userData;
				4893	}
				4894
				4895	htmlParseDocument(ctxt);
				4896
				4897	ret = ctxt->myDoc;
				4898	if (sax != NULL) {
				4899	ctxt->sax = oldsax;
				4900	ctxt->userData = NULL;
				4901	}
				4902	htmlFreeParserCtxt(ctxt);
				4903
				4904	return(ret);
				4905	}
				4906
				4907	/**
				4908	* htmlParseFile :
				4909	* @filename: the filename
				4910	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4911	*
				4912	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4913	* compressed document is provided by default if found at compile-time.
				4914	*
				4915	* Returns the resulting document tree
				4916	*/
				4917
				4918	htmlDocPtr
				4919	htmlParseFile(const char filename, const char encoding) {
				4920	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4921	}
				4922
				4923	/**
				4924	* htmlHandleOmittedElem:
				4925	* @val: int 0 or 1
				4926	*
				4927	* Set and return the previous value for handling HTML omitted tags.
				4928	*
				4929	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4930	*/
				4931
				4932	int
				4933	htmlHandleOmittedElem(int val) {
				4934	int old = htmlOmittedDefaultValue;
				4935
				4936	htmlOmittedDefaultValue = val;
				4937	return(old);
				4938	}
				4939
				4940	#endif /* LIBXML_HTML_ENABLED */