Blame - HTMLparser.c - platform/external/libxml2

blob: 6824ddd1414ca261c455ff7d5de9160500e31c9a [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
				43
				44	#define HTML_MAX_NAMELEN 1000
				45	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				46	#define HTML_PARSER_BUFFER_SIZE 100
				47
				48	/* #define DEBUG */
				49	/* #define DEBUG_PUSH */
				50
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	51	static int htmlOmittedDefaultValue = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	52
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	53	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				54	xmlChar end, xmlChar end2, xmlChar end3);
				55
				56	/************************************************************************
				57	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	58	* Parser stacks related functions and macros *
				59	* *
				60	************************************************************************/
				61
				62	/*
				63	* Generic function for accessing stacks in the Parser Context
				64	*/
				65
				66	#define PUSH_AND_POP(scope, type, name) \
				67	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				68	if (ctxt->name##Nr >= ctxt->name##Max) { \
				69	ctxt->name##Max *= 2; \
				70	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				71	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				72	if (ctxt->name##Tab == NULL) { \
				73	xmlGenericError(xmlGenericErrorContext, \
				74	"realloc failed !\n"); \
				75	return(0); \
				76	} \
				77	} \
				78	ctxt->name##Tab[ctxt->name##Nr] = value; \
				79	ctxt->name = value; \
				80	return(ctxt->name##Nr++); \
				81	} \
				82	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				83	type ret; \
				84	if (ctxt->name##Nr < 0) return(0); \
				85	ctxt->name##Nr--; \
				86	if (ctxt->name##Nr < 0) return(0); \
				87	if (ctxt->name##Nr > 0) \
				88	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				89	else \
				90	ctxt->name = NULL; \
				91	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				92	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				93	return(ret); \
				94	} \
				95
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	96	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				97	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	98
				99	/*
				100	* Macros for accessing the content. Those should be used only by the parser,
				101	* and not exported.
				102	*
				103	* Dirty macros, i.e. one need to make assumption on the context to use them
				104	*
				105	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				106	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				107	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				108	* in UNICODE mode. This should be used internally by the parser
				109	* only to compare to ASCII values otherwise it would break when
				110	* running with UTF-8 encoding.
				111	* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
				112	* to compare on ASCII based substring.
				113	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				114	* it should be used only to compare on ASCII based substring.
				115	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				116	* strings within the parser.
				117	*
				118	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				119	*
				120	* CURRENT Returns the current char value, with the full decoding of
				121	* UTF-8 if we are using this mode. It returns an int.
				122	* NEXT Skip to the next character, this does the proper decoding
				123	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				124	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				125	*/
				126
				127	#define UPPER (toupper(*ctxt->input->cur))
				128
				129	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				130
				131	#define NXT(val) ctxt->input->cur[(val)]
				132
				133	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				134
				135	#define CUR_PTR ctxt->input->cur
				136
				137	#define SHRINK xmlParserInputShrink(ctxt->input)
				138
				139	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				140
				141	#define CURRENT ((int) (*ctxt->input->cur))
				142
				143	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				144
				145	/* Inported from XML */
				146
				147	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				148	#define CUR ((int) (*ctxt->input->cur))
				149	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				150
				151	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				152	#define NXT(val) ctxt->input->cur[(val)]
				153	#define CUR_PTR ctxt->input->cur
				154
				155
				156	#define NEXTL(l) do { \
				157	if (*(ctxt->input->cur) == '\n') { \
				158	ctxt->input->line++; ctxt->input->col = 1; \
				159	} else ctxt->input->col++; \
				160	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				161	} while (0)
				162
				163	/************
				164	\
				165	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				166	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				167	************/
				168
				169	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				170	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				171
				172	#define COPY_BUF(l,b,i,v) \
				173	if (l == 1) b[i++] = (xmlChar) v; \
				174	else i += xmlCopyChar(l,&b[i],v)
				175
				176	/**
				177	* htmlCurrentChar:
				178	* @ctxt: the HTML parser context
				179	* @len: pointer to the length of the char read
				180	*
				181	* The current char value, if using UTF-8 this may actaully span multiple
				182	* bytes in the input buffer. Implement the end of line normalization:
				183	* 2.11 End-of-Line Handling
				184	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				185	* char, then the encoding converter is plugged in automatically.
				186	*
				187	* Returns the current char value and its lenght
				188	*/
				189
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	190	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	191	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				192	if (ctxt->instate == XML_PARSER_EOF)
				193	return(0);
				194
				195	if (ctxt->token != 0) {
				196	*len = 0;
				197	return(ctxt->token);
				198	}
				199	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				200	/*
				201	* We are supposed to handle UTF8, check it's valid
				202	* From rfc2044: encoding of the Unicode values on UTF-8:
				203	*
				204	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				205	* 0000 0000-0000 007F 0xxxxxxx
				206	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				207	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				208	*
				209	* Check for the 0x110000 limit too
				210	*/
				211	const unsigned char *cur = ctxt->input->cur;
				212	unsigned char c;
				213	unsigned int val;
				214
				215	c = *cur;
				216	if (c & 0x80) {
				217	if (cur[1] == 0)
				218	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				219	if ((cur[1] & 0xc0) != 0x80)
				220	goto encoding_error;
				221	if ((c & 0xe0) == 0xe0) {
				222
				223	if (cur[2] == 0)
				224	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				225	if ((cur[2] & 0xc0) != 0x80)
				226	goto encoding_error;
				227	if ((c & 0xf0) == 0xf0) {
				228	if (cur[3] == 0)
				229	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				230	if (((c & 0xf8) != 0xf0) \|\|
				231	((cur[3] & 0xc0) != 0x80))
				232	goto encoding_error;
				233	/* 4-byte code */
				234	*len = 4;
				235	val = (cur[0] & 0x7) << 18;
				236	val \|= (cur[1] & 0x3f) << 12;
				237	val \|= (cur[2] & 0x3f) << 6;
				238	val \|= cur[3] & 0x3f;
				239	} else {
				240	/* 3-byte code */
				241	*len = 3;
				242	val = (cur[0] & 0xf) << 12;
				243	val \|= (cur[1] & 0x3f) << 6;
				244	val \|= cur[2] & 0x3f;
				245	}
				246	} else {
				247	/* 2-byte code */
				248	*len = 2;
				249	val = (cur[0] & 0x1f) << 6;
				250	val \|= cur[1] & 0x3f;
				251	}
				252	if (!IS_CHAR(val)) {
				253	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				254	if ((ctxt->sax != NULL) &&
				255	(ctxt->sax->error != NULL))
				256	ctxt->sax->error(ctxt->userData,
				257	"Char 0x%X out of allowed range\n", val);
				258	ctxt->wellFormed = 0;
				259	ctxt->disableSAX = 1;
				260	}
				261	return(val);
				262	} else {
				263	/* 1-byte code */
				264	*len = 1;
				265	return((int) *ctxt->input->cur);
				266	}
				267	}
				268	/*
				269	* Assume it's a fixed lenght encoding (1) with
				270	* a compatibke encoding for the ASCII set, since
				271	* XML constructs only use < 128 chars
				272	*/
				273	*len = 1;
				274	if ((int) *ctxt->input->cur < 0x80)
				275	return((int) *ctxt->input->cur);
				276
				277	/*
				278	* Humm this is bad, do an automatic flow conversion
				279	*/
				280	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				281	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				282	return(xmlCurrentChar(ctxt, len));
				283
				284	encoding_error:
				285	/*
				286	* If we detect an UTF8 error that probably mean that the
				287	* input encoding didn't get properly advertized in the
				288	* declaration header. Report the error and switch the encoding
				289	* to ISO-Latin-1 (if you don't like this policy, just declare the
				290	* encoding !)
				291	*/
				292	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				293	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				294	ctxt->sax->error(ctxt->userData,
				295	"Input is not proper UTF-8, indicate encoding !\n");
				296	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				297	ctxt->input->cur[0], ctxt->input->cur[1],
				298	ctxt->input->cur[2], ctxt->input->cur[3]);
				299	}
				300
				301	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				302	*len = 1;
				303	return((int) *ctxt->input->cur);
				304	}
				305
				306	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	307	* htmlSkipBlankChars:
				308	* @ctxt: the HTML parser context
				309	*
				310	* skip all blanks character found at that point in the input streams.
				311	*
				312	* Returns the number of space chars skipped
				313	*/
				314
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	315	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	316	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				317	int res = 0;
				318
				319	while (IS_BLANK(*(ctxt->input->cur))) {
				320	if ((*ctxt->input->cur == 0) &&
				321	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				322	xmlPopInput(ctxt);
				323	} else {
				324	if (*(ctxt->input->cur) == '\n') {
				325	ctxt->input->line++; ctxt->input->col = 1;
				326	} else ctxt->input->col++;
				327	ctxt->input->cur++;
				328	ctxt->nbChars++;
				329	if (*ctxt->input->cur == 0)
				330	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				331	}
				332	res++;
				333	}
				334	return(res);
				335	}
				336
				337
				338
				339	/************************************************************************
				340	* *
				341	* The list of HTML elements and their properties *
				342	* *
				343	************************************************************************/
				344
				345	/*
				346	* Start Tag: 1 means the start tag can be ommited
				347	* End Tag: 1 means the end tag can be ommited
				348	* 2 means it's forbidden (empty elements)
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	349	* 3 means the tag is stylistic and should be closed easilly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	350	* Depr: this element is deprecated
				351	* DTD: 1 means that this element is valid only in the Loose DTD
				352	* 2 means that this element is valid only in the Frameset DTD
				353	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	354	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	355	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	356	static const htmlElemDesc
				357	html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	358	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				359	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				360	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				361	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				362	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				363	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				364	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				365	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				366	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				367	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				368	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				369	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				370	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				371	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				372	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				373	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				374	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				375	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				376	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				377	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				378	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				379	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				380	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				381	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				382	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				383	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				384	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				385	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				386	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				387	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				388	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				389	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				390	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				391	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				392	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				393	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				394	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				395	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				396	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				398	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				399	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				400	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				401	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				402	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				403	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				404	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				405	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				406	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				407	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				408	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				409	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				410	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				411	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				412	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				413	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				414	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				415	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				416	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				417	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				418	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				419	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				420	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
				421	{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
				422	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				423	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				424	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				425	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				426	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				427	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				428	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				429	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				430	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				431	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				432	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				433	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				434	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				435	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				436	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				437	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				438	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				439	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				440	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				441	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				442	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				443	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				444	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				445	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				446	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				447	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				448	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	449	};
				450
				451	/*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	452	* start tags that imply the end of current element
				453	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	454	static const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	455	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				456	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				457	"listing", "xmp", "head", NULL,
				458	"head", "p", NULL,
				459	"title", "p", NULL,
				460	"body", "head", "style", "link", "title", "p", NULL,
				461	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				462	"pre", "listing", "xmp", "head", "li", NULL,
				463	"hr", "p", "head", NULL,
				464	"h1", "p", "head", NULL,
				465	"h2", "p", "head", NULL,
				466	"h3", "p", "head", NULL,
				467	"h4", "p", "head", NULL,
				468	"h5", "p", "head", NULL,
				469	"h6", "p", "head", NULL,
				470	"dir", "p", "head", NULL,
				471	"address", "p", "head", "ul", NULL,
				472	"pre", "p", "head", "ul", NULL,
				473	"listing", "p", "head", NULL,
				474	"xmp", "p", "head", NULL,
				475	"blockquote", "p", "head", NULL,
				476	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				477	"xmp", "head", NULL,
				478	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				479	"head", "dd", NULL,
				480	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				481	"head", "dt", NULL,
				482	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				483	"listing", "xmp", NULL,
				484	"ol", "p", "head", "ul", NULL,
				485	"menu", "p", "head", "ul", NULL,
				486	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				487	"div", "p", "head", NULL,
				488	"noscript", "p", "head", NULL,
				489	"center", "font", "b", "i", "p", "head", NULL,
				490	"a", "a", NULL,
				491	"caption", "p", NULL,
				492	"colgroup", "caption", "colgroup", "col", "p", NULL,
				493	"col", "caption", "col", "p", NULL,
				494	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				495	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	496	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				497	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	498	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				499	"thead", "caption", "col", "colgroup", NULL,
				500	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				501	"tbody", "p", NULL,
				502	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				503	"tfoot", "tbody", "p", NULL,
				504	"optgroup", "option", NULL,
				505	"option", "option", NULL,
				506	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				507	"pre", "listing", "xmp", "a", NULL,
				508	NULL
				509	};
				510
				511	/*
				512	* The list of HTML elements which are supposed not to have
				513	* CDATA content and where a p element will be implied
				514	*
				515	* TODO: extend that list by reading the HTML SGML DtD on
				516	* implied paragraph
				517	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	518	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	519	"html",
				520	"head",
				521	"body",
				522	NULL
				523	};
				524
				525	/*
				526	* The list of HTML attributes which are of content %Script;
				527	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				528	* it assumes the name starts with 'on'
				529	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	530	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	531	"onclick",
				532	"ondblclick",
				533	"onmousedown",
				534	"onmouseup",
				535	"onmouseover",
				536	"onmousemove",
				537	"onmouseout",
				538	"onkeypress",
				539	"onkeydown",
				540	"onkeyup",
				541	"onload",
				542	"onunload",
				543	"onfocus",
				544	"onblur",
				545	"onsubmit",
				546	"onrest",
				547	"onchange",
				548	"onselect"
				549	};
				550
/*
 * Priorities used to deal with broken HTML pages: a stray end tag is only
 * allowed to close elements whose priority is lower than or equal to its
 * own. Elements missing from the table get the priority stored in the
 * final NULL-named entry.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	578
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	579	static const char** htmlStartCloseIndex[100];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	580	static int htmlStartCloseIndexinitialized = 0;
				581
				582	/************************************************************************
				583	* *
				584	* functions to handle HTML specific data *
				585	* *
				586	************************************************************************/
				587
				588	/**
				589	* htmlInitAutoClose:
				590	*
				591	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				592	* This is not reentrant. Call xmlInitParser() once before processing in
				593	* case of use in multithreaded programs.
				594	*/
				595	void
				596	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	597	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	598
				599	if (htmlStartCloseIndexinitialized) return;
				600
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	601	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				602	indx = 0;
				603	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				604	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	605	while (htmlStartClose[i] != NULL) i++;
				606	i++;
				607	}
				608	htmlStartCloseIndexinitialized = 1;
				609	}
				610
				611	/**
				612	* htmlTagLookup:
				613	* @tag: The tag name in lowercase
				614	*
				615	* Lookup the HTML tag in the ElementTable
				616	*
				617	* Returns the related htmlElemDescPtr or NULL if not found.
				618	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	619	const htmlElemDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	620	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	621	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	622
				623	for (i = 0; i < (sizeof(html40ElementTable) /
				624	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	625	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	626	return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	627	}
				628	return(NULL);
				629	}
				630
				631	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	632	* htmlGetEndPriority:
				633	* @name: The name of the element to look up the priority for.
				634	*
				635	* Return value: The "endtag" priority.
				636	**/
				637	static int
				638	htmlGetEndPriority (const xmlChar *name) {
				639	int i = 0;
				640
				641	while ((htmlEndPriority[i].name != NULL) &&
				642	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				643	i++;
				644
				645	return(htmlEndPriority[i].priority);
				646	}
				647
				648	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	649	* htmlCheckAutoClose:
				650	* @newtag: The new tag name
				651	* @oldtag: The old tag name
				652	*
				653	* Checks wether the new tag is one of the registered valid tags for closing old.
				654	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				655	*
				656	* Returns 0 if no, 1 if yes.
				657	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	658	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	659	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	660	int i, indx;
				661	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	662
				663	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				664
				665	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	666	for (indx = 0; indx < 100;indx++) {
				667	closed = htmlStartCloseIndex[indx];
				668	if (closed == NULL) return(0);
				669	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	670	}
				671
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	672	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	673	i++;
				674	while (htmlStartClose[i] != NULL) {
				675	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				676	return(1);
				677	}
				678	i++;
				679	}
				680	return(0);
				681	}
				682
				683	/**
				684	* htmlAutoCloseOnClose:
				685	* @ctxt: an HTML parser context
				686	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	687	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	688	*
				689	* The HTmL DtD allows an ending tag to implicitely close other tags.
				690	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	691	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	692	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	693	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	694	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	695	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	696
				697	#ifdef DEBUG
				698	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				699	for (i = 0;i < ctxt->nameNr;i++)
				700	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				701	#endif
				702
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	703	priority = htmlGetEndPriority (newtag);
				704
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	705	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	706
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	707	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	708	/*
				709	* A missplaced endtagad can only close elements with lower
				710	* or equal priority, so if we find an element with higher
				711	* priority before we find an element with
				712	* matching name, we just ignore this endtag
				713	*/
				714	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	715	}
				716	if (i < 0) return;
				717
				718	while (!xmlStrEqual(newtag, ctxt->name)) {
				719	info = htmlTagLookup(ctxt->name);
				720	if ((info == NULL) \|\| (info->endTag == 1)) {
				721	#ifdef DEBUG
				722	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				723	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	724	} else if (info->endTag == 3) {
				725	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	726	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	727	#endif
				728	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				729	ctxt->sax->error(ctxt->userData,
				730	"Opening and ending tag mismatch: %s and %s\n",
				731	newtag, ctxt->name);
				732	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	733	}
				734	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				735	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				736	oldname = htmlnamePop(ctxt);
				737	if (oldname != NULL) {
				738	#ifdef DEBUG
				739	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				740	#endif
				741	xmlFree(oldname);
				742	}
				743	}
				744	}
				745
				746	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	747	* htmlAutoCloseOnEnd:
				748	* @ctxt: an HTML parser context
				749	*
				750	* Close all remaining tags at the end of the stream
				751	*/
				752	static void
				753	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				754	xmlChar *oldname;
				755	int i;
				756
				757	if (ctxt->nameNr == 0)
				758	return;
				759	#ifdef DEBUG
				760	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				761	#endif
				762
				763	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				764	#ifdef DEBUG
				765	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				766	#endif
				767	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				768	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				769	oldname = htmlnamePop(ctxt);
				770	if (oldname != NULL) {
				771	#ifdef DEBUG
				772	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				773	#endif
				774	xmlFree(oldname);
				775	}
				776	}
				777	}
				778
				779	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	780	* htmlAutoClose:
				781	* @ctxt: an HTML parser context
				782	* @newtag: The new tag name or NULL
				783	*
				784	* The HTmL DtD allows a tag to implicitely close other tags.
				785	* The list is kept in htmlStartClose array. This function is
				786	* called when a new tag has been detected and generates the
				787	* appropriates closes if possible/needed.
				788	* If newtag is NULL this mean we are at the end of the resource
				789	* and we should check
				790	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	791	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	792	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				793	xmlChar *oldname;
				794	while ((newtag != NULL) && (ctxt->name != NULL) &&
				795	(htmlCheckAutoClose(newtag, ctxt->name))) {
				796	#ifdef DEBUG
				797	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				798	#endif
				799	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				800	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				801	oldname = htmlnamePop(ctxt);
				802	if (oldname != NULL) {
				803	#ifdef DEBUG
				804	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				805	#endif
				806	xmlFree(oldname);
				807	}
				808	}
				809	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	810	htmlAutoCloseOnEnd(ctxt);
				811	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	812	}
				813	while ((newtag == NULL) && (ctxt->name != NULL) &&
				814	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				815	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				816	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				817	#ifdef DEBUG
				818	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				819	#endif
				820	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				821	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				822	oldname = htmlnamePop(ctxt);
				823	if (oldname != NULL) {
				824	#ifdef DEBUG
				825	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				826	#endif
				827	xmlFree(oldname);
				828	}
				829	}
				830
				831	}
				832
				833	/**
				834	* htmlAutoCloseTag:
				835	* @doc: the HTML document
				836	* @name: The tag name
				837	* @elem: the HTML element
				838	*
				839	* The HTmL DtD allows a tag to implicitely close other tags.
				840	* The list is kept in htmlStartClose array. This function checks
				841	* if the element or one of it's children would autoclose the
				842	* given tag.
				843	*
				844	* Returns 1 if autoclose, 0 otherwise
				845	*/
				846	int
				847	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				848	htmlNodePtr child;
				849
				850	if (elem == NULL) return(1);
				851	if (xmlStrEqual(name, elem->name)) return(0);
				852	if (htmlCheckAutoClose(elem->name, name)) return(1);
				853	child = elem->children;
				854	while (child != NULL) {
				855	if (htmlAutoCloseTag(doc, name, child)) return(1);
				856	child = child->next;
				857	}
				858	return(0);
				859	}
				860
				861	/**
				862	* htmlIsAutoClosed:
				863	* @doc: the HTML document
				864	* @elem: the HTML element
				865	*
				866	* The HTmL DtD allows a tag to implicitely close other tags.
				867	* The list is kept in htmlStartClose array. This function checks
				868	* if a tag is autoclosed by one of it's child
				869	*
				870	* Returns 1 if autoclosed, 0 otherwise
				871	*/
				872	int
				873	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				874	htmlNodePtr child;
				875
				876	if (elem == NULL) return(1);
				877	child = elem->children;
				878	while (child != NULL) {
				879	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				880	child = child->next;
				881	}
				882	return(0);
				883	}
				884
				885	/**
				886	* htmlCheckImplied:
				887	* @ctxt: an HTML parser context
				888	* @newtag: The new tag name
				889	*
				890	* The HTML DtD allows a tag to exists only implicitely
				891	* called when a new tag has been detected and generates the
				892	* appropriates implicit tags if missing
				893	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	894	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	895	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				896	if (!htmlOmittedDefaultValue)
				897	return;
				898	if (xmlStrEqual(newtag, BAD_CAST"html"))
				899	return;
				900	if (ctxt->nameNr <= 0) {
				901	#ifdef DEBUG
				902	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				903	#endif
				904	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				905	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				906	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				907	}
				908	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				909	return;
				910	if ((ctxt->nameNr <= 1) &&
				911	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				912	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				913	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				914	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				915	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				916	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				917	/*
				918	* dropped OBJECT ... i you put it first BODY will be
				919	* assumed !
				920	*/
				921	#ifdef DEBUG
				922	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				923	#endif
				924	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				925	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				926	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				927	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				928	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				929	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				930	int i;
				931	for (i = 0;i < ctxt->nameNr;i++) {
				932	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				933	return;
				934	}
				935	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				936	return;
				937	}
				938	}
				939
				940	#ifdef DEBUG
				941	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				942	#endif
				943	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				944	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				945	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				946	}
				947	}
				948
				949	/**
				950	* htmlCheckParagraph
				951	* @ctxt: an HTML parser context
				952	*
				953	* Check whether a p element need to be implied before inserting
				954	* characters in the current element.
				955	*
				956	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				957	* in case of error.
				958	*/
				959
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	960	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	961	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				962	const xmlChar *tag;
				963	int i;
				964
				965	if (ctxt == NULL)
				966	return(-1);
				967	tag = ctxt->name;
				968	if (tag == NULL) {
				969	htmlAutoClose(ctxt, BAD_CAST"p");
				970	htmlCheckImplied(ctxt, BAD_CAST"p");
				971	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				972	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				973	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				974	return(1);
				975	}
				976	if (!htmlOmittedDefaultValue)
				977	return(0);
				978	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				979	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				980	#ifdef DEBUG
				981	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				982	#endif
				983	htmlAutoClose(ctxt, BAD_CAST"p");
				984	htmlCheckImplied(ctxt, BAD_CAST"p");
				985	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				986	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				987	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				988	return(1);
				989	}
				990	}
				991	return(0);
				992	}
				993
				994	/**
				995	* htmlIsScriptAttribute:
				996	* @name: an attribute name
				997	*
				998	* Check if an attribute is of content type Script
				999	*
				1000	* Returns 1 is the attribute is a script 0 otherwise
				1001	*/
				1002	int
				1003	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1004	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1005
				1006	if (name == NULL)
				1007	return(0);
				1008	/*
				1009	* all script attributes start with 'on'
				1010	*/
				1011	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1012	return(0);
				1013	for (i = 0;
				1014	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1015	i++) {
				1016	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1017	return(1);
				1018	}
				1019	return(0);
				1020	}
				1021
				1022	/************************************************************************
				1023	* *
				1024	* The list of HTML predefined entities *
				1025	* *
				1026	************************************************************************/
				1027
				1028
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1029	static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1030	/*
				1031	* the 4 absolute ones, plus apostrophe.
				1032	*/
				1033	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1034	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1035	{ 39, "apos", "single quote" },
				1036	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1037	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1038
				1039	/*
				1040	* A bunch still in the 128-255 range
				1041	* Replacing them depend really on the charset used.
				1042	*/
				1043	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1044	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1045	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1046	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1047	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1048	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1049	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1050	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1051	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1052	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1053	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1054	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1055	{ 172, "not", "not sign, U+00AC ISOnum" },
				1056	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1057	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1058	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1059	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1060	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1061	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1062	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1063	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1064	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1065	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1066	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1067	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1068	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1069	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1070	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1071	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1072	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1073	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1074	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1075	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1076	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1077	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1078	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1079	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1080	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1081	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1082	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1083	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1084	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1085	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1086	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1087	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1088	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1089	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1090	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1091	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1092	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1093	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1094	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1095	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1096	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1097	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1098	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1099	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1100	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1101	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1102	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1103	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1104	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1105	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1106	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1107	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1108	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1109	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1110	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1111	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1112	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1113	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1114	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1115	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1116	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1117	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1118	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1119	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1120	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1121	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1122	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1123	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1124	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1125	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1126	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1127	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1128	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1129	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1130	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1131	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1132	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1133	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1134	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1135	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1136	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1137	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1138	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1139
				1140	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1141	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1142	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1143	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1144	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1145
				1146	/*
				1147	* Anything below should really be kept as entities references
				1148	*/
				1149	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1150
				1151	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1152	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1153
				1154	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1155	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1156	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1157	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1158	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1159	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1160	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1161	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1162	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1163	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1164	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1165	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1166	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1167	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1168	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1169	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1170	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1171	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1172	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1173	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1174	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1175	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1176	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1177	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1178
				1179	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1180	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1181	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1182	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1183	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1184	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1185	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1186	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1187	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1188	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1189	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1190	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1191	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1192	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1193	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1194	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1195	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1196	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1197	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1198	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1199	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1200	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1201	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1202	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1203	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1204	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1205	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1206	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1207
				1208	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1209	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1210	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1211	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1212	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1213	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1214	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1215	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1216	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1217	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1218	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1219	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1220	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1221	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1222	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1223	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1224	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1225
				1226	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1227	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1228
				1229	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1230
				1231	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1232	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1233
				1234	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1235	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1236
				1237	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1238	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1239
				1240	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1241
				1242	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1243	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1244	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1245	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1246	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1247	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1248	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1249	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1250	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1251	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1252	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1253	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1254	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1255	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1256	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1257	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1258
				1259	{ 8704, "forall","for all, U+2200 ISOtech" },
				1260	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1261	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1262	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1263	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1264	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1265	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1266	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1267	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1268	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1269	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1270	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1271	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1272	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1273	{ 8734, "infin","infinity, U+221E ISOtech" },
				1274	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1275	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1276	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1277	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1278	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1279	{ 8747, "int", "integral, U+222B ISOtech" },
				1280	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1281	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1282	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1283	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1284	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1285	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1286	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1287	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1288	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1289	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1290	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1291	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1292	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1293	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1294	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1295	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1296	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1297	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1298	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1299	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1300	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1301	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1302	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1303	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1304
				1305	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1306	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1307	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1308	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1309
				1310	};
				1311
				1312	/************************************************************************
				1313	* *
				1314	* Commodity functions to handle entities *
				1315	* *
				1316	************************************************************************/
				1317
				1318	/*
				1319	* Macro used to grow the current buffer.
				1320	*/
				1321	#define growBuffer(buffer) { \
				1322	buffer##_size *= 2; \
				1323	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1324	if (buffer == NULL) { \
				1325	perror("realloc failed"); \
				1326	return(NULL); \
				1327	} \
				1328	}
				1329
				1330	/**
				1331	* htmlEntityLookup:
				1332	* @name: the entity name
				1333	*
				1334	* Lookup the given entity in EntitiesTable
				1335	*
				1336	* TODO: the linear scan is really ugly, an hash table is really needed.
				1337	*
				1338	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1339	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	1340	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1341	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1342	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1343
				1344	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1345	sizeof(html40EntitiesTable[0]));i++) {
				1346	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1347	#ifdef DEBUG
				1348	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1349	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1350	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1351	}
				1352	}
				1353	return(NULL);
				1354	}
				1355
				1356	/**
				1357	* htmlEntityValueLookup:
				1358	* @value: the entity's unicode value
				1359	*
				1360	* Lookup the given entity in EntitiesTable
				1361	*
				1362	* TODO: the linear scan is really ugly, an hash table is really needed.
				1363	*
				1364	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1365	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	1366	const htmlEntityDesc *
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1367	htmlEntityValueLookup(unsigned int value) {
				1368	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1369	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1370	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1371	#endif
				1372
				1373	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1374	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1375	if (html40EntitiesTable[i].value >= value) {
				1376	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1377	break;
				1378	#ifdef DEBUG
				1379	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1380	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1381	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1382	}
				1383	#ifdef DEBUG
				1384	if (lv > html40EntitiesTable[i].value) {
				1385	xmlGenericError(xmlGenericErrorContext,
				1386	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1387	lv, html40EntitiesTable[i].value);
				1388	}
				1389	lv = html40EntitiesTable[i].value;
				1390	#endif
				1391	}
				1392	return(NULL);
				1393	}
				1394
				1395	/**
				1396	* UTF8ToHtml:
				1397	* @out: a pointer to an array of bytes to store the result
				1398	* @outlen: the length of @out
				1399	* @in: a pointer to an array of UTF-8 chars
				1400	* @inlen: the length of @in
				1401	*
				1402	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1403	* plus HTML entities block of chars out.
				1404	*
				1405	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1406	* The value of @inlen after return is the number of octets consumed
				1407	* as the return value is positive, else unpredictiable.
				1408	* The value of @outlen after return is the number of octets consumed.
				1409	*/
				1410	int
				1411	UTF8ToHtml(unsigned char* out, int *outlen,
				1412	const unsigned char* in, int *inlen) {
				1413	const unsigned char* processed = in;
				1414	const unsigned char* outend;
				1415	const unsigned char* outstart = out;
				1416	const unsigned char* instart = in;
				1417	const unsigned char* inend;
				1418	unsigned int c, d;
				1419	int trailing;
				1420
				1421	if (in == NULL) {
				1422	/*
				1423	* initialization nothing to do
				1424	*/
				1425	*outlen = 0;
				1426	*inlen = 0;
				1427	return(0);
				1428	}
				1429	inend = in + (*inlen);
				1430	outend = out + (*outlen);
				1431	while (in < inend) {
				1432	d = *in++;
				1433	if (d < 0x80) { c= d; trailing= 0; }
				1434	else if (d < 0xC0) {
				1435	/* trailing byte in leading position */
				1436	*outlen = out - outstart;
				1437	*inlen = processed - instart;
				1438	return(-2);
				1439	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1440	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1441	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1442	else {
				1443	/* no chance for this in Ascii */
				1444	*outlen = out - outstart;
				1445	*inlen = processed - instart;
				1446	return(-2);
				1447	}
				1448
				1449	if (inend - in < trailing) {
				1450	break;
				1451	}
				1452
				1453	for ( ; trailing; trailing--) {
				1454	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1455	break;
				1456	c <<= 6;
				1457	c \|= d & 0x3F;
				1458	}
				1459
				1460	/* assertion: c is a single UTF-4 value */
				1461	if (c < 0x80) {
				1462	if (out + 1 >= outend)
				1463	break;
				1464	*out++ = c;
				1465	} else {
				1466	int len;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	1467	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1468
				1469	/*
				1470	* Try to lookup a predefined HTML entity for it
				1471	*/
				1472
				1473	ent = htmlEntityValueLookup(c);
				1474	if (ent == NULL) {
				1475	/* no chance for this in Ascii */
				1476	*outlen = out - outstart;
				1477	*inlen = processed - instart;
				1478	return(-2);
				1479	}
				1480	len = strlen(ent->name);
				1481	if (out + 2 + len >= outend)
				1482	break;
				1483	*out++ = '&';
				1484	memcpy(out, ent->name, len);
				1485	out += len;
				1486	*out++ = ';';
				1487	}
				1488	processed = in;
				1489	}
				1490	*outlen = out - outstart;
				1491	*inlen = processed - instart;
				1492	return(0);
				1493	}
				1494
				1495	/**
				1496	* htmlEncodeEntities:
				1497	* @out: a pointer to an array of bytes to store the result
				1498	* @outlen: the length of @out
				1499	* @in: a pointer to an array of UTF-8 chars
				1500	* @inlen: the length of @in
				1501	* @quoteChar: the quote character to escape (' or ") or zero.
				1502	*
				1503	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1504	* plus HTML entities block of chars out.
				1505	*
				1506	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1507	* The value of @inlen after return is the number of octets consumed
				1508	* as the return value is positive, else unpredictiable.
				1509	* The value of @outlen after return is the number of octets consumed.
				1510	*/
				1511	int
				1512	htmlEncodeEntities(unsigned char* out, int *outlen,
				1513	const unsigned char* in, int *inlen, int quoteChar) {
				1514	const unsigned char* processed = in;
				1515	const unsigned char* outend = out + (*outlen);
				1516	const unsigned char* outstart = out;
				1517	const unsigned char* instart = in;
				1518	const unsigned char* inend = in + (*inlen);
				1519	unsigned int c, d;
				1520	int trailing;
				1521
				1522	while (in < inend) {
				1523	d = *in++;
				1524	if (d < 0x80) { c= d; trailing= 0; }
				1525	else if (d < 0xC0) {
				1526	/* trailing byte in leading position */
				1527	*outlen = out - outstart;
				1528	*inlen = processed - instart;
				1529	return(-2);
				1530	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1531	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1532	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1533	else {
				1534	/* no chance for this in Ascii */
				1535	*outlen = out - outstart;
				1536	*inlen = processed - instart;
				1537	return(-2);
				1538	}
				1539
				1540	if (inend - in < trailing)
				1541	break;
				1542
				1543	while (trailing--) {
				1544	if (((d= *in++) & 0xC0) != 0x80) {
				1545	*outlen = out - outstart;
				1546	*inlen = processed - instart;
				1547	return(-2);
				1548	}
				1549	c <<= 6;
				1550	c \|= d & 0x3F;
				1551	}
				1552
				1553	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1554	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1555	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1556	if (out >= outend)
				1557	break;
				1558	*out++ = c;
				1559	} else {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	1560	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1561	const char *cp;
				1562	char nbuf[16];
				1563	int len;
				1564
				1565	/*
				1566	* Try to lookup a predefined HTML entity for it
				1567	*/
				1568	ent = htmlEntityValueLookup(c);
				1569	if (ent == NULL) {
				1570	sprintf(nbuf, "#%u", c);
				1571	cp = nbuf;
				1572	}
				1573	else
				1574	cp = ent->name;
				1575	len = strlen(cp);
				1576	if (out + 2 + len > outend)
				1577	break;
				1578	*out++ = '&';
				1579	memcpy(out, cp, len);
				1580	out += len;
				1581	*out++ = ';';
				1582	}
				1583	processed = in;
				1584	}
				1585	*outlen = out - outstart;
				1586	*inlen = processed - instart;
				1587	return(0);
				1588	}
				1589
				1590	/**
				1591	* htmlDecodeEntities:
				1592	* @ctxt: the parser context
				1593	* @len: the len to decode (in bytes !), -1 for no size limit
				1594	* @end: an end marker xmlChar, 0 if none
				1595	* @end2: an end marker xmlChar, 0 if none
				1596	* @end3: an end marker xmlChar, 0 if none
				1597	*
				1598	* Subtitute the HTML entities by their value
				1599	*
				1600	* DEPRECATED !!!!
				1601	*
				1602	* Returns A newly allocated string with the substitution done. The caller
				1603	* must deallocate it !
				1604	*/
				1605	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1606	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1607	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1608	static int deprecated = 0;
				1609	if (!deprecated) {
				1610	xmlGenericError(xmlGenericErrorContext,
				1611	"htmlDecodeEntities() deprecated function reached\n");
				1612	deprecated = 1;
				1613	}
				1614	return(NULL);
				1615	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1616	xmlChar *name = NULL;
				1617	xmlChar *buffer = NULL;
				1618	unsigned int buffer_size = 0;
				1619	unsigned int nbchars = 0;
				1620	htmlEntityDescPtr ent;
				1621	unsigned int max = (unsigned int) len;
				1622	int c,l;
				1623
				1624	if (ctxt->depth > 40) {
				1625	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1626	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1627	ctxt->sax->error(ctxt->userData,
				1628	"Detected entity reference loop\n");
				1629	ctxt->wellFormed = 0;
				1630	ctxt->disableSAX = 1;
				1631	return(NULL);
				1632	}
				1633
				1634	/*
				1635	* allocate a translation buffer.
				1636	*/
				1637	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1638	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1639	if (buffer == NULL) {
				1640	perror("xmlDecodeEntities: malloc failed");
				1641	return(NULL);
				1642	}
				1643
				1644	/*
				1645	* Ok loop until we reach one of the ending char or a size limit.
				1646	*/
				1647	c = CUR_CHAR(l);
				1648	while ((nbchars < max) && (c != end) &&
				1649	(c != end2) && (c != end3)) {
				1650
				1651	if (c == 0) break;
				1652	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1653	int val = htmlParseCharRef(ctxt);
				1654	COPY_BUF(0,buffer,nbchars,val);
				1655	NEXTL(l);
				1656	} else if ((c == '&') && (ctxt->token != '&')) {
				1657	ent = htmlParseEntityRef(ctxt, &name);
				1658	if (name != NULL) {
				1659	if (ent != NULL) {
				1660	int val = ent->value;
				1661	COPY_BUF(0,buffer,nbchars,val);
				1662	NEXTL(l);
				1663	} else {
				1664	const xmlChar *cur = name;
				1665
				1666	buffer[nbchars++] = '&';
				1667	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1668	growBuffer(buffer);
				1669	}
				1670	while (*cur != 0) {
				1671	buffer[nbchars++] = *cur++;
				1672	}
				1673	buffer[nbchars++] = ';';
				1674	}
				1675	}
				1676	} else {
				1677	COPY_BUF(l,buffer,nbchars,c);
				1678	NEXTL(l);
				1679	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1680	growBuffer(buffer);
				1681	}
				1682	}
				1683	c = CUR_CHAR(l);
				1684	}
				1685	buffer[nbchars++] = 0;
				1686	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1687	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1688	}
				1689
				1690	/************************************************************************
				1691	* *
				1692	* Commodity functions to handle streams *
				1693	* *
				1694	************************************************************************/
				1695
				1696	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1697	* htmlNewInputStream:
				1698	* @ctxt: an HTML parser context
				1699	*
				1700	* Create a new input stream structure
				1701	* Returns the new input stream or NULL
				1702	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1703	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1704	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1705	htmlParserInputPtr input;
				1706
				1707	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1708	if (input == NULL) {
				1709	ctxt->errNo = XML_ERR_NO_MEMORY;
				1710	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1711	ctxt->sax->error(ctxt->userData,
				1712	"malloc: couldn't allocate a new input stream\n");
				1713	return(NULL);
				1714	}
				1715	memset(input, 0, sizeof(htmlParserInput));
				1716	input->filename = NULL;
				1717	input->directory = NULL;
				1718	input->base = NULL;
				1719	input->cur = NULL;
				1720	input->buf = NULL;
				1721	input->line = 1;
				1722	input->col = 1;
				1723	input->buf = NULL;
				1724	input->free = NULL;
				1725	input->version = NULL;
				1726	input->consumed = 0;
				1727	input->length = 0;
				1728	return(input);
				1729	}
				1730
				1731
				1732	/************************************************************************
				1733	* *
				1734	* Commodity functions, cleanup needed ? *
				1735	* *
				1736	************************************************************************/
				1737
				1738	/**
				1739	* areBlanks:
				1740	* @ctxt: an HTML parser context
				1741	* @str: a xmlChar *
				1742	* @len: the size of @str
				1743	*
				1744	* Is this a sequence of blank chars that one can ignore ?
				1745	*
				1746	* Returns 1 if ignorable 0 otherwise.
				1747	*/
				1748
				1749	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1750	int i;
				1751	xmlNodePtr lastChild;
				1752
				1753	for (i = 0;i < len;i++)
				1754	if (!(IS_BLANK(str[i]))) return(0);
				1755
				1756	if (CUR == 0) return(1);
				1757	if (CUR != '<') return(0);
				1758	if (ctxt->name == NULL)
				1759	return(1);
				1760	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1761	return(1);
				1762	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1763	return(1);
				1764	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1765	return(1);
				1766	if (ctxt->node == NULL) return(0);
				1767	lastChild = xmlGetLastChild(ctxt->node);
				1768	if (lastChild == NULL) {
Daniel Veillard	7db3773	2001-07-12 01:20:08 +0000	[diff] [blame]	1769	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				1770	(ctxt->node->content != NULL)) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1771	} else if (xmlNodeIsText(lastChild)) {
				1772	return(0);
				1773	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1774	return(0);
				1775	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1776	return(0);
				1777	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1778	return(0);
				1779	}
				1780	return(1);
				1781	}
				1782
				1783	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1784	* htmlNewDocNoDtD:
				1785	* @URI: URI for the dtd, or NULL
				1786	* @ExternalID: the external ID of the DTD, or NULL
				1787	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1788	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				1789	* are NULL
				1790	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1791	* Returns a new document, do not intialize the DTD if not provided
				1792	*/
				1793	htmlDocPtr
				1794	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1795	xmlDocPtr cur;
				1796
				1797	/*
				1798	* Allocate a new document and fill the fields.
				1799	*/
				1800	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1801	if (cur == NULL) {
				1802	xmlGenericError(xmlGenericErrorContext,
				1803	"xmlNewDoc : malloc failed\n");
				1804	return(NULL);
				1805	}
				1806	memset(cur, 0, sizeof(xmlDoc));
				1807
				1808	cur->type = XML_HTML_DOCUMENT_NODE;
				1809	cur->version = NULL;
				1810	cur->intSubset = NULL;
				1811	if ((ExternalID != NULL) \|\|
				1812	(URI != NULL))
				1813	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1814	cur->doc = cur;
				1815	cur->name = NULL;
				1816	cur->children = NULL;
				1817	cur->extSubset = NULL;
				1818	cur->oldNs = NULL;
				1819	cur->encoding = NULL;
				1820	cur->standalone = 1;
				1821	cur->compression = 0;
				1822	cur->ids = NULL;
				1823	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1824	cur->_private = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1825	return(cur);
				1826	}
				1827
				1828	/**
				1829	* htmlNewDoc:
				1830	* @URI: URI for the dtd, or NULL
				1831	* @ExternalID: the external ID of the DTD, or NULL
				1832	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1833	* Creates a new HTML document
				1834	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1835	* Returns a new document
				1836	*/
				1837	htmlDocPtr
				1838	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1839	if ((URI == NULL) && (ExternalID == NULL))
				1840	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1841	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1842	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1843
				1844	return(htmlNewDocNoDtD(URI, ExternalID));
				1845	}
				1846
				1847
				1848	/************************************************************************
				1849	* *
				1850	* The parser itself *
				1851	* Relates to http://www.w3.org/TR/html40 *
				1852	* *
				1853	************************************************************************/
				1854
				1855	/************************************************************************
				1856	* *
				1857	* The parser itself *
				1858	* *
				1859	************************************************************************/
				1860
				1861	/**
				1862	* htmlParseHTMLName:
				1863	* @ctxt: an HTML parser context
				1864	*
				1865	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1866	* since HTML names are not case-sensitive.
				1867	*
				1868	* Returns the Tag Name parsed or NULL
				1869	*/
				1870
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1871	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1872	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1873	xmlChar *ret = NULL;
				1874	int i = 0;
				1875	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1876
				1877	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1878	(CUR != ':')) return(NULL);
				1879
				1880	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1881	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1882	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1883	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1884	else loc[i] = CUR;
				1885	i++;
				1886
				1887	NEXT;
				1888	}
				1889
				1890	ret = xmlStrndup(loc, i);
				1891
				1892	return(ret);
				1893	}
				1894
				1895	/**
				1896	* htmlParseName:
				1897	* @ctxt: an HTML parser context
				1898	*
				1899	* parse an HTML name, this routine is case sensistive.
				1900	*
				1901	* Returns the Name parsed or NULL
				1902	*/
				1903
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1904	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1905	htmlParseName(htmlParserCtxtPtr ctxt) {
				1906	xmlChar buf[HTML_MAX_NAMELEN];
				1907	int len = 0;
				1908
				1909	GROW;
				1910	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1911	return(NULL);
				1912	}
				1913
				1914	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1915	(CUR == '.') \|\| (CUR == '-') \|\|
				1916	(CUR == '_') \|\| (CUR == ':') \|\|
				1917	(IS_COMBINING(CUR)) \|\|
				1918	(IS_EXTENDER(CUR))) {
				1919	buf[len++] = CUR;
				1920	NEXT;
				1921	if (len >= HTML_MAX_NAMELEN) {
				1922	xmlGenericError(xmlGenericErrorContext,
				1923	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1924	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1925	(CUR == '.') \|\| (CUR == '-') \|\|
				1926	(CUR == '_') \|\| (CUR == ':') \|\|
				1927	(IS_COMBINING(CUR)) \|\|
				1928	(IS_EXTENDER(CUR)))
				1929	NEXT;
				1930	break;
				1931	}
				1932	}
				1933	return(xmlStrndup(buf, len));
				1934	}
				1935
				1936	/**
				1937	* htmlParseHTMLAttribute:
				1938	* @ctxt: an HTML parser context
				1939	* @stop: a char stop value
				1940	*
				1941	* parse an HTML attribute value till the stop (quote), if
				1942	* stop is 0 then it stops at the first space
				1943	*
				1944	* Returns the attribute parsed or NULL
				1945	*/
				1946
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1947	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1948	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1949	xmlChar *buffer = NULL;
				1950	int buffer_size = 0;
				1951	xmlChar *out = NULL;
				1952	xmlChar *name = NULL;
				1953
				1954	xmlChar *cur = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	1955	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1956
				1957	/*
				1958	* allocate a translation buffer.
				1959	*/
				1960	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1961	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1962	if (buffer == NULL) {
				1963	perror("htmlParseHTMLAttribute: malloc failed");
				1964	return(NULL);
				1965	}
				1966	out = buffer;
				1967
				1968	/*
				1969	* Ok loop until we reach one of the ending chars
				1970	*/
				1971	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1972	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1973	if (CUR == '&') {
				1974	if (NXT(1) == '#') {
				1975	unsigned int c;
				1976	int bits;
				1977
				1978	c = htmlParseCharRef(ctxt);
				1979	if (c < 0x80)
				1980	{ *out++ = c; bits= -6; }
				1981	else if (c < 0x800)
				1982	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1983	else if (c < 0x10000)
				1984	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1985	else
				1986	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1987
				1988	for ( ; bits >= 0; bits-= 6) {
				1989	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1990	}
				1991	} else {
				1992	ent = htmlParseEntityRef(ctxt, &name);
				1993	if (name == NULL) {
				1994	*out++ = '&';
				1995	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1996	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1997
				1998	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1999	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2000	}
				2001	} else if (ent == NULL) {
				2002	*out++ = '&';
				2003	cur = name;
				2004	while (*cur != 0) {
				2005	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2006	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2007
				2008	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2009	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2010	}
				2011	out++ = cur++;
				2012	}
				2013	xmlFree(name);
				2014	} else {
				2015	unsigned int c;
				2016	int bits;
				2017
				2018	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2019	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2020
				2021	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2022	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2023	}
				2024	c = (xmlChar)ent->value;
				2025	if (c < 0x80)
				2026	{ *out++ = c; bits= -6; }
				2027	else if (c < 0x800)
				2028	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2029	else if (c < 0x10000)
				2030	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2031	else
				2032	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2033
				2034	for ( ; bits >= 0; bits-= 6) {
				2035	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2036	}
				2037	xmlFree(name);
				2038	}
				2039	}
				2040	} else {
				2041	unsigned int c;
				2042	int bits, l;
				2043
				2044	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2045	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2046
				2047	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2048	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2049	}
				2050	c = CUR_CHAR(l);
				2051	if (c < 0x80)
				2052	{ *out++ = c; bits= -6; }
				2053	else if (c < 0x800)
				2054	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2055	else if (c < 0x10000)
				2056	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2057	else
				2058	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2059
				2060	for ( ; bits >= 0; bits-= 6) {
				2061	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2062	}
				2063	NEXT;
				2064	}
				2065	}
				2066	*out++ = 0;
				2067	return(buffer);
				2068	}
				2069
				2070	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2071	* htmlParseEntityRef:
				2072	* @ctxt: an HTML parser context
				2073	* @str: location to store the entity name
				2074	*
				2075	* parse an HTML ENTITY references
				2076	*
				2077	* [68] EntityRef ::= '&' Name ';'
				2078	*
				2079	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2080	* if non-NULL *str will have to be freed by the caller.
				2081	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	2082	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2083	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2084	xmlChar *name;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	2085	const htmlEntityDesc * ent = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2086	*str = NULL;
				2087
				2088	if (CUR == '&') {
				2089	NEXT;
				2090	name = htmlParseName(ctxt);
				2091	if (name == NULL) {
				2092	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2093	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2094	ctxt->wellFormed = 0;
				2095	} else {
				2096	GROW;
				2097	if (CUR == ';') {
				2098	*str = name;
				2099
				2100	/*
				2101	* Lookup the entity in the table.
				2102	*/
				2103	ent = htmlEntityLookup(name);
				2104	if (ent != NULL) /* OK that's ugly !!! */
				2105	NEXT;
				2106	} else {
				2107	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2108	ctxt->sax->error(ctxt->userData,
				2109	"htmlParseEntityRef: expecting ';'\n");
				2110	*str = name;
				2111	}
				2112	}
				2113	}
				2114	return(ent);
				2115	}
				2116
				2117	/**
				2118	* htmlParseAttValue:
				2119	* @ctxt: an HTML parser context
				2120	*
				2121	* parse a value for an attribute
				2122	* Note: the parser won't do substitution of entities here, this
				2123	* will be handled later in xmlStringGetNodeList, unless it was
				2124	* asked for ctxt->replaceEntities != 0
				2125	*
				2126	* Returns the AttValue parsed or NULL.
				2127	*/
				2128
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2129	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2130	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2131	xmlChar *ret = NULL;
				2132
				2133	if (CUR == '"') {
				2134	NEXT;
				2135	ret = htmlParseHTMLAttribute(ctxt, '"');
				2136	if (CUR != '"') {
				2137	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2138	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2139	ctxt->wellFormed = 0;
				2140	} else
				2141	NEXT;
				2142	} else if (CUR == '\'') {
				2143	NEXT;
				2144	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2145	if (CUR != '\'') {
				2146	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2147	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2148	ctxt->wellFormed = 0;
				2149	} else
				2150	NEXT;
				2151	} else {
				2152	/*
				2153	* That's an HTMLism, the attribute value may not be quoted
				2154	*/
				2155	ret = htmlParseHTMLAttribute(ctxt, 0);
				2156	if (ret == NULL) {
				2157	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2158	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2159	ctxt->wellFormed = 0;
				2160	}
				2161	}
				2162	return(ret);
				2163	}
				2164
				2165	/**
				2166	* htmlParseSystemLiteral:
				2167	* @ctxt: an HTML parser context
				2168	*
				2169	* parse an HTML Literal
				2170	*
				2171	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2172	*
				2173	* Returns the SystemLiteral parsed or NULL
				2174	*/
				2175
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2176	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2177	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2178	const xmlChar *q;
				2179	xmlChar *ret = NULL;
				2180
				2181	if (CUR == '"') {
				2182	NEXT;
				2183	q = CUR_PTR;
				2184	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2185	NEXT;
				2186	if (!IS_CHAR(CUR)) {
				2187	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2188	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2189	ctxt->wellFormed = 0;
				2190	} else {
				2191	ret = xmlStrndup(q, CUR_PTR - q);
				2192	NEXT;
				2193	}
				2194	} else if (CUR == '\'') {
				2195	NEXT;
				2196	q = CUR_PTR;
				2197	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2198	NEXT;
				2199	if (!IS_CHAR(CUR)) {
				2200	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2201	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2202	ctxt->wellFormed = 0;
				2203	} else {
				2204	ret = xmlStrndup(q, CUR_PTR - q);
				2205	NEXT;
				2206	}
				2207	} else {
				2208	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2209	ctxt->sax->error(ctxt->userData,
				2210	"SystemLiteral \" or ' expected\n");
				2211	ctxt->wellFormed = 0;
				2212	}
				2213
				2214	return(ret);
				2215	}
				2216
				2217	/**
				2218	* htmlParsePubidLiteral:
				2219	* @ctxt: an HTML parser context
				2220	*
				2221	* parse an HTML public literal
				2222	*
				2223	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2224	*
				2225	* Returns the PubidLiteral parsed or NULL.
				2226	*/
				2227
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2228	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2229	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2230	const xmlChar *q;
				2231	xmlChar *ret = NULL;
				2232	/*
				2233	* Name ::= (Letter \| '_') (NameChar)*
				2234	*/
				2235	if (CUR == '"') {
				2236	NEXT;
				2237	q = CUR_PTR;
				2238	while (IS_PUBIDCHAR(CUR)) NEXT;
				2239	if (CUR != '"') {
				2240	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2241	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2242	ctxt->wellFormed = 0;
				2243	} else {
				2244	ret = xmlStrndup(q, CUR_PTR - q);
				2245	NEXT;
				2246	}
				2247	} else if (CUR == '\'') {
				2248	NEXT;
				2249	q = CUR_PTR;
				2250	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2251	NEXT;
				2252	if (!IS_LETTER(CUR)) {
				2253	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2254	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2255	ctxt->wellFormed = 0;
				2256	} else {
				2257	ret = xmlStrndup(q, CUR_PTR - q);
				2258	NEXT;
				2259	}
				2260	} else {
				2261	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2262	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2263	ctxt->wellFormed = 0;
				2264	}
				2265
				2266	return(ret);
				2267	}
				2268
				2269	/**
				2270	* htmlParseScript:
				2271	* @ctxt: an HTML parser context
				2272	*
				2273	* parse the content of an HTML SCRIPT or STYLE element
				2274	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2275	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2276	* http://www.w3.org/TR/html4/types.html#type-script
				2277	* http://www.w3.org/TR/html4/types.html#h-6.15
				2278	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2279	*
				2280	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2281	* element and the value of intrinsic event attributes. User agents must
				2282	* not evaluate script data as HTML markup but instead must pass it on as
				2283	* data to a script engine.
				2284	* NOTES:
				2285	* - The content is passed like CDATA
				2286	* - the attributes for style and scripting "onXXX" are also described
				2287	* as CDATA but SGML allows entities references in attributes so their
				2288	* processing is identical as other attributes
				2289	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2290	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2291	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2292	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2293	int nbchar = 0;
				2294	xmlChar cur;
				2295
				2296	SHRINK;
				2297	cur = CUR;
				2298	while (IS_CHAR(cur)) {
				2299	if ((cur == '<') && (NXT(1) == '/')) {
				2300	/*
				2301	* One should break here, the specification is clear:
				2302	* Authors should therefore escape "</" within the content.
				2303	* Escape mechanisms are specific to each scripting or
				2304	* style sheet language.
				2305	*/
				2306	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2307	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2308	break; /* while */
				2309	}
				2310	buf[nbchar++] = cur;
				2311	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2312	if (ctxt->sax->cdataBlock!= NULL) {
				2313	/*
				2314	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2315	*/
				2316	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2317	}
				2318	nbchar = 0;
				2319	}
				2320	NEXT;
				2321	cur = CUR;
				2322	}
				2323	if (!(IS_CHAR(cur))) {
				2324	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2325	ctxt->sax->error(ctxt->userData,
				2326	"Invalid char in CDATA 0x%X\n", cur);
				2327	ctxt->wellFormed = 0;
				2328	NEXT;
				2329	}
				2330
				2331	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2332	if (ctxt->sax->cdataBlock!= NULL) {
				2333	/*
				2334	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2335	*/
				2336	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2337	}
				2338	}
				2339	}
				2340
				2341
				2342	/**
				2343	* htmlParseCharData:
				2344	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2345	*
				2346	* parse a CharData section.
				2347	* if we are within a CDATA section ']]>' marks an end of section.
				2348	*
				2349	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2350	*/
				2351
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2352	static void
				2353	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2354	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2355	int nbchar = 0;
				2356	int cur, l;
				2357
				2358	SHRINK;
				2359	cur = CUR_CHAR(l);
				2360	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2361	((cur != '&') \|\| (ctxt->token == '&')) &&
				2362	(IS_CHAR(cur))) {
				2363	COPY_BUF(l,buf,nbchar,cur);
				2364	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2365	/*
				2366	* Ok the segment is to be consumed as chars.
				2367	*/
				2368	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2369	if (areBlanks(ctxt, buf, nbchar)) {
				2370	if (ctxt->sax->ignorableWhitespace != NULL)
				2371	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2372	buf, nbchar);
				2373	} else {
				2374	htmlCheckParagraph(ctxt);
				2375	if (ctxt->sax->characters != NULL)
				2376	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2377	}
				2378	}
				2379	nbchar = 0;
				2380	}
				2381	NEXTL(l);
				2382	cur = CUR_CHAR(l);
				2383	}
				2384	if (nbchar != 0) {
				2385	/*
				2386	* Ok the segment is to be consumed as chars.
				2387	*/
				2388	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2389	if (areBlanks(ctxt, buf, nbchar)) {
				2390	if (ctxt->sax->ignorableWhitespace != NULL)
				2391	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2392	} else {
				2393	htmlCheckParagraph(ctxt);
				2394	if (ctxt->sax->characters != NULL)
				2395	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2396	}
				2397	}
				2398	}
				2399	}
				2400
				2401	/**
				2402	* htmlParseExternalID:
				2403	* @ctxt: an HTML parser context
				2404	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2405	*
				2406	* Parse an External ID or a Public ID
				2407	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2408	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2409	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2410	*
				2411	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2412	*
				2413	* Returns the function returns SystemLiteral and in the second
				2414	* case publicID receives PubidLiteral, is strict is off
				2415	* it is possible to return NULL and have publicID set.
				2416	*/
				2417
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2418	static xmlChar *
				2419	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2420	xmlChar *URI = NULL;
				2421
				2422	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2423	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2424	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2425	SKIP(6);
				2426	if (!IS_BLANK(CUR)) {
				2427	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2428	ctxt->sax->error(ctxt->userData,
				2429	"Space required after 'SYSTEM'\n");
				2430	ctxt->wellFormed = 0;
				2431	}
				2432	SKIP_BLANKS;
				2433	URI = htmlParseSystemLiteral(ctxt);
				2434	if (URI == NULL) {
				2435	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2436	ctxt->sax->error(ctxt->userData,
				2437	"htmlParseExternalID: SYSTEM, no URI\n");
				2438	ctxt->wellFormed = 0;
				2439	}
				2440	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2441	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2442	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2443	SKIP(6);
				2444	if (!IS_BLANK(CUR)) {
				2445	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2446	ctxt->sax->error(ctxt->userData,
				2447	"Space required after 'PUBLIC'\n");
				2448	ctxt->wellFormed = 0;
				2449	}
				2450	SKIP_BLANKS;
				2451	*publicID = htmlParsePubidLiteral(ctxt);
				2452	if (*publicID == NULL) {
				2453	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2454	ctxt->sax->error(ctxt->userData,
				2455	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2456	ctxt->wellFormed = 0;
				2457	}
				2458	SKIP_BLANKS;
				2459	if ((CUR == '"') \|\| (CUR == '\'')) {
				2460	URI = htmlParseSystemLiteral(ctxt);
				2461	}
				2462	}
				2463	return(URI);
				2464	}
				2465
				2466	/**
				2467	* htmlParseComment:
				2468	* @ctxt: an HTML parser context
				2469	*
				2470	* Parse an XML (SGML) comment <!-- .... -->
				2471	*
				2472	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2473	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2474	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2475	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2476	xmlChar *buf = NULL;
				2477	int len;
				2478	int size = HTML_PARSER_BUFFER_SIZE;
				2479	int q, ql;
				2480	int r, rl;
				2481	int cur, l;
				2482	xmlParserInputState state;
				2483
				2484	/*
				2485	* Check that there is a comment right here.
				2486	*/
				2487	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2488	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2489
				2490	state = ctxt->instate;
				2491	ctxt->instate = XML_PARSER_COMMENT;
				2492	SHRINK;
				2493	SKIP(4);
				2494	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2495	if (buf == NULL) {
				2496	xmlGenericError(xmlGenericErrorContext,
				2497	"malloc of %d byte failed\n", size);
				2498	ctxt->instate = state;
				2499	return;
				2500	}
				2501	q = CUR_CHAR(ql);
				2502	NEXTL(ql);
				2503	r = CUR_CHAR(rl);
				2504	NEXTL(rl);
				2505	cur = CUR_CHAR(l);
				2506	len = 0;
				2507	while (IS_CHAR(cur) &&
				2508	((cur != '>') \|\|
				2509	(r != '-') \|\| (q != '-'))) {
				2510	if (len + 5 >= size) {
				2511	size *= 2;
				2512	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2513	if (buf == NULL) {
				2514	xmlGenericError(xmlGenericErrorContext,
				2515	"realloc of %d byte failed\n", size);
				2516	ctxt->instate = state;
				2517	return;
				2518	}
				2519	}
				2520	COPY_BUF(ql,buf,len,q);
				2521	q = r;
				2522	ql = rl;
				2523	r = cur;
				2524	rl = l;
				2525	NEXTL(l);
				2526	cur = CUR_CHAR(l);
				2527	if (cur == 0) {
				2528	SHRINK;
				2529	GROW;
				2530	cur = CUR_CHAR(l);
				2531	}
				2532	}
				2533	buf[len] = 0;
				2534	if (!IS_CHAR(cur)) {
				2535	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2536	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2537	ctxt->sax->error(ctxt->userData,
				2538	"Comment not terminated \n<!--%.50s\n", buf);
				2539	ctxt->wellFormed = 0;
				2540	xmlFree(buf);
				2541	} else {
				2542	NEXT;
				2543	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2544	(!ctxt->disableSAX))
				2545	ctxt->sax->comment(ctxt->userData, buf);
				2546	xmlFree(buf);
				2547	}
				2548	ctxt->instate = state;
				2549	}
				2550
				2551	/**
				2552	* htmlParseCharRef:
				2553	* @ctxt: an HTML parser context
				2554	*
				2555	* parse Reference declarations
				2556	*
				2557	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2558	* '&#x' [0-9a-fA-F]+ ';'
				2559	*
				2560	* Returns the value parsed (as an int)
				2561	*/
				2562	int
				2563	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2564	int val = 0;
				2565
				2566	if ((CUR == '&') && (NXT(1) == '#') &&
				2567	(NXT(2) == 'x')) {
				2568	SKIP(3);
				2569	while (CUR != ';') {
				2570	if ((CUR >= '0') && (CUR <= '9'))
				2571	val = val * 16 + (CUR - '0');
				2572	else if ((CUR >= 'a') && (CUR <= 'f'))
				2573	val = val * 16 + (CUR - 'a') + 10;
				2574	else if ((CUR >= 'A') && (CUR <= 'F'))
				2575	val = val * 16 + (CUR - 'A') + 10;
				2576	else {
				2577	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2578	ctxt->sax->error(ctxt->userData,
				2579	"htmlParseCharRef: invalid hexadecimal value\n");
				2580	ctxt->wellFormed = 0;
				2581	return(0);
				2582	}
				2583	NEXT;
				2584	}
				2585	if (CUR == ';')
				2586	NEXT;
				2587	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2588	SKIP(2);
				2589	while (CUR != ';') {
				2590	if ((CUR >= '0') && (CUR <= '9'))
				2591	val = val * 10 + (CUR - '0');
				2592	else {
				2593	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2594	ctxt->sax->error(ctxt->userData,
				2595	"htmlParseCharRef: invalid decimal value\n");
				2596	ctxt->wellFormed = 0;
				2597	return(0);
				2598	}
				2599	NEXT;
				2600	}
				2601	if (CUR == ';')
				2602	NEXT;
				2603	} else {
				2604	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2605	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2606	ctxt->wellFormed = 0;
				2607	}
				2608	/*
				2609	* Check the value IS_CHAR ...
				2610	*/
				2611	if (IS_CHAR(val)) {
				2612	return(val);
				2613	} else {
				2614	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2615	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2616	val);
				2617	ctxt->wellFormed = 0;
				2618	}
				2619	return(0);
				2620	}
				2621
				2622
				2623	/**
				2624	* htmlParseDocTypeDecl :
				2625	* @ctxt: an HTML parser context
				2626	*
				2627	* parse a DOCTYPE declaration
				2628	*
				2629	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2630	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2631	*/
				2632
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2633	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2634	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2635	xmlChar *name;
				2636	xmlChar *ExternalID = NULL;
				2637	xmlChar *URI = NULL;
				2638
				2639	/*
				2640	* We know that '<!DOCTYPE' has been detected.
				2641	*/
				2642	SKIP(9);
				2643
				2644	SKIP_BLANKS;
				2645
				2646	/*
				2647	* Parse the DOCTYPE name.
				2648	*/
				2649	name = htmlParseName(ctxt);
				2650	if (name == NULL) {
				2651	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2652	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2653	ctxt->wellFormed = 0;
				2654	}
				2655	/*
				2656	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2657	*/
				2658
				2659	SKIP_BLANKS;
				2660
				2661	/*
				2662	* Check for SystemID and ExternalID
				2663	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2664	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2665	SKIP_BLANKS;
				2666
				2667	/*
				2668	* We should be at the end of the DOCTYPE declaration.
				2669	*/
				2670	if (CUR != '>') {
				2671	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2672	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				2673	ctxt->wellFormed = 0;
				2674	/* We shouldn't try to resynchronize ... */
				2675	}
				2676	NEXT;
				2677
				2678	/*
				2679	* Create or update the document accordingly to the DOCTYPE
				2680	*/
				2681	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2682	(!ctxt->disableSAX))
				2683	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2684
				2685	/*
				2686	* Cleanup, since we don't use all those identifiers
				2687	*/
				2688	if (URI != NULL) xmlFree(URI);
				2689	if (ExternalID != NULL) xmlFree(ExternalID);
				2690	if (name != NULL) xmlFree(name);
				2691	}
				2692
				2693	/**
				2694	* htmlParseAttribute:
				2695	* @ctxt: an HTML parser context
				2696	* @value: a xmlChar ** used to store the value of the attribute
				2697	*
				2698	* parse an attribute
				2699	*
				2700	* [41] Attribute ::= Name Eq AttValue
				2701	*
				2702	* [25] Eq ::= S? '=' S?
				2703	*
				2704	* With namespace:
				2705	*
				2706	* [NS 11] Attribute ::= QName Eq AttValue
				2707	*
				2708	* Also the case QName == xmlns:??? is handled independently as a namespace
				2709	* definition.
				2710	*
				2711	* Returns the attribute name, and the value in *value.
				2712	*/
				2713
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2714	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2715	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2716	xmlChar name, val = NULL;
				2717
				2718	*value = NULL;
				2719	name = htmlParseHTMLName(ctxt);
				2720	if (name == NULL) {
				2721	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2722	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2723	ctxt->wellFormed = 0;
				2724	return(NULL);
				2725	}
				2726
				2727	/*
				2728	* read the value
				2729	*/
				2730	SKIP_BLANKS;
				2731	if (CUR == '=') {
				2732	NEXT;
				2733	SKIP_BLANKS;
				2734	val = htmlParseAttValue(ctxt);
				2735	/******
				2736	} else {
				2737	* TODO : some attribute must have values, some may not
				2738	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2739	ctxt->sax->warning(ctxt->userData,
				2740	"No value for attribute %s\n", name); */
				2741	}
				2742
				2743	*value = val;
				2744	return(name);
				2745	}
				2746
				2747	/**
				2748	* htmlCheckEncoding:
				2749	* @ctxt: an HTML parser context
				2750	* @attvalue: the attribute value
				2751	*
				2752	* Checks an http-equiv attribute from a Meta tag to detect
				2753	* the encoding
				2754	* If a new encoding is detected the parser is switched to decode
				2755	* it and pass UTF8
				2756	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2757	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2758	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2759	const xmlChar *encoding;
				2760
				2761	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2762	return;
				2763
				2764	/* do not change encoding */
				2765	if (ctxt->input->encoding != NULL)
				2766	return;
				2767
				2768	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2769	if (encoding != NULL) {
				2770	encoding += 8;
				2771	} else {
				2772	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2773	if (encoding != NULL)
				2774	encoding += 9;
				2775	}
				2776	if (encoding != NULL) {
				2777	xmlCharEncoding enc;
				2778	xmlCharEncodingHandlerPtr handler;
				2779
				2780	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2781
				2782	if (ctxt->input->encoding != NULL)
				2783	xmlFree((xmlChar *) ctxt->input->encoding);
				2784	ctxt->input->encoding = xmlStrdup(encoding);
				2785
				2786	enc = xmlParseCharEncoding((const char *) encoding);
				2787	/*
				2788	* registered set of known encodings
				2789	*/
				2790	if (enc != XML_CHAR_ENCODING_ERROR) {
				2791	xmlSwitchEncoding(ctxt, enc);
				2792	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2793	} else {
				2794	/*
				2795	* fallback for unknown encodings
				2796	*/
				2797	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2798	if (handler != NULL) {
				2799	xmlSwitchToEncoding(ctxt, handler);
				2800	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2801	} else {
				2802	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2803	}
				2804	}
				2805
				2806	if ((ctxt->input->buf != NULL) &&
				2807	(ctxt->input->buf->encoder != NULL) &&
				2808	(ctxt->input->buf->raw != NULL) &&
				2809	(ctxt->input->buf->buffer != NULL)) {
				2810	int nbchars;
				2811	int processed;
				2812
				2813	/*
				2814	* convert as much as possible to the parser reading buffer.
				2815	*/
				2816	processed = ctxt->input->cur - ctxt->input->base;
				2817	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2818	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2819	ctxt->input->buf->buffer,
				2820	ctxt->input->buf->raw);
				2821	if (nbchars < 0) {
				2822	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2823	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2824	ctxt->sax->error(ctxt->userData,
				2825	"htmlCheckEncoding: encoder error\n");
				2826	}
				2827	ctxt->input->base =
				2828	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2829	}
				2830	}
				2831	}
				2832
				2833	/**
				2834	* htmlCheckMeta:
				2835	* @ctxt: an HTML parser context
				2836	* @atts: the attributes values
				2837	*
				2838	* Checks an attributes from a Meta tag
				2839	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2840	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2841	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2842	int i;
				2843	const xmlChar att, value;
				2844	int http = 0;
				2845	const xmlChar *content = NULL;
				2846
				2847	if ((ctxt == NULL) \|\| (atts == NULL))
				2848	return;
				2849
				2850	i = 0;
				2851	att = atts[i++];
				2852	while (att != NULL) {
				2853	value = atts[i++];
				2854	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2855	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2856	http = 1;
				2857	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2858	content = value;
				2859	att = atts[i++];
				2860	}
				2861	if ((http) && (content != NULL))
				2862	htmlCheckEncoding(ctxt, content);
				2863
				2864	}
				2865
				2866	/**
				2867	* htmlParseStartTag:
				2868	* @ctxt: an HTML parser context
				2869	*
				2870	* parse a start of tag either for rule element or
				2871	* EmptyElement. In both case we don't parse the tag closing chars.
				2872	*
				2873	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2874	*
				2875	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2876	*
				2877	* With namespace:
				2878	*
				2879	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2880	*
				2881	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2882	*
				2883	*/
				2884
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2885	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2886	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2887	xmlChar *name;
				2888	xmlChar *attname;
				2889	xmlChar *attvalue;
				2890	const xmlChar **atts = NULL;
				2891	int nbatts = 0;
				2892	int maxatts = 0;
				2893	int meta = 0;
				2894	int i;
				2895
				2896	if (CUR != '<') return;
				2897	NEXT;
				2898
				2899	GROW;
				2900	name = htmlParseHTMLName(ctxt);
				2901	if (name == NULL) {
				2902	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2903	ctxt->sax->error(ctxt->userData,
				2904	"htmlParseStartTag: invalid element name\n");
				2905	ctxt->wellFormed = 0;
				2906	/* Dump the bogus tag like browsers do */
				2907	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2908	NEXT;
				2909	return;
				2910	}
				2911	if (xmlStrEqual(name, BAD_CAST"meta"))
				2912	meta = 1;
				2913
				2914	/*
				2915	* Check for auto-closure of HTML elements.
				2916	*/
				2917	htmlAutoClose(ctxt, name);
				2918
				2919	/*
				2920	* Check for implied HTML elements.
				2921	*/
				2922	htmlCheckImplied(ctxt, name);
				2923
				2924	/*
				2925	* Avoid html at any level > 0, head at any level != 1
				2926	* or any attempt to recurse body
				2927	*/
				2928	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2929	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2930	ctxt->sax->error(ctxt->userData,
				2931	"htmlParseStartTag: misplaced <html> tag\n");
				2932	ctxt->wellFormed = 0;
				2933	xmlFree(name);
				2934	return;
				2935	}
				2936	if ((ctxt->nameNr != 1) &&
				2937	(xmlStrEqual(name, BAD_CAST"head"))) {
				2938	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2939	ctxt->sax->error(ctxt->userData,
				2940	"htmlParseStartTag: misplaced <head> tag\n");
				2941	ctxt->wellFormed = 0;
				2942	xmlFree(name);
				2943	return;
				2944	}
				2945	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2946	int indx;
				2947	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2948	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2949	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2950	ctxt->sax->error(ctxt->userData,
				2951	"htmlParseStartTag: misplaced <body> tag\n");
				2952	ctxt->wellFormed = 0;
				2953	xmlFree(name);
				2954	return;
				2955	}
				2956	}
				2957	}
				2958
				2959	/*
				2960	* Now parse the attributes, it ends up with the ending
				2961	*
				2962	* (S Attribute)* S?
				2963	*/
				2964	SKIP_BLANKS;
				2965	while ((IS_CHAR(CUR)) &&
				2966	(CUR != '>') &&
				2967	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2968	long cons = ctxt->nbChars;
				2969
				2970	GROW;
				2971	attname = htmlParseAttribute(ctxt, &attvalue);
				2972	if (attname != NULL) {
				2973
				2974	/*
				2975	* Well formedness requires at most one declaration of an attribute
				2976	*/
				2977	for (i = 0; i < nbatts;i += 2) {
				2978	if (xmlStrEqual(atts[i], attname)) {
				2979	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2980	ctxt->sax->error(ctxt->userData,
				2981	"Attribute %s redefined\n",
				2982	attname);
				2983	ctxt->wellFormed = 0;
				2984	xmlFree(attname);
				2985	if (attvalue != NULL)
				2986	xmlFree(attvalue);
				2987	goto failed;
				2988	}
				2989	}
				2990
				2991	/*
				2992	* Add the pair to atts
				2993	*/
				2994	if (atts == NULL) {
				2995	maxatts = 10;
				2996	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				2997	if (atts == NULL) {
				2998	xmlGenericError(xmlGenericErrorContext,
				2999	"malloc of %ld byte failed\n",
				3000	maxatts * (long)sizeof(xmlChar *));
				3001	if (name != NULL) xmlFree(name);
				3002	return;
				3003	}
				3004	} else if (nbatts + 4 > maxatts) {
				3005	maxatts *= 2;
				3006	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3007	maxatts * sizeof(xmlChar *));
				3008	if (atts == NULL) {
				3009	xmlGenericError(xmlGenericErrorContext,
				3010	"realloc of %ld byte failed\n",
				3011	maxatts * (long)sizeof(xmlChar *));
				3012	if (name != NULL) xmlFree(name);
				3013	return;
				3014	}
				3015	}
				3016	atts[nbatts++] = attname;
				3017	atts[nbatts++] = attvalue;
				3018	atts[nbatts] = NULL;
				3019	atts[nbatts + 1] = NULL;
				3020	}
				3021	else {
				3022	/* Dump the bogus attribute string up to the next blank or
				3023	* the end of the tag. */
				3024	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3025	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3026	NEXT;
				3027	}
				3028
				3029	failed:
				3030	SKIP_BLANKS;
				3031	if (cons == ctxt->nbChars) {
				3032	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3033	ctxt->sax->error(ctxt->userData,
				3034	"htmlParseStartTag: problem parsing attributes\n");
				3035	ctxt->wellFormed = 0;
				3036	break;
				3037	}
				3038	}
				3039
				3040	/*
				3041	* Handle specific association to the META tag
				3042	*/
				3043	if (meta)
				3044	htmlCheckMeta(ctxt, atts);
				3045
				3046	/*
				3047	* SAX: Start of Element !
				3048	*/
				3049	htmlnamePush(ctxt, xmlStrdup(name));
				3050	#ifdef DEBUG
				3051	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3052	#endif
				3053	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3054	ctxt->sax->startElement(ctxt->userData, name, atts);
				3055
				3056	if (atts != NULL) {
				3057	for (i = 0;i < nbatts;i++) {
				3058	if (atts[i] != NULL)
				3059	xmlFree((xmlChar *) atts[i]);
				3060	}
				3061	xmlFree((void *) atts);
				3062	}
				3063	if (name != NULL) xmlFree(name);
				3064	}
				3065
				3066	/**
				3067	* htmlParseEndTag:
				3068	* @ctxt: an HTML parser context
				3069	*
				3070	* parse an end of tag
				3071	*
				3072	* [42] ETag ::= '</' Name S? '>'
				3073	*
				3074	* With namespace
				3075	*
				3076	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3077	*
				3078	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3079	*/
				3080
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3081	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3082	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3083	xmlChar *name;
				3084	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3085	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3086
				3087	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3088	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3089	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3090	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3091	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3092	}
				3093	SKIP(2);
				3094
				3095	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3096	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3097
				3098	/*
				3099	* We should definitely be at the ending "S? '>'" part
				3100	*/
				3101	SKIP_BLANKS;
				3102	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3103	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3104	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3105	ctxt->wellFormed = 0;
				3106	} else
				3107	NEXT;
				3108
				3109	/*
				3110	* If the name read is not one of the element in the parsing stack
				3111	* then return, it's just an error.
				3112	*/
				3113	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3114	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3115	}
				3116	if (i < 0) {
				3117	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3118	ctxt->sax->error(ctxt->userData,
				3119	"Unexpected end tag : %s\n", name);
				3120	xmlFree(name);
				3121	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3122	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3123	}
				3124
				3125
				3126	/*
				3127	* Check for auto-closure of HTML elements.
				3128	*/
				3129
				3130	htmlAutoCloseOnClose(ctxt, name);
				3131
				3132	/*
				3133	* Well formedness constraints, opening and closing must match.
				3134	* With the exception that the autoclose may have popped stuff out
				3135	* of the stack.
				3136	*/
				3137	if (!xmlStrEqual(name, ctxt->name)) {
				3138	#ifdef DEBUG
				3139	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3140	#endif
				3141	if ((ctxt->name != NULL) &&
				3142	(!xmlStrEqual(ctxt->name, name))) {
				3143	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3144	ctxt->sax->error(ctxt->userData,
				3145	"Opening and ending tag mismatch: %s and %s\n",
				3146	name, ctxt->name);
				3147	ctxt->wellFormed = 0;
				3148	}
				3149	}
				3150
				3151	/*
				3152	* SAX: End of Tag
				3153	*/
				3154	oldname = ctxt->name;
				3155	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3156	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3157	ctxt->sax->endElement(ctxt->userData, name);
				3158	oldname = htmlnamePop(ctxt);
				3159	if (oldname != NULL) {
				3160	#ifdef DEBUG
				3161	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3162	#endif
				3163	xmlFree(oldname);
				3164	#ifdef DEBUG
				3165	} else {
				3166	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3167	#endif
				3168	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3169	ret = 1;
				3170	} else {
				3171	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3172	}
				3173
				3174	if (name != NULL)
				3175	xmlFree(name);
				3176
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3177	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3178	}
				3179
				3180
				3181	/**
				3182	* htmlParseReference:
				3183	* @ctxt: an HTML parser context
				3184	*
				3185	* parse and handle entity references in content,
				3186	* this will end-up in a call to character() since this is either a
				3187	* CharRef, or a predefined entity.
				3188	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3189	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3190	htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	3191	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3192	xmlChar out[6];
				3193	xmlChar *name;
				3194	if (CUR != '&') return;
				3195
				3196	if (NXT(1) == '#') {
				3197	unsigned int c;
				3198	int bits, i = 0;
				3199
				3200	c = htmlParseCharRef(ctxt);
				3201	if (c == 0)
				3202	return;
				3203
				3204	if (c < 0x80) { out[i++]= c; bits= -6; }
				3205	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3206	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3207	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3208
				3209	for ( ; bits >= 0; bits-= 6) {
				3210	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3211	}
				3212	out[i] = 0;
				3213
				3214	htmlCheckParagraph(ctxt);
				3215	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3216	ctxt->sax->characters(ctxt->userData, out, i);
				3217	} else {
				3218	ent = htmlParseEntityRef(ctxt, &name);
				3219	if (name == NULL) {
				3220	htmlCheckParagraph(ctxt);
				3221	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3222	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3223	return;
				3224	}
				3225	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3226	htmlCheckParagraph(ctxt);
				3227	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3228	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3229	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3230	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3231	}
				3232	} else {
				3233	unsigned int c;
				3234	int bits, i = 0;
				3235
				3236	c = ent->value;
				3237	if (c < 0x80)
				3238	{ out[i++]= c; bits= -6; }
				3239	else if (c < 0x800)
				3240	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3241	else if (c < 0x10000)
				3242	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3243	else
				3244	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3245
				3246	for ( ; bits >= 0; bits-= 6) {
				3247	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3248	}
				3249	out[i] = 0;
				3250
				3251	htmlCheckParagraph(ctxt);
				3252	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3253	ctxt->sax->characters(ctxt->userData, out, i);
				3254	}
				3255	xmlFree(name);
				3256	}
				3257	}
				3258
				3259	/**
				3260	* htmlParseContent:
				3261	* @ctxt: an HTML parser context
				3262	*
				3263	* Parse a content: comment, sub-element, reference or text.
				3264	*
					* On entry the name on top of the name stack (ctxt->name) and the
					* stack depth (ctxt->nameNr) are recorded. The function then loops,
					* dispatching on the next input characters, and returns when:
					*  - an end tag was handled by htmlParseEndTag() (non-zero result)
					*    and either a name was recorded on entry or the name stack is
					*    now empty;
					*  - the name stack is not deeper than on entry and its top no
					*    longer matches the recorded name;
					*  - the end of the input is reached (htmlAutoCloseOnEnd() is
					*    called first);
					*  - ctxt->nbChars did not change during one pass of the loop.
				3265	*
				3266	*/
				3267
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3268	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3269	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3270	xmlChar *currentNode;
				3271	int depth;
				3272
					/* Snapshot the top-of-stack name (private copy) and the stack depth. */
				3273	currentNode = xmlStrdup(ctxt->name);
				3274	depth = ctxt->nameNr;
				3275	while (1) {
				3276	long cons = ctxt->nbChars;
				3277
				3278	GROW;
				3279	/*
				3280	* Our tag or one of its parents or children is ending.
				3281	*/
				3282	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3283	if (htmlParseEndTag(ctxt) &&
				3284	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3285	if (currentNode != NULL)
				3286	xmlFree(currentNode);
				3287	return;
				3288	}
				3289	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3290	}
				3291
				3292	/*
				3293	* Has this node been popped out during parsing of
				3294	* the next element?
				3295	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3296	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3297	(!xmlStrEqual(currentNode, ctxt->name)))
				3298	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3299	if (currentNode != NULL) xmlFree(currentNode);
				3300	return;
				3301	}
				3302
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3303	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3304	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3305	/*
				3306	* Handle SCRIPT/STYLE separately
				3307	*/
				3308	htmlParseScript(ctxt);
				3309	} else {
				3310	/*
				3311	* Sometimes DOCTYPE arrives in the middle of the document:
					* report it as an error, parse it anyway, then fall through
					* to the regular dispatch below.
				3312	*/
				3313	if ((CUR == '<') && (NXT(1) == '!') &&
				3314	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3315	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3316	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3317	(UPP(8) == 'E')) {
				3318	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3319	ctxt->sax->error(ctxt->userData,
				3320	"Misplaced DOCTYPE declaration\n");
				3321	ctxt->wellFormed = 0;
				3322	htmlParseDocTypeDecl(ctxt);
				3323	}
				3324
				3325	/*
				3326	* First case : a comment
				3327	*/
				3328	if ((CUR == '<') && (NXT(1) == '!') &&
				3329	(NXT(2) == '-') && (NXT(3) == '-')) {
				3330	htmlParseComment(ctxt);
				3331	}
				3332
				3333	/*
				3334	* Second case : a sub-element.
				3335	*/
				3336	else if (CUR == '<') {
				3337	htmlParseElement(ctxt);
				3338	}
				3339
				3340	/*
				3341	* Third case : a reference. If it has not been resolved,
				3342	* parsing returns its Name, create the node
				3343	*/
				3344	else if (CUR == '&') {
				3345	htmlParseReference(ctxt);
				3346	}
				3347
				3348	/*
				3349	* Fourth : end of the resource
				3350	*/
				3351	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3352	htmlAutoCloseOnEnd(ctxt);
				3353	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3354	}
				3355
				3356	/*
				3357	* Last case, text. Note that References are handled directly.
				3358	*/
				3359	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3360	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3361	}
				3362
					/*
					* ctxt->nbChars is unchanged since the top of this pass
					* (presumably nothing was consumed): stop instead of spinning,
					* and flag an error when a node is currently open.
					*/
				3363	if (cons == ctxt->nbChars) {
				3364	if (ctxt->node != NULL) {
				3365	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3366	ctxt->sax->error(ctxt->userData,
				3367	"detected an error in element content\n");
				3368	ctxt->wellFormed = 0;
				3369	}
				3370	break;
				3371	}
				3372	}
				3373	GROW;
				3374	}
				3375	if (currentNode != NULL) xmlFree(currentNode);
				3376	}
				3377
				3378	/**
				3379	* htmlParseElement:
				3380	* @ctxt: an HTML parser context
				3381	*
				3382	* parse an HTML element, this is highly recursive
					* (content is parsed through htmlParseContent(), which calls back
					* into this function for sub-elements).
				3383	*
				3384	* [39] element ::= EmptyElemTag \| STag content ETag
				3385	*
				3386	* [41] Attribute ::= Name Eq AttValue
					*
					* The start tag is parsed first. The function returns right after it
					* when no new name ended up on the name stack, when the element is
					* closed the XML way ("/>"), when the closing '>' of the start tag is
					* missing (reported as an error), or when the element is declared
					* empty in the tag table. Otherwise the content is parsed until the
					* name stack becomes shallower than it was after the start tag or the
					* input runs out of valid characters.
				3387	*/
				3388
				3389	void
				3390	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3391	xmlChar *name;
				3392	xmlChar *currentNode = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	3393	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3394	htmlParserNodeInfo node_info;
				3395	xmlChar *oldname;
				3396	int depth = ctxt->nameNr;
				3397
				3398	/* Capture start position */
				3399	if (ctxt->record_info) {
				3400	node_info.begin_pos = ctxt->input->consumed +
				3401	(CUR_PTR - ctxt->input->base);
				3402	node_info.begin_line = ctxt->input->line;
				3403	}
				3404
					/*
					* Keep a copy of the previous top-of-stack name so that we can tell
					* below whether htmlParseStartTag() left a new name on the stack.
					*/
				3405	oldname = xmlStrdup(ctxt->name);
				3406	htmlParseStartTag(ctxt);
				3407	name = ctxt->name;
				3408	#ifdef DEBUG
				3409	if (oldname == NULL)
				3410	xmlGenericError(xmlGenericErrorContext,
				3411	"Start of element %s\n", name);
				3412	else if (name == NULL)
				3413	xmlGenericError(xmlGenericErrorContext,
				3414	"Start of element failed, was %s\n", oldname);
				3415	else
				3416	xmlGenericError(xmlGenericErrorContext,
				3417	"Start of element %s, was %s\n", name, oldname);
				3418	#endif
					/*
					* Same depth and same top name as before, or no name at all:
					* nothing to parse as an element, just skip a pending '>'.
					*/
				3419	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3420	(name == NULL)) {
				3421	if (CUR == '>')
				3422	NEXT;
				3423	if (oldname != NULL)
				3424	xmlFree(oldname);
				3425	return;
				3426	}
				3427	if (oldname != NULL)
				3428	xmlFree(oldname);
				3429
				3430	/*
				3431	* Lookup the info for that element.
				3432	*/
				3433	info = htmlTagLookup(name);
				3434	if (info == NULL) {
				3435	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3436	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3437	name);
				3438	ctxt->wellFormed = 0;
				3439	} else if (info->depr) {
				3440	/***************************
				3441	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3442	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3443	name);
				3444	***************************/
				3445	}
				3446
				3447	/*
				3448	* Check for an Empty Element labelled the XML/SGML way
				3449	*/
				3450	if ((CUR == '/') && (NXT(1) == '>')) {
				3451	SKIP(2);
				3452	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3453	ctxt->sax->endElement(ctxt->userData, name);
				3454	oldname = htmlnamePop(ctxt);
				3455	#ifdef DEBUG
				3456	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3457	#endif
				3458	if (oldname != NULL)
				3459	xmlFree(oldname);
				3460	return;
				3461	}
				3462
				3463	if (CUR == '>') {
				3464	NEXT;
				3465	} else {
				3466	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3467	ctxt->sax->error(ctxt->userData,
				3468	"Couldn't find end of Start Tag %s\n",
				3469	name);
				3470	ctxt->wellFormed = 0;
				3471
				3472	/*
				3473	* end of parsing of this node.
				3474	*/
				3475	if (xmlStrEqual(name, ctxt->name)) {
				3476	nodePop(ctxt);
				3477	oldname = htmlnamePop(ctxt);
				3478	#ifdef DEBUG
				3479	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3480	#endif
				3481	if (oldname != NULL)
				3482	xmlFree(oldname);
				3483	}
				3484
				3485	/*
				3486	* Capture end position and add node
					*
					* NOTE(review): currentNode is still NULL here (it is only
					* assigned further down, before the content loop), so this
					* branch can never run and no node info is recorded on this
					* error path. Confirm whether the guard should be dropped or
					* the branch removed.
				3487	*/
				3488	if ( currentNode != NULL && ctxt->record_info ) {
				3489	node_info.end_pos = ctxt->input->consumed +
				3490	(CUR_PTR - ctxt->input->base);
				3491	node_info.end_line = ctxt->input->line;
				3492	node_info.node = ctxt->node;
				3493	xmlParserAddNodeInfo(ctxt, &node_info);
				3494	}
				3495	return;
				3496	}
				3497
				3498	/*
				3499	* Check for an Empty Element from DTD definition
				3500	*/
				3501	if ((info != NULL) && (info->empty)) {
				3502	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3503	ctxt->sax->endElement(ctxt->userData, name);
				3504	oldname = htmlnamePop(ctxt);
				3505	#ifdef DEBUG
				3506	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3507	#endif
				3508	if (oldname != NULL)
				3509	xmlFree(oldname);
				3510	return;
				3511	}
				3512
				3513	/*
				3514	* Parse the content of the element:
					* keep going while the input holds valid characters, and stop as
					* soon as the name stack is shallower than right after the start
					* tag.
				3515	*/
				3516	currentNode = xmlStrdup(ctxt->name);
				3517	depth = ctxt->nameNr;
				3518	while (IS_CHAR(CUR)) {
				3519	htmlParseContent(ctxt);
				3520	if (ctxt->nameNr < depth) break;
				3521	}
				3522
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3523	/*
				3524	* Capture end position and add node
				3525	*/
				3526	if ( currentNode != NULL && ctxt->record_info ) {
				3527	node_info.end_pos = ctxt->input->consumed +
				3528	(CUR_PTR - ctxt->input->base);
				3529	node_info.end_line = ctxt->input->line;
				3530	node_info.node = ctxt->node;
				3531	xmlParserAddNodeInfo(ctxt, &node_info);
				3532	}
					/* Input exhausted (or invalid character): run the end-of-input autoclose. */
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3533	if (!IS_CHAR(CUR)) {
				3534	htmlAutoCloseOnEnd(ctxt);
				3535	}
				3536
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3537	if (currentNode != NULL)
				3538	xmlFree(currentNode);
				3539	}
				3540
				3541	/**
				3542	* htmlParseDocument :
				3543	* @ctxt: an HTML parser context
				3544	*
				3545	* parse an HTML document (and build a tree if using the standard SAX
				3546	* interface).
				3547	*
				3548	* Returns 0, -1 in case of error. the parser context is augmented
				3549	* as a result of the parsing.
				3550	*/
				3551
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3552	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3553	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3554	xmlDtdPtr dtd;
				3555
				3556	htmlDefaultSAXHandlerInit();
				3557	ctxt->html = 1;
				3558
				3559	GROW;
				3560	/*
				3561	* SAX: beginning of the document processing.
				3562	*/
				3563	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3564	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3565
				3566	/*
				3567	* Wipe out everything which is before the first '<'
				3568	*/
				3569	SKIP_BLANKS;
				3570	if (CUR == 0) {
				3571	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3572	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3573	ctxt->wellFormed = 0;
				3574	}
				3575
				3576	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3577	ctxt->sax->startDocument(ctxt->userData);
				3578
				3579
				3580	/*
				3581	* Parse possible comments before any content
				3582	*/
				3583	while ((CUR == '<') && (NXT(1) == '!') &&
				3584	(NXT(2) == '-') && (NXT(3) == '-')) {
				3585	htmlParseComment(ctxt);
				3586	SKIP_BLANKS;
				3587	}
				3588
				3589
				3590	/*
				3591	* Then possibly doc type declaration(s) and more Misc
				3592	* (doctypedecl Misc*)?
				3593	*/
				3594	if ((CUR == '<') && (NXT(1) == '!') &&
				3595	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3596	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3597	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3598	(UPP(8) == 'E')) {
				3599	htmlParseDocTypeDecl(ctxt);
				3600	}
				3601	SKIP_BLANKS;
				3602
				3603	/*
				3604	* Parse possible comments before any content
				3605	*/
				3606	while ((CUR == '<') && (NXT(1) == '!') &&
				3607	(NXT(2) == '-') && (NXT(3) == '-')) {
				3608	htmlParseComment(ctxt);
				3609	SKIP_BLANKS;
				3610	}
				3611
				3612	/*
				3613	* Time to start parsing the tree itself
				3614	*/
				3615	htmlParseContent(ctxt);
				3616
				3617	/*
				3618	* autoclose
				3619	*/
				3620	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3621	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3622
				3623
				3624	/*
				3625	* SAX: end of the document processing.
				3626	*/
				3627	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3628	ctxt->sax->endDocument(ctxt->userData);
				3629
				3630	if (ctxt->myDoc != NULL) {
				3631	dtd = xmlGetIntSubset(ctxt->myDoc);
				3632	if (dtd == NULL)
				3633	ctxt->myDoc->intSubset =
				3634	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3635	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3636	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3637	}
				3638	if (! ctxt->wellFormed) return(-1);
				3639	return(0);
				3640	}
				3641
				3642
				3643	/************************************************************************
				3644	* *
				3645	* Parser contexts handling *
				3646	* *
				3647	************************************************************************/
				3648
				3649	/**
				3650	* xmlInitParserCtxt:
				3651	* @ctxt: an HTML parser context
				3652	*
				3653	* Initialize a parser context
				3654	*/
				3655
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3656	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3657	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3658	{
				3659	htmlSAXHandler *sax;
				3660
				3661	if (ctxt == NULL) return;
				3662	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3663
				3664	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3665	if (sax == NULL) {
				3666	xmlGenericError(xmlGenericErrorContext,
				3667	"htmlInitParserCtxt: out of memory\n");
				3668	}
				3669	else
				3670	memset(sax, 0, sizeof(htmlSAXHandler));
				3671
				3672	/* Allocate the Input stack */
				3673	ctxt->inputTab = (htmlParserInputPtr *)
				3674	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3675	if (ctxt->inputTab == NULL) {
				3676	xmlGenericError(xmlGenericErrorContext,
				3677	"htmlInitParserCtxt: out of memory\n");
				3678	ctxt->inputNr = 0;
				3679	ctxt->inputMax = 0;
				3680	ctxt->input = NULL;
				3681	return;
				3682	}
				3683	ctxt->inputNr = 0;
				3684	ctxt->inputMax = 5;
				3685	ctxt->input = NULL;
				3686	ctxt->version = NULL;
				3687	ctxt->encoding = NULL;
				3688	ctxt->standalone = -1;
				3689	ctxt->instate = XML_PARSER_START;
				3690
				3691	/* Allocate the Node stack */
				3692	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3693	if (ctxt->nodeTab == NULL) {
				3694	xmlGenericError(xmlGenericErrorContext,
				3695	"htmlInitParserCtxt: out of memory\n");
				3696	ctxt->nodeNr = 0;
				3697	ctxt->nodeMax = 0;
				3698	ctxt->node = NULL;
				3699	ctxt->inputNr = 0;
				3700	ctxt->inputMax = 0;
				3701	ctxt->input = NULL;
				3702	return;
				3703	}
				3704	ctxt->nodeNr = 0;
				3705	ctxt->nodeMax = 10;
				3706	ctxt->node = NULL;
				3707
				3708	/* Allocate the Name stack */
				3709	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3710	if (ctxt->nameTab == NULL) {
				3711	xmlGenericError(xmlGenericErrorContext,
				3712	"htmlInitParserCtxt: out of memory\n");
				3713	ctxt->nameNr = 0;
				3714	ctxt->nameMax = 10;
				3715	ctxt->name = NULL;
				3716	ctxt->nodeNr = 0;
				3717	ctxt->nodeMax = 0;
				3718	ctxt->node = NULL;
				3719	ctxt->inputNr = 0;
				3720	ctxt->inputMax = 0;
				3721	ctxt->input = NULL;
				3722	return;
				3723	}
				3724	ctxt->nameNr = 0;
				3725	ctxt->nameMax = 10;
				3726	ctxt->name = NULL;
				3727
				3728	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3729	else {
				3730	ctxt->sax = sax;
				3731	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3732	}
				3733	ctxt->userData = ctxt;
				3734	ctxt->myDoc = NULL;
				3735	ctxt->wellFormed = 1;
				3736	ctxt->replaceEntities = 0;
				3737	ctxt->html = 1;
				3738	ctxt->record_info = 0;
				3739	ctxt->validate = 0;
				3740	ctxt->nbChars = 0;
				3741	ctxt->checkIndex = 0;
				3742	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3743	}
				3744
				3745	/**
				3746	* htmlFreeParserCtxt:
				3747	* @ctxt: an HTML parser context
				3748	*
				3749	* Free all the memory used by a parser context. However the parsed
				3750	* document in ctxt->myDoc is not freed.
					*
					* This is a thin wrapper: the work is delegated to xmlFreeParserCtxt().
				3751	*/
				3752
				3753	void
				3754	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3755	{
				3756	xmlFreeParserCtxt(ctxt);
				3757	}
				3758
				3759	/**
				3760	* htmlCreateDocParserCtxt :
				3761	* @cur: a pointer to an array of xmlChar
				3762	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3763	*
				3764	* Create a parser context for an HTML document.
				3765	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3766	* TODO: check the need to add encoding handling there
				3767	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3768	* Returns the new parser context or NULL
				3769	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3770	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3771	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3772	htmlParserCtxtPtr ctxt;
				3773	htmlParserInputPtr input;
				3774	/* htmlCharEncoding enc; */
				3775
				3776	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3777	if (ctxt == NULL) {
				3778	perror("malloc");
				3779	return(NULL);
				3780	}
				3781	htmlInitParserCtxt(ctxt);
				3782	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3783	if (input == NULL) {
				3784	perror("malloc");
				3785	xmlFree(ctxt);
				3786	return(NULL);
				3787	}
				3788	memset(input, 0, sizeof(htmlParserInput));
				3789
				3790	input->line = 1;
				3791	input->col = 1;
				3792	input->base = cur;
				3793	input->cur = cur;
				3794
				3795	inputPush(ctxt, input);
				3796	return(ctxt);
				3797	}
				3798
				3799	/************************************************************************
				3800	* *
				3801	* Progressive parsing interfaces *
				3802	* *
				3803	************************************************************************/
				3804
				3805	/**
				3806	* htmlParseLookupSequence:
				3807	* @ctxt: an HTML parser context
				3808	* @first: the first char to lookup
				3809	* @next: the next char to lookup or zero
				3810	* @third: the next char to lookup or zero
				3811	*
				3812	* Try to find if a sequence (first, next, third) or just (first next) or
				3813	* (first) is available in the input stream.
				3814	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3815	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3816	* parser, do not use liberally.
				3817	* This is basically similar to xmlParseLookupSequence()
				3818	*
				3819	* Returns the index to the current parsing point if the full sequence
				3820	* is available, -1 otherwise.
				3821	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3822	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3823	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3824	xmlChar next, xmlChar third) {
				3825	int base, len;
				3826	htmlParserInputPtr in;
				3827	const xmlChar *buf;
				3828
				3829	in = ctxt->input;
				3830	if (in == NULL) return(-1);
				3831	base = in->cur - in->base;
				3832	if (base < 0) return(-1);
				3833	if (ctxt->checkIndex > base)
				3834	base = ctxt->checkIndex;
				3835	if (in->buf == NULL) {
				3836	buf = in->base;
				3837	len = in->length;
				3838	} else {
				3839	buf = in->buf->buffer->content;
				3840	len = in->buf->buffer->use;
				3841	}
				3842	/* take into account the sequence length */
				3843	if (third) len -= 2;
				3844	else if (next) len --;
				3845	for (;base < len;base++) {
				3846	if (buf[base] == first) {
				3847	if (third != 0) {
				3848	if ((buf[base + 1] != next) \|\|
				3849	(buf[base + 2] != third)) continue;
				3850	} else if (next != 0) {
				3851	if (buf[base + 1] != next) continue;
				3852	}
				3853	ctxt->checkIndex = 0;
				3854	#ifdef DEBUG_PUSH
				3855	if (next == 0)
				3856	xmlGenericError(xmlGenericErrorContext,
				3857	"HPP: lookup '%c' found at %d\n",
				3858	first, base);
				3859	else if (third == 0)
				3860	xmlGenericError(xmlGenericErrorContext,
				3861	"HPP: lookup '%c%c' found at %d\n",
				3862	first, next, base);
				3863	else
				3864	xmlGenericError(xmlGenericErrorContext,
				3865	"HPP: lookup '%c%c%c' found at %d\n",
				3866	first, next, third, base);
				3867	#endif
				3868	return(base - (in->cur - in->base));
				3869	}
				3870	}
				3871	ctxt->checkIndex = base;
				3872	#ifdef DEBUG_PUSH
				3873	if (next == 0)
				3874	xmlGenericError(xmlGenericErrorContext,
				3875	"HPP: lookup '%c' failed\n", first);
				3876	else if (third == 0)
				3877	xmlGenericError(xmlGenericErrorContext,
				3878	"HPP: lookup '%c%c' failed\n", first, next);
				3879	else
				3880	xmlGenericError(xmlGenericErrorContext,
				3881	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3882	#endif
				3883	return(-1);
				3884	}
				3885
				3886	/**
				3887	* htmlParseTryOrFinish:
				3888	* @ctxt: an HTML parser context
				3889	* @terminate: last chunk indicator
				3890	*
				3891	* Try to progress on parsing
				3892	*
				3893	* Returns zero if no parsing was possible
				3894	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3895	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3896	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3897	int ret = 0;
				3898	htmlParserInputPtr in;
				3899	int avail = 0;
				3900	xmlChar cur, next;
				3901
				3902	#ifdef DEBUG_PUSH
				3903	switch (ctxt->instate) {
				3904	case XML_PARSER_EOF:
				3905	xmlGenericError(xmlGenericErrorContext,
				3906	"HPP: try EOF\n"); break;
				3907	case XML_PARSER_START:
				3908	xmlGenericError(xmlGenericErrorContext,
				3909	"HPP: try START\n"); break;
				3910	case XML_PARSER_MISC:
				3911	xmlGenericError(xmlGenericErrorContext,
				3912	"HPP: try MISC\n");break;
				3913	case XML_PARSER_COMMENT:
				3914	xmlGenericError(xmlGenericErrorContext,
				3915	"HPP: try COMMENT\n");break;
				3916	case XML_PARSER_PROLOG:
				3917	xmlGenericError(xmlGenericErrorContext,
				3918	"HPP: try PROLOG\n");break;
				3919	case XML_PARSER_START_TAG:
				3920	xmlGenericError(xmlGenericErrorContext,
				3921	"HPP: try START_TAG\n");break;
				3922	case XML_PARSER_CONTENT:
				3923	xmlGenericError(xmlGenericErrorContext,
				3924	"HPP: try CONTENT\n");break;
				3925	case XML_PARSER_CDATA_SECTION:
				3926	xmlGenericError(xmlGenericErrorContext,
				3927	"HPP: try CDATA_SECTION\n");break;
				3928	case XML_PARSER_END_TAG:
				3929	xmlGenericError(xmlGenericErrorContext,
				3930	"HPP: try END_TAG\n");break;
				3931	case XML_PARSER_ENTITY_DECL:
				3932	xmlGenericError(xmlGenericErrorContext,
				3933	"HPP: try ENTITY_DECL\n");break;
				3934	case XML_PARSER_ENTITY_VALUE:
				3935	xmlGenericError(xmlGenericErrorContext,
				3936	"HPP: try ENTITY_VALUE\n");break;
				3937	case XML_PARSER_ATTRIBUTE_VALUE:
				3938	xmlGenericError(xmlGenericErrorContext,
				3939	"HPP: try ATTRIBUTE_VALUE\n");break;
				3940	case XML_PARSER_DTD:
				3941	xmlGenericError(xmlGenericErrorContext,
				3942	"HPP: try DTD\n");break;
				3943	case XML_PARSER_EPILOG:
				3944	xmlGenericError(xmlGenericErrorContext,
				3945	"HPP: try EPILOG\n");break;
				3946	case XML_PARSER_PI:
				3947	xmlGenericError(xmlGenericErrorContext,
				3948	"HPP: try PI\n");break;
				3949	case XML_PARSER_SYSTEM_LITERAL:
				3950	xmlGenericError(xmlGenericErrorContext,
				3951	"HPP: try SYSTEM_LITERAL\n");break;
				3952	}
				3953	#endif
				3954
				3955	while (1) {
				3956
				3957	in = ctxt->input;
				3958	if (in == NULL) break;
				3959	if (in->buf == NULL)
				3960	avail = in->length - (in->cur - in->base);
				3961	else
				3962	avail = in->buf->buffer->use - (in->cur - in->base);
				3963	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3964	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3965	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3966	/*
				3967	* SAX: end of the document processing.
				3968	*/
				3969	ctxt->instate = XML_PARSER_EOF;
				3970	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3971	ctxt->sax->endDocument(ctxt->userData);
				3972	}
				3973	}
				3974	if (avail < 1)
				3975	goto done;
				3976	switch (ctxt->instate) {
				3977	case XML_PARSER_EOF:
				3978	/*
				3979	* Document parsing is done !
				3980	*/
				3981	goto done;
				3982	case XML_PARSER_START:
				3983	/*
				3984	* Very first chars read from the document flow.
				3985	*/
				3986	cur = in->cur[0];
				3987	if (IS_BLANK(cur)) {
				3988	SKIP_BLANKS;
				3989	if (in->buf == NULL)
				3990	avail = in->length - (in->cur - in->base);
				3991	else
				3992	avail = in->buf->buffer->use - (in->cur - in->base);
				3993	}
				3994	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3995	ctxt->sax->setDocumentLocator(ctxt->userData,
				3996	&xmlDefaultSAXLocator);
				3997	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				3998	(!ctxt->disableSAX))
				3999	ctxt->sax->startDocument(ctxt->userData);
				4000
				4001	cur = in->cur[0];
				4002	next = in->cur[1];
				4003	if ((cur == '<') && (next == '!') &&
				4004	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4005	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4006	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4007	(UPP(8) == 'E')) {
				4008	if ((!terminate) &&
				4009	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4010	goto done;
				4011	#ifdef DEBUG_PUSH
				4012	xmlGenericError(xmlGenericErrorContext,
				4013	"HPP: Parsing internal subset\n");
				4014	#endif
				4015	htmlParseDocTypeDecl(ctxt);
				4016	ctxt->instate = XML_PARSER_PROLOG;
				4017	#ifdef DEBUG_PUSH
				4018	xmlGenericError(xmlGenericErrorContext,
				4019	"HPP: entering PROLOG\n");
				4020	#endif
				4021	} else {
				4022	ctxt->instate = XML_PARSER_MISC;
				4023	}
				4024	#ifdef DEBUG_PUSH
				4025	xmlGenericError(xmlGenericErrorContext,
				4026	"HPP: entering MISC\n");
				4027	#endif
				4028	break;
				4029	case XML_PARSER_MISC:
				4030	SKIP_BLANKS;
				4031	if (in->buf == NULL)
				4032	avail = in->length - (in->cur - in->base);
				4033	else
				4034	avail = in->buf->buffer->use - (in->cur - in->base);
				4035	if (avail < 2)
				4036	goto done;
				4037	cur = in->cur[0];
				4038	next = in->cur[1];
				4039	if ((cur == '<') && (next == '!') &&
				4040	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4041	if ((!terminate) &&
				4042	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4043	goto done;
				4044	#ifdef DEBUG_PUSH
				4045	xmlGenericError(xmlGenericErrorContext,
				4046	"HPP: Parsing Comment\n");
				4047	#endif
				4048	htmlParseComment(ctxt);
				4049	ctxt->instate = XML_PARSER_MISC;
				4050	} else if ((cur == '<') && (next == '!') &&
				4051	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4052	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4053	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4054	(UPP(8) == 'E')) {
				4055	if ((!terminate) &&
				4056	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4057	goto done;
				4058	#ifdef DEBUG_PUSH
				4059	xmlGenericError(xmlGenericErrorContext,
				4060	"HPP: Parsing internal subset\n");
				4061	#endif
				4062	htmlParseDocTypeDecl(ctxt);
				4063	ctxt->instate = XML_PARSER_PROLOG;
				4064	#ifdef DEBUG_PUSH
				4065	xmlGenericError(xmlGenericErrorContext,
				4066	"HPP: entering PROLOG\n");
				4067	#endif
				4068	} else if ((cur == '<') && (next == '!') &&
				4069	(avail < 9)) {
				4070	goto done;
				4071	} else {
				4072	ctxt->instate = XML_PARSER_START_TAG;
				4073	#ifdef DEBUG_PUSH
				4074	xmlGenericError(xmlGenericErrorContext,
				4075	"HPP: entering START_TAG\n");
				4076	#endif
				4077	}
				4078	break;
				4079	case XML_PARSER_PROLOG:
				4080	SKIP_BLANKS;
				4081	if (in->buf == NULL)
				4082	avail = in->length - (in->cur - in->base);
				4083	else
				4084	avail = in->buf->buffer->use - (in->cur - in->base);
				4085	if (avail < 2)
				4086	goto done;
				4087	cur = in->cur[0];
				4088	next = in->cur[1];
				4089	if ((cur == '<') && (next == '!') &&
				4090	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4091	if ((!terminate) &&
				4092	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4093	goto done;
				4094	#ifdef DEBUG_PUSH
				4095	xmlGenericError(xmlGenericErrorContext,
				4096	"HPP: Parsing Comment\n");
				4097	#endif
				4098	htmlParseComment(ctxt);
				4099	ctxt->instate = XML_PARSER_PROLOG;
				4100	} else if ((cur == '<') && (next == '!') &&
				4101	(avail < 4)) {
				4102	goto done;
				4103	} else {
				4104	ctxt->instate = XML_PARSER_START_TAG;
				4105	#ifdef DEBUG_PUSH
				4106	xmlGenericError(xmlGenericErrorContext,
				4107	"HPP: entering START_TAG\n");
				4108	#endif
				4109	}
				4110	break;
				4111	case XML_PARSER_EPILOG:
				4112	if (in->buf == NULL)
				4113	avail = in->length - (in->cur - in->base);
				4114	else
				4115	avail = in->buf->buffer->use - (in->cur - in->base);
				4116	if (avail < 1)
				4117	goto done;
				4118	cur = in->cur[0];
				4119	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4120	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4121	goto done;
				4122	}
				4123	if (avail < 2)
				4124	goto done;
				4125	next = in->cur[1];
				4126	if ((cur == '<') && (next == '!') &&
				4127	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4128	if ((!terminate) &&
				4129	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4130	goto done;
				4131	#ifdef DEBUG_PUSH
				4132	xmlGenericError(xmlGenericErrorContext,
				4133	"HPP: Parsing Comment\n");
				4134	#endif
				4135	htmlParseComment(ctxt);
				4136	ctxt->instate = XML_PARSER_EPILOG;
				4137	} else if ((cur == '<') && (next == '!') &&
				4138	(avail < 4)) {
				4139	goto done;
				4140	} else {
				4141	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4142	ctxt->wellFormed = 0;
				4143	ctxt->instate = XML_PARSER_EOF;
				4144	#ifdef DEBUG_PUSH
				4145	xmlGenericError(xmlGenericErrorContext,
				4146	"HPP: entering EOF\n");
				4147	#endif
				4148	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4149	ctxt->sax->endDocument(ctxt->userData);
				4150	goto done;
				4151	}
				4152	break;
				4153	case XML_PARSER_START_TAG: {
				4154	xmlChar name, oldname;
				4155	int depth = ctxt->nameNr;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame^]	4156	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4157
				4158	if (avail < 2)
				4159	goto done;
				4160	cur = in->cur[0];
				4161	if (cur != '<') {
				4162	ctxt->instate = XML_PARSER_CONTENT;
				4163	#ifdef DEBUG_PUSH
				4164	xmlGenericError(xmlGenericErrorContext,
				4165	"HPP: entering CONTENT\n");
				4166	#endif
				4167	break;
				4168	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4169	if (in->cur[1] == '/') {
				4170	ctxt->instate = XML_PARSER_END_TAG;
				4171	ctxt->checkIndex = 0;
				4172	#ifdef DEBUG_PUSH
				4173	xmlGenericError(xmlGenericErrorContext,
				4174	"HPP: entering END_TAG\n");
				4175	#endif
				4176	break;
				4177	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4178	if ((!terminate) &&
				4179	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4180	goto done;
				4181
				4182	oldname = xmlStrdup(ctxt->name);
				4183	htmlParseStartTag(ctxt);
				4184	name = ctxt->name;
				4185	#ifdef DEBUG
				4186	if (oldname == NULL)
				4187	xmlGenericError(xmlGenericErrorContext,
				4188	"Start of element %s\n", name);
				4189	else if (name == NULL)
				4190	xmlGenericError(xmlGenericErrorContext,
				4191	"Start of element failed, was %s\n",
				4192	oldname);
				4193	else
				4194	xmlGenericError(xmlGenericErrorContext,
				4195	"Start of element %s, was %s\n",
				4196	name, oldname);
				4197	#endif
				4198	if (((depth == ctxt->nameNr) &&
				4199	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4200	(name == NULL)) {
				4201	if (CUR == '>')
				4202	NEXT;
				4203	if (oldname != NULL)
				4204	xmlFree(oldname);
				4205	break;
				4206	}
				4207	if (oldname != NULL)
				4208	xmlFree(oldname);
				4209
				4210	/*
				4211	* Lookup the info for that element.
				4212	*/
				4213	info = htmlTagLookup(name);
				4214	if (info == NULL) {
				4215	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4216	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4217	name);
				4218	ctxt->wellFormed = 0;
				4219	} else if (info->depr) {
				4220	/***************************
				4221	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4222	ctxt->sax->warning(ctxt->userData,
				4223	"Tag %s is deprecated\n",
				4224	name);
				4225	***************************/
				4226	}
				4227
				4228	/*
				4229	* Check for an Empty Element labelled the XML/SGML way
				4230	*/
				4231	if ((CUR == '/') && (NXT(1) == '>')) {
				4232	SKIP(2);
				4233	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4234	ctxt->sax->endElement(ctxt->userData, name);
				4235	oldname = htmlnamePop(ctxt);
				4236	#ifdef DEBUG
				4237	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4238	oldname);
				4239	#endif
				4240	if (oldname != NULL)
				4241	xmlFree(oldname);
				4242	ctxt->instate = XML_PARSER_CONTENT;
				4243	#ifdef DEBUG_PUSH
				4244	xmlGenericError(xmlGenericErrorContext,
				4245	"HPP: entering CONTENT\n");
				4246	#endif
				4247	break;
				4248	}
				4249
				4250	if (CUR == '>') {
				4251	NEXT;
				4252	} else {
				4253	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4254	ctxt->sax->error(ctxt->userData,
				4255	"Couldn't find end of Start Tag %s\n",
				4256	name);
				4257	ctxt->wellFormed = 0;
				4258
				4259	/*
				4260	* end of parsing of this node.
				4261	*/
				4262	if (xmlStrEqual(name, ctxt->name)) {
				4263	nodePop(ctxt);
				4264	oldname = htmlnamePop(ctxt);
				4265	#ifdef DEBUG
				4266	xmlGenericError(xmlGenericErrorContext,
				4267	"End of start tag problem: popping out %s\n", oldname);
				4268	#endif
				4269	if (oldname != NULL)
				4270	xmlFree(oldname);
				4271	}
				4272
				4273	ctxt->instate = XML_PARSER_CONTENT;
				4274	#ifdef DEBUG_PUSH
				4275	xmlGenericError(xmlGenericErrorContext,
				4276	"HPP: entering CONTENT\n");
				4277	#endif
				4278	break;
				4279	}
				4280
				4281	/*
				4282	* Check for an Empty Element from DTD definition
				4283	*/
				4284	if ((info != NULL) && (info->empty)) {
				4285	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4286	ctxt->sax->endElement(ctxt->userData, name);
				4287	oldname = htmlnamePop(ctxt);
				4288	#ifdef DEBUG
				4289	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4290	#endif
				4291	if (oldname != NULL)
				4292	xmlFree(oldname);
				4293	}
				4294	ctxt->instate = XML_PARSER_CONTENT;
				4295	#ifdef DEBUG_PUSH
				4296	xmlGenericError(xmlGenericErrorContext,
				4297	"HPP: entering CONTENT\n");
				4298	#endif
				4299	break;
				4300	}
				4301	case XML_PARSER_CONTENT: {
				4302	long cons;
				4303	/*
				4304	* Handle preparsed entities and charRef
				4305	*/
				4306	if (ctxt->token != 0) {
				4307	xmlChar chr[2] = { 0 , 0 } ;
				4308
				4309	chr[0] = (xmlChar) ctxt->token;
				4310	htmlCheckParagraph(ctxt);
				4311	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4312	ctxt->sax->characters(ctxt->userData, chr, 1);
				4313	ctxt->token = 0;
				4314	ctxt->checkIndex = 0;
				4315	}
				4316	if ((avail == 1) && (terminate)) {
				4317	cur = in->cur[0];
				4318	if ((cur != '<') && (cur != '&')) {
				4319	if (ctxt->sax != NULL) {
				4320	if (IS_BLANK(cur)) {
				4321	if (ctxt->sax->ignorableWhitespace != NULL)
				4322	ctxt->sax->ignorableWhitespace(
				4323	ctxt->userData, &cur, 1);
				4324	} else {
				4325	htmlCheckParagraph(ctxt);
				4326	if (ctxt->sax->characters != NULL)
				4327	ctxt->sax->characters(
				4328	ctxt->userData, &cur, 1);
				4329	}
				4330	}
				4331	ctxt->token = 0;
				4332	ctxt->checkIndex = 0;
				4333	NEXT;
				4334	}
				4335	break;
				4336	}
				4337	if (avail < 2)
				4338	goto done;
				4339	cur = in->cur[0];
				4340	next = in->cur[1];
				4341	cons = ctxt->nbChars;
				4342	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4343	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4344	/*
				4345	* Handle SCRIPT/STYLE separately
				4346	*/
				4347	if ((!terminate) &&
				4348	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4349	goto done;
				4350	htmlParseScript(ctxt);
				4351	if ((cur == '<') && (next == '/')) {
				4352	ctxt->instate = XML_PARSER_END_TAG;
				4353	ctxt->checkIndex = 0;
				4354	#ifdef DEBUG_PUSH
				4355	xmlGenericError(xmlGenericErrorContext,
				4356	"HPP: entering END_TAG\n");
				4357	#endif
				4358	break;
				4359	}
				4360	} else {
				4361	/*
				4362	* Sometimes DOCTYPE arrives in the middle of the document
				4363	*/
				4364	if ((cur == '<') && (next == '!') &&
				4365	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4366	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4367	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4368	(UPP(8) == 'E')) {
				4369	if ((!terminate) &&
				4370	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4371	goto done;
				4372	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4373	ctxt->sax->error(ctxt->userData,
				4374	"Misplaced DOCTYPE declaration\n");
				4375	ctxt->wellFormed = 0;
				4376	htmlParseDocTypeDecl(ctxt);
				4377	} else if ((cur == '<') && (next == '!') &&
				4378	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4379	if ((!terminate) &&
				4380	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4381	goto done;
				4382	#ifdef DEBUG_PUSH
				4383	xmlGenericError(xmlGenericErrorContext,
				4384	"HPP: Parsing Comment\n");
				4385	#endif
				4386	htmlParseComment(ctxt);
				4387	ctxt->instate = XML_PARSER_CONTENT;
				4388	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4389	goto done;
				4390	} else if ((cur == '<') && (next == '/')) {
				4391	ctxt->instate = XML_PARSER_END_TAG;
				4392	ctxt->checkIndex = 0;
				4393	#ifdef DEBUG_PUSH
				4394	xmlGenericError(xmlGenericErrorContext,
				4395	"HPP: entering END_TAG\n");
				4396	#endif
				4397	break;
				4398	} else if (cur == '<') {
				4399	ctxt->instate = XML_PARSER_START_TAG;
				4400	ctxt->checkIndex = 0;
				4401	#ifdef DEBUG_PUSH
				4402	xmlGenericError(xmlGenericErrorContext,
				4403	"HPP: entering START_TAG\n");
				4404	#endif
				4405	break;
				4406	} else if (cur == '&') {
				4407	if ((!terminate) &&
				4408	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4409	goto done;
				4410	#ifdef DEBUG_PUSH
				4411	xmlGenericError(xmlGenericErrorContext,
				4412	"HPP: Parsing Reference\n");
				4413	#endif
				4414	/* TODO: check generation of subtrees if noent !!! */
				4415	htmlParseReference(ctxt);
				4416	} else {
				4417	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4418	/*
				4419	* Goal of the following test is :
				4420	* - minimize calls to the SAX 'character' callback
				4421	* when they are mergeable
				4422	*/
				4423	if ((ctxt->inputNr == 1) &&
				4424	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4425	if ((!terminate) &&
				4426	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4427	goto done;
				4428	}
				4429	ctxt->checkIndex = 0;
				4430	#ifdef DEBUG_PUSH
				4431	xmlGenericError(xmlGenericErrorContext,
				4432	"HPP: Parsing char data\n");
				4433	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4434	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4435	}
				4436	}
				4437	if (cons == ctxt->nbChars) {
				4438	if (ctxt->node != NULL) {
				4439	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4440	ctxt->sax->error(ctxt->userData,
				4441	"detected an error in element content\n");
				4442	ctxt->wellFormed = 0;
				4443	}
				4444	NEXT;
				4445	break;
				4446	}
				4447
				4448	break;
				4449	}
				4450	case XML_PARSER_END_TAG:
				4451	if (avail < 2)
				4452	goto done;
				4453	if ((!terminate) &&
				4454	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4455	goto done;
				4456	htmlParseEndTag(ctxt);
				4457	if (ctxt->nameNr == 0) {
				4458	ctxt->instate = XML_PARSER_EPILOG;
				4459	} else {
				4460	ctxt->instate = XML_PARSER_CONTENT;
				4461	}
				4462	ctxt->checkIndex = 0;
				4463	#ifdef DEBUG_PUSH
				4464	xmlGenericError(xmlGenericErrorContext,
				4465	"HPP: entering CONTENT\n");
				4466	#endif
				4467	break;
				4468	case XML_PARSER_CDATA_SECTION:
				4469	xmlGenericError(xmlGenericErrorContext,
				4470	"HPP: internal error, state == CDATA\n");
				4471	ctxt->instate = XML_PARSER_CONTENT;
				4472	ctxt->checkIndex = 0;
				4473	#ifdef DEBUG_PUSH
				4474	xmlGenericError(xmlGenericErrorContext,
				4475	"HPP: entering CONTENT\n");
				4476	#endif
				4477	break;
				4478	case XML_PARSER_DTD:
				4479	xmlGenericError(xmlGenericErrorContext,
				4480	"HPP: internal error, state == DTD\n");
				4481	ctxt->instate = XML_PARSER_CONTENT;
				4482	ctxt->checkIndex = 0;
				4483	#ifdef DEBUG_PUSH
				4484	xmlGenericError(xmlGenericErrorContext,
				4485	"HPP: entering CONTENT\n");
				4486	#endif
				4487	break;
				4488	case XML_PARSER_COMMENT:
				4489	xmlGenericError(xmlGenericErrorContext,
				4490	"HPP: internal error, state == COMMENT\n");
				4491	ctxt->instate = XML_PARSER_CONTENT;
				4492	ctxt->checkIndex = 0;
				4493	#ifdef DEBUG_PUSH
				4494	xmlGenericError(xmlGenericErrorContext,
				4495	"HPP: entering CONTENT\n");
				4496	#endif
				4497	break;
				4498	case XML_PARSER_PI:
				4499	xmlGenericError(xmlGenericErrorContext,
				4500	"HPP: internal error, state == PI\n");
				4501	ctxt->instate = XML_PARSER_CONTENT;
				4502	ctxt->checkIndex = 0;
				4503	#ifdef DEBUG_PUSH
				4504	xmlGenericError(xmlGenericErrorContext,
				4505	"HPP: entering CONTENT\n");
				4506	#endif
				4507	break;
				4508	case XML_PARSER_ENTITY_DECL:
				4509	xmlGenericError(xmlGenericErrorContext,
				4510	"HPP: internal error, state == ENTITY_DECL\n");
				4511	ctxt->instate = XML_PARSER_CONTENT;
				4512	ctxt->checkIndex = 0;
				4513	#ifdef DEBUG_PUSH
				4514	xmlGenericError(xmlGenericErrorContext,
				4515	"HPP: entering CONTENT\n");
				4516	#endif
				4517	break;
				4518	case XML_PARSER_ENTITY_VALUE:
				4519	xmlGenericError(xmlGenericErrorContext,
				4520	"HPP: internal error, state == ENTITY_VALUE\n");
				4521	ctxt->instate = XML_PARSER_CONTENT;
				4522	ctxt->checkIndex = 0;
				4523	#ifdef DEBUG_PUSH
				4524	xmlGenericError(xmlGenericErrorContext,
				4525	"HPP: entering DTD\n");
				4526	#endif
				4527	break;
				4528	case XML_PARSER_ATTRIBUTE_VALUE:
				4529	xmlGenericError(xmlGenericErrorContext,
				4530	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4531	ctxt->instate = XML_PARSER_START_TAG;
				4532	ctxt->checkIndex = 0;
				4533	#ifdef DEBUG_PUSH
				4534	xmlGenericError(xmlGenericErrorContext,
				4535	"HPP: entering START_TAG\n");
				4536	#endif
				4537	break;
				4538	case XML_PARSER_SYSTEM_LITERAL:
				4539	xmlGenericError(xmlGenericErrorContext,
				4540	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4541	ctxt->instate = XML_PARSER_CONTENT;
				4542	ctxt->checkIndex = 0;
				4543	#ifdef DEBUG_PUSH
				4544	xmlGenericError(xmlGenericErrorContext,
				4545	"HPP: entering CONTENT\n");
				4546	#endif
				4547	break;
				4548	case XML_PARSER_IGNORE:
				4549	xmlGenericError(xmlGenericErrorContext,
				4550	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4551	ctxt->instate = XML_PARSER_CONTENT;
				4552	ctxt->checkIndex = 0;
				4553	#ifdef DEBUG_PUSH
				4554	xmlGenericError(xmlGenericErrorContext,
				4555	"HPP: entering CONTENT\n");
				4556	#endif
				4557	break;
				4558	}
				4559	}
				4560	done:
				4561	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4562	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4563	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4564	/*
				4565	* SAX: end of the document processing.
				4566	*/
				4567	ctxt->instate = XML_PARSER_EOF;
				4568	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4569	ctxt->sax->endDocument(ctxt->userData);
				4570	}
				4571	}
				4572	if ((ctxt->myDoc != NULL) &&
				4573	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4574	(ctxt->instate == XML_PARSER_EPILOG))) {
				4575	xmlDtdPtr dtd;
				4576	dtd = xmlGetIntSubset(ctxt->myDoc);
				4577	if (dtd == NULL)
				4578	ctxt->myDoc->intSubset =
				4579	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4580	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4581	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4582	}
				4583	#ifdef DEBUG_PUSH
				4584	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4585	#endif
				4586	return(ret);
				4587	}
				4588
				4589	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4590	* htmlParseChunk:
				4591	* @ctxt: an XML parser context
				4592	* @chunk: an char array
				4593	* @size: the size in byte of the chunk
				4594	* @terminate: last chunk indicator
				4595	*
				4596	* Parse a Chunk of memory
				4597	*
				4598	* Returns zero if no error, the xmlParserErrors otherwise.
				4599	*/
				4600	int
				4601	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4602	int terminate) {
				4603	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4604	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4605	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4606	int cur = ctxt->input->cur - ctxt->input->base;
				4607
				4608	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4609	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4610	ctxt->input->cur = ctxt->input->base + cur;
				4611	#ifdef DEBUG_PUSH
				4612	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4613	#endif
				4614
				4615	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4616	htmlParseTryOrFinish(ctxt, terminate);
				4617	} else if (ctxt->instate != XML_PARSER_EOF) {
				4618	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4619	htmlParseTryOrFinish(ctxt, terminate);
				4620	}
				4621	if (terminate) {
				4622	if ((ctxt->instate != XML_PARSER_EOF) &&
				4623	(ctxt->instate != XML_PARSER_EPILOG) &&
				4624	(ctxt->instate != XML_PARSER_MISC)) {
				4625	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4626	ctxt->wellFormed = 0;
				4627	}
				4628	if (ctxt->instate != XML_PARSER_EOF) {
				4629	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4630	ctxt->sax->endDocument(ctxt->userData);
				4631	}
				4632	ctxt->instate = XML_PARSER_EOF;
				4633	}
				4634	return((xmlParserErrors) ctxt->errNo);
				4635	}
				4636
				4637	/************************************************************************
				4638	* *
				4639	* User entry points *
				4640	* *
				4641	************************************************************************/
				4642
				4643	/**
				4644	* htmlCreatePushParserCtxt :
				4645	* @sax: a SAX handler
				4646	* @user_data: The user data returned on SAX callbacks
				4647	* @chunk: a pointer to an array of chars
				4648	* @size: number of chars in the array
				4649	* @filename: an optional file name or URI
				4650	* @enc: an optional encoding
				4651	*
				4652	* Create a parser context for using the HTML parser in push mode
				4653	* To allow content encoding detection, @size should be >= 4
				4654	* The value of @filename is used for fetching external entities
				4655	* and error/warning reports.
				4656	*
				4657	* Returns the new parser context or NULL
				4658	*/
				4659	htmlParserCtxtPtr
				4660	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4661	const char chunk, int size, const char filename,
				4662	xmlCharEncoding enc) {
				4663	htmlParserCtxtPtr ctxt;
				4664	htmlParserInputPtr inputStream;
				4665	xmlParserInputBufferPtr buf;
				4666
				4667	buf = xmlAllocParserInputBuffer(enc);
				4668	if (buf == NULL) return(NULL);
				4669
				4670	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4671	if (ctxt == NULL) {
				4672	xmlFree(buf);
				4673	return(NULL);
				4674	}
				4675	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4676	htmlInitParserCtxt(ctxt);
				4677	if (sax != NULL) {
				4678	if (ctxt->sax != &htmlDefaultSAXHandler)
				4679	xmlFree(ctxt->sax);
				4680	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4681	if (ctxt->sax == NULL) {
				4682	xmlFree(buf);
				4683	xmlFree(ctxt);
				4684	return(NULL);
				4685	}
				4686	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4687	if (user_data != NULL)
				4688	ctxt->userData = user_data;
				4689	}
				4690	if (filename == NULL) {
				4691	ctxt->directory = NULL;
				4692	} else {
				4693	ctxt->directory = xmlParserGetDirectory(filename);
				4694	}
				4695
				4696	inputStream = htmlNewInputStream(ctxt);
				4697	if (inputStream == NULL) {
				4698	xmlFreeParserCtxt(ctxt);
				4699	return(NULL);
				4700	}
				4701
				4702	if (filename == NULL)
				4703	inputStream->filename = NULL;
				4704	else
				4705	inputStream->filename = xmlMemStrdup(filename);
				4706	inputStream->buf = buf;
				4707	inputStream->base = inputStream->buf->buffer->content;
				4708	inputStream->cur = inputStream->buf->buffer->content;
				4709
				4710	inputPush(ctxt, inputStream);
				4711
				4712	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4713	(ctxt->input->buf != NULL)) {
				4714	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4715	#ifdef DEBUG_PUSH
				4716	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4717	#endif
				4718	}
				4719
				4720	return(ctxt);
				4721	}
				4722
				4723	/**
				4724	* htmlSAXParseDoc :
				4725	* @cur: a pointer to an array of xmlChar
				4726	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4727	* @sax: the SAX handler block
				4728	* @userData: if using SAX, this pointer will be provided on callbacks.
				4729	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4730	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				4731	* to handle parse events. If sax is NULL, fallback to the default DOM
				4732	* behavior and return a tree.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4733	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4734	* Returns the resulting document tree unless SAX is NULL or the document is
				4735	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4736	*/
				4737
				4738	htmlDocPtr
				4739	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4740	htmlDocPtr ret;
				4741	htmlParserCtxtPtr ctxt;
				4742
				4743	if (cur == NULL) return(NULL);
				4744
				4745
				4746	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4747	if (ctxt == NULL) return(NULL);
				4748	if (sax != NULL) {
				4749	ctxt->sax = sax;
				4750	ctxt->userData = userData;
				4751	}
				4752
				4753	htmlParseDocument(ctxt);
				4754	ret = ctxt->myDoc;
				4755	if (sax != NULL) {
				4756	ctxt->sax = NULL;
				4757	ctxt->userData = NULL;
				4758	}
				4759	htmlFreeParserCtxt(ctxt);
				4760
				4761	return(ret);
				4762	}
				4763
				4764	/**
				4765	* htmlParseDoc :
				4766	* @cur: a pointer to an array of xmlChar
				4767	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4768	*
				4769	* parse an HTML in-memory document and build a tree.
				4770	*
				4771	* Returns the resulting document tree
				4772	*/
				4773
				4774	htmlDocPtr
				4775	htmlParseDoc(xmlChar cur, const char encoding) {
				4776	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4777	}
				4778
				4779
				4780	/**
				4781	* htmlCreateFileParserCtxt :
				4782	* @filename: the filename
				4783	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4784	*
				4785	* Create a parser context for a file content.
				4786	* Automatic support for ZLIB/Compress compressed document is provided
				4787	* by default if found at compile-time.
				4788	*
				4789	* Returns the new parser context or NULL
				4790	*/
				4791	htmlParserCtxtPtr
				4792	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4793	{
				4794	htmlParserCtxtPtr ctxt;
				4795	htmlParserInputPtr inputStream;
				4796	xmlParserInputBufferPtr buf;
				4797	/* htmlCharEncoding enc; */
				4798	xmlChar content, content_line = (xmlChar *) "charset=";
				4799
				4800	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4801	if (buf == NULL) return(NULL);
				4802
				4803	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4804	if (ctxt == NULL) {
				4805	perror("malloc");
				4806	return(NULL);
				4807	}
				4808	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4809	htmlInitParserCtxt(ctxt);
				4810	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4811	if (inputStream == NULL) {
				4812	perror("malloc");
				4813	xmlFree(ctxt);
				4814	return(NULL);
				4815	}
				4816	memset(inputStream, 0, sizeof(htmlParserInput));
				4817
				4818	inputStream->filename = xmlMemStrdup(filename);
				4819	inputStream->line = 1;
				4820	inputStream->col = 1;
				4821	inputStream->buf = buf;
				4822	inputStream->directory = NULL;
				4823
				4824	inputStream->base = inputStream->buf->buffer->content;
				4825	inputStream->cur = inputStream->buf->buffer->content;
				4826	inputStream->free = NULL;
				4827
				4828	inputPush(ctxt, inputStream);
				4829
				4830	/* set encoding */
				4831	if (encoding) {
				4832	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4833	if (content) {
				4834	strcpy ((char )content, (char )content_line);
				4835	strcat ((char )content, (char )encoding);
				4836	htmlCheckEncoding (ctxt, content);
				4837	xmlFree (content);
				4838	}
				4839	}
				4840
				4841	return(ctxt);
				4842	}
				4843
				4844	/**
				4845	* htmlSAXParseFile :
				4846	* @filename: the filename
				4847	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4848	* @sax: the SAX handler block
				4849	* @userData: if using SAX, this pointer will be provided on callbacks.
				4850	*
				4851	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4852	* compressed document is provided by default if found at compile-time.
				4853	* It use the given SAX function block to handle the parsing callback.
				4854	* If sax is NULL, fallback to the default DOM tree building routines.
				4855	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4856	* Returns the resulting document tree unless SAX is NULL or the document is
				4857	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4858	*/
				4859
				4860	htmlDocPtr
				4861	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4862	void *userData) {
				4863	htmlDocPtr ret;
				4864	htmlParserCtxtPtr ctxt;
				4865	htmlSAXHandlerPtr oldsax = NULL;
				4866
				4867	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4868	if (ctxt == NULL) return(NULL);
				4869	if (sax != NULL) {
				4870	oldsax = ctxt->sax;
				4871	ctxt->sax = sax;
				4872	ctxt->userData = userData;
				4873	}
				4874
				4875	htmlParseDocument(ctxt);
				4876
				4877	ret = ctxt->myDoc;
				4878	if (sax != NULL) {
				4879	ctxt->sax = oldsax;
				4880	ctxt->userData = NULL;
				4881	}
				4882	htmlFreeParserCtxt(ctxt);
				4883
				4884	return(ret);
				4885	}
				4886
				4887	/**
				4888	* htmlParseFile :
				4889	* @filename: the filename
				4890	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4891	*
				4892	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4893	* compressed document is provided by default if found at compile-time.
				4894	*
				4895	* Returns the resulting document tree
				4896	*/
				4897
				4898	htmlDocPtr
				4899	htmlParseFile(const char filename, const char encoding) {
				4900	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4901	}
				4902
				4903	/**
				4904	* htmlHandleOmittedElem:
				4905	* @val: int 0 or 1
				4906	*
				4907	* Set and return the previous value for handling HTML omitted tags.
				4908	*
				4909	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4910	*/
				4911
				4912	int
				4913	htmlHandleOmittedElem(int val) {
				4914	int old = htmlOmittedDefaultValue;
				4915
				4916	htmlOmittedDefaultValue = val;
				4917	return(old);
				4918	}
				4919
				4920	#endif /* LIBXML_HTML_ENABLED */