Blame - HTMLparser.c - platform/external/libxml2

blob: a559d9b7dd983ec514d5d8b0c30ce68959e00e98 [file] [log] [blame]

Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* Daniel.Veillard@w3.org
				7	*/
				8
				9	#ifdef WIN32
				10	#define HAVE_FCNTL_H
				11	#include <io.h>
				12	#else
				13	#include <config.h>
				14	#endif
				15	#include <stdio.h>
				16	#include <ctype.h>
				17	#include <string.h> /* for memset() only */
				18	#include <stdlib.h>
				19	#include <sys/stat.h>
				20	#ifdef HAVE_FCNTL_H
				21	#include <fcntl.h>
				22	#endif
				23	#ifdef HAVE_UNISTD_H
				24	#include <unistd.h>
				25	#endif
				26	#ifdef HAVE_ZLIB_H
				27	#include <zlib.h>
				28	#endif
				29
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	30	#include "xmlmemory.h"
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	31	#include "tree.h"
				32	#include "HTMLparser.h"
				33	#include "entities.h"
				34	#include "encoding.h"
				35	#include "valid.h"
				36	#include "parserInternals.h"
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	37	#include "xmlIO.h"
				38
				39	#define HTML_MAX_NAMELEN 1000
				40	#define INPUT_CHUNK 50
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	41
Daniel Veillard	82150d8	1999-07-07 07:32:15 +0000	[diff] [blame]	42	/* #define DEBUG */
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	43
				44	/************************************************************************
				45	* *
				46	* Parser stacks related functions and macros *
				47	* *
				48	************************************************************************/
				49
				50	/*
				51	* Generic function for accessing stacks in the Parser Context
				52	*/
				53
				54	#define PUSH_AND_POP(type, name) \
				55	int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				56	if (ctxt->name##Nr >= ctxt->name##Max) { \
				57	ctxt->name##Max *= 2; \
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	58	ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	59	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				60	if (ctxt->name##Tab == NULL) { \
				61	fprintf(stderr, "realloc failed !\n"); \
				62	exit(1); \
				63	} \
				64	} \
				65	ctxt->name##Tab[ctxt->name##Nr] = value; \
				66	ctxt->name = value; \
				67	return(ctxt->name##Nr++); \
				68	} \
				69	type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				70	type ret; \
				71	if (ctxt->name##Nr <= 0) return(0); \
				72	ctxt->name##Nr--; \
				73	if (ctxt->name##Nr > 0) \
				74	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				75	else \
				76	ctxt->name = NULL; \
				77	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				78	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				79	return(ret); \
				80	} \
				81
				82	PUSH_AND_POP(xmlNodePtr, node)
				83
				84	/*
				85	* Macros for accessing the content. Those should be used only by the parser,
				86	* and not exported.
				87	*
				88	* Dirty macros, i.e. one need to make assumption on the context to use them
				89	*
				90	* CUR_PTR return the current pointer to the CHAR to be parsed.
				91	* CUR returns the current CHAR value, i.e. a 8 bit value if compiled
				92	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				93	* in UNICODE mode. This should be used internally by the parser
				94	* only to compare to ASCII values otherwise it would break when
				95	* running with UTF-8 encoding.
				96	* NXT(n) returns the n'th next CHAR. Same as CUR is should be used only
				97	* to compare on ASCII based substring.
				98	* UPP(n) returns the n'th next CHAR converted to uppercase. Same as CUR
				99	* it should be used only to compare on ASCII based substring.
				100	* SKIP(n) Skip n CHAR, and must also be used only to skip ASCII defined
				101	* strings within the parser.
				102	*
				103	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				104	*
				105	* CURRENT Returns the current char value, with the full decoding of
				106	* UTF-8 if we are using this mode. It returns an int.
				107	* NEXT Skip to the next character, this does the proper decoding
				108	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	109	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				110	*/
				111
				112	#define CUR (*ctxt->input->cur)
				113	#define UPPER (toupper(*ctxt->input->cur))
				114	#define SKIP(val) ctxt->input->cur += (val)
				115	#define NXT(val) ctxt->input->cur[(val)]
				116	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				117	#define CUR_PTR ctxt->input->cur
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	118	#define SHRINK xmlParserInputShrink(ctxt->input)
				119	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	120
				121	#define SKIP_BLANKS \
				122	while (IS_BLANK(*(ctxt->input->cur))) NEXT
				123
				124	#ifndef USE_UTF_8
				125	#define CURRENT (*ctxt->input->cur)
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	126	#define NEXT { \
				127	if ((*ctxt->input->cur == 0) && \
				128	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { \
				129	xmlPopInput(ctxt); \
				130	} else { \
				131	if (*(ctxt->input->cur) == '\n') { \
				132	ctxt->input->line++; ctxt->input->col = 1; \
				133	} else ctxt->input->col++; \
				134	ctxt->input->cur++; \
				135	if (*ctxt->input->cur == 0) \
				136	xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
				137	}}
				138
				139	/****************************************
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	140	#define NEXT ((*ctxt->input->cur) ? \
				141	(((*(ctxt->input->cur) == '\n') ? \
				142	(ctxt->input->line++, ctxt->input->col = 1) : \
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	143	(ctxt->input->col++)), \
				144	(ctxt->input->cur++), \
				145	((*ctxt->input->cur) ? \
				146	(xmlParserInputGrow(ctxt->input, 100), \
				147	ctxt->input->cur): \
				148	(ctxt->input->cur))) : \
				149	((xmlParserInputGrow(ctxt->input, 100) > 0) ? \
				150	ctxt->input->cur: \
				151	(xmlPopInput(ctxt), ctxt->input->cur)))
				152	****************************************/
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	153	#else
				154	#endif
				155
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	156
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	157
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	158	/************************************************************************
				159	* *
				160	* The list of HTML elements and their properties *
				161	* *
				162	************************************************************************/
				163
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	164	/*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	165	* Start Tag: 1 means the start tag can be ommited
				166	* End Tag: 1 means the end tag can be ommited
				167	* 2 means it's forbidden (empty elements)
				168	* Depr: this element is deprecated
				169	* DTD: 1 means that this element is valid only in the Loose DTD
				170	* 2 means that this element is valid only in the Frameset DTD
				171	*
				172	* Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	173	*/
				174	htmlElemDesc html40ElementTable[] = {
				175	{ "A", 0, 0, 0, 0, 0, "anchor " },
				176	{ "ABBR", 0, 0, 0, 0, 0, "abbreviated form" },
				177	{ "ACRONYM", 0, 0, 0, 0, 0, "" },
				178	{ "ADDRESS", 0, 0, 0, 0, 0, "information on author " },
				179	{ "APPLET", 0, 0, 0, 1, 1, "Java applet " },
				180	{ "AREA", 0, 2, 1, 0, 0, "client-side image map area " },
				181	{ "B", 0, 0, 0, 0, 0, "bold text style" },
				182	{ "BASE", 0, 2, 1, 0, 0, "document base URI " },
				183	{ "BASEFONT", 0, 2, 1, 1, 1, "base font size " },
				184	{ "BDO", 0, 0, 0, 0, 0, "I18N BiDi over-ride " },
				185	{ "BIG", 0, 0, 0, 0, 0, "large text style" },
				186	{ "BLOCKQUOTE", 0, 0, 0, 0, 0, "long quotation " },
				187	{ "BODY", 1, 1, 0, 0, 0, "document body " },
				188	{ "BR", 0, 2, 1, 0, 0, "forced line break " },
				189	{ "BUTTON", 0, 0, 0, 0, 0, "push button " },
				190	{ "CAPTION", 0, 0, 0, 0, 0, "table caption " },
				191	{ "CENTER", 0, 0, 0, 1, 1, "shorthand for DIV align=center " },
				192	{ "CITE", 0, 0, 0, 0, 0, "citation" },
				193	{ "CODE", 0, 0, 0, 0, 0, "computer code fragment" },
				194	{ "COL", 0, 2, 1, 0, 0, "table column " },
				195	{ "COLGROUP", 0, 1, 0, 0, 0, "table column group " },
				196	{ "DD", 0, 1, 0, 0, 0, "definition description " },
				197	{ "DEL", 0, 0, 0, 0, 0, "deleted text " },
				198	{ "DFN", 0, 0, 0, 0, 0, "instance definition" },
				199	{ "DIR", 0, 0, 0, 1, 1, "directory list" },
				200	{ "DIV", 0, 0, 0, 0, 0, "generic language/style container"},
				201	{ "DL", 0, 0, 0, 0, 0, "definition list " },
				202	{ "DT", 0, 1, 0, 0, 0, "definition term " },
				203	{ "EM", 0, 0, 0, 0, 0, "emphasis" },
				204	{ "FIELDSET", 0, 0, 0, 0, 0, "form control group " },
				205	{ "FONT", 0, 0, 0, 1, 1, "local change to font " },
				206	{ "FORM", 0, 0, 0, 0, 0, "interactive form " },
				207	{ "FRAME", 0, 2, 1, 0, 2, "subwindow " },
				208	{ "FRAMESET", 0, 0, 0, 0, 2, "window subdivision" },
				209	{ "H1", 0, 0, 0, 0, 0, "heading " },
				210	{ "H2", 0, 0, 0, 0, 0, "heading " },
				211	{ "H3", 0, 0, 0, 0, 0, "heading " },
				212	{ "H4", 0, 0, 0, 0, 0, "heading " },
				213	{ "H5", 0, 0, 0, 0, 0, "heading " },
				214	{ "H6", 0, 0, 0, 0, 0, "heading " },
				215	{ "HEAD", 1, 1, 0, 0, 0, "document head " },
				216	{ "HR", 0, 2, 1, 0, 0, "horizontal rule " },
				217	{ "HTML", 1, 1, 0, 0, 0, "document root element " },
				218	{ "I", 0, 0, 0, 0, 0, "italic text style" },
				219	{ "IFRAME", 0, 0, 0, 0, 1, "inline subwindow " },
				220	{ "IMG", 0, 2, 1, 0, 0, "Embedded image " },
				221	{ "INPUT", 0, 2, 1, 0, 0, "form control " },
				222	{ "INS", 0, 0, 0, 0, 0, "inserted text" },
				223	{ "ISINDEX", 0, 2, 1, 1, 1, "single line prompt " },
				224	{ "KBD", 0, 0, 0, 0, 0, "text to be entered by the user" },
				225	{ "LABEL", 0, 0, 0, 0, 0, "form field label text " },
				226	{ "LEGEND", 0, 0, 0, 0, 0, "fieldset legend " },
				227	{ "LI", 0, 1, 0, 0, 0, "list item " },
				228	{ "LINK", 0, 2, 1, 0, 0, "a media-independent link " },
				229	{ "MAP", 0, 0, 0, 0, 0, "client-side image map " },
				230	{ "MENU", 0, 0, 0, 1, 1, "menu list " },
				231	{ "META", 0, 2, 1, 0, 0, "generic metainformation " },
				232	{ "NOFRAMES", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
				233	{ "NOSCRIPT", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				234	{ "OBJECT", 0, 0, 0, 0, 0, "generic embedded object " },
				235	{ "OL", 0, 0, 0, 0, 0, "ordered list " },
				236	{ "OPTGROUP", 0, 0, 0, 0, 0, "option group " },
				237	{ "OPTION", 0, 1, 0, 0, 0, "selectable choice " },
				238	{ "P", 0, 1, 0, 0, 0, "paragraph " },
				239	{ "PARAM", 0, 2, 1, 0, 0, "named property value " },
				240	{ "PRE", 0, 0, 0, 0, 0, "preformatted text " },
				241	{ "Q", 0, 0, 0, 0, 0, "short inline quotation " },
				242	{ "S", 0, 0, 0, 1, 1, "strike-through text style" },
				243	{ "SAMP", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
				244	{ "SCRIPT", 0, 0, 0, 0, 0, "script statements " },
				245	{ "SELECT", 0, 0, 0, 0, 0, "option selector " },
				246	{ "SMALL", 0, 0, 0, 0, 0, "small text style" },
				247	{ "SPAN", 0, 0, 0, 0, 0, "generic language/style container " },
				248	{ "STRIKE", 0, 0, 0, 1, 1, "strike-through text" },
				249	{ "STRONG", 0, 0, 0, 0, 0, "strong emphasis" },
				250	{ "STYLE", 0, 0, 0, 0, 0, "style info " },
				251	{ "SUB", 0, 0, 0, 0, 0, "subscript" },
				252	{ "SUP", 0, 0, 0, 0, 0, "superscript " },
				253	{ "TABLE", 0, 0, 0, 0, 0, " " },
				254	{ "TBODY", 1, 1, 0, 0, 0, "table body " },
				255	{ "TD", 0, 1, 0, 0, 0, "table data cell" },
				256	{ "TEXTAREA", 0, 0, 0, 0, 0, "multi-line text field " },
				257	{ "TFOOT", 0, 1, 0, 0, 0, "table footer " },
				258	{ "TH", 0, 1, 0, 0, 0, "table header cell" },
				259	{ "THEAD", 0, 1, 0, 0, 0, "table header " },
				260	{ "TITLE", 0, 0, 0, 0, 0, "document title " },
				261	{ "TR", 0, 1, 0, 0, 0, "table row " },
				262	{ "TT", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
				263	{ "U", 0, 0, 0, 1, 1, "underlined text style" },
				264	{ "UL", 0, 0, 0, 0, 0, "unordered list " },
				265	{ "VAR", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
				266	};
				267
				268	/*
				269	* start tags that imply the end of a current element
				270	* any tag of each line implies the end of the current element if the type of
				271	* that element is in the same line
				272	*/
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	273	char *htmlEquEnd[] = {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	274	"DT", "DD", "LI", "OPTION", NULL,
				275	"H1", "H2", "H3", "H4", "H5", "H6", NULL,
				276	"OL", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", NULL,
				277	NULL
				278	};
				279	/*
				280	* acording the HTML DTD, HR should be added to the 2nd line above, as it
				281	* is not allowed within a H1, H2, H3, etc. But we should tolerate that case
				282	* because many documents contain rules in headings...
				283	*/
				284
				285	/*
				286	* start tags that imply the end of current element
				287	*/
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	288	char *htmlStartClose[] = {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	289	"FORM", "FORM", "P", "HR", "H1", "H2", "H3", "H4", "H5", "H6",
				290	"DL", "UL", "OL", "MENU", "DIR", "ADDRESS", "PRE",
				291	"LISTING", "XMP", "HEAD", NULL,
				292	"HEAD", "P", NULL,
				293	"TITLE", "P", NULL,
				294	"BODY", "HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
				295	"LI", "P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
				296	"PRE", "LISTING", "XMP", "HEAD", NULL,
				297	"HR", "P", "HEAD", NULL,
				298	"H1", "P", "HEAD", NULL,
				299	"H2", "P", "HEAD", NULL,
				300	"H3", "P", "HEAD", NULL,
				301	"H4", "P", "HEAD", NULL,
				302	"H5", "P", "HEAD", NULL,
				303	"H6", "P", "HEAD", NULL,
				304	"DIR", "P", "HEAD", NULL,
				305	"ADDRESS", "P", "HEAD", "UL", NULL,
				306	"PRE", "P", "HEAD", "UL", NULL,
				307	"LISTING", "P", "HEAD", NULL,
				308	"XMP", "P", "HEAD", NULL,
				309	"BLOCKQUOTE", "P", "HEAD", NULL,
				310	"DL", "P", "DT", "MENU", "DIR", "ADDRESS", "PRE", "LISTING",
				311	"XMP", "HEAD", NULL,
				312	"DT", "P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL,
				313	"DD", "P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL,
				314	"UL", "P", "HEAD", "OL", "MENU", "DIR", "ADDRESS", "PRE",
				315	"LISTING", "XMP", NULL,
				316	"OL", "P", "HEAD", "UL", NULL,
				317	"MENU", "P", "HEAD", "UL", NULL,
				318	"P", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", NULL,
				319	"DIV", "P", "HEAD", NULL,
				320	"NOSCRIPT", "P", "HEAD", NULL,
				321	"CENTER", "FONT", "B", "I", "P", "HEAD", NULL,
				322	"A", "A", NULL,
				323	"CAPTION", "P", NULL,
				324	"COLGROUP", "CAPTION", "COLGROUP", "COL", "P", NULL,
				325	"COL", "CAPTION", "COL", "P", NULL,
				326	"TABLE", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", "PRE",
				327	"LISTING", "XMP", "A", NULL,
				328	"TH", "TH", "TD", NULL,
				329	"TD", "TH", "TD", NULL,
				330	"TR", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", NULL,
				331	"THEAD", "CAPTION", "COL", "COLGROUP", NULL,
				332	"TFOOT", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
				333	"TBODY", NULL,
				334	"TBODY", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
				335	"TFOOT", "TBODY", NULL,
				336	"OPTGROUP", "OPTION", NULL,
				337	"FIELDSET", "LEGEND", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6",
				338	"PRE", "LISTING", "XMP", "A", NULL,
				339	NULL
				340	};
				341
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	342	static char** htmlStartCloseIndex[100];
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	343	static int htmlStartCloseIndexinitialized = 0;
				344
				345	/************************************************************************
				346	* *
				347	* functions to handle HTML specific data *
				348	* *
				349	************************************************************************/
				350
				351	/**
				352	* htmlInitAutoClose:
				353	*
				354	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				355	*
				356	*/
				357	void
				358	htmlInitAutoClose(void) {
				359	int index, i = 0;
				360
				361	if (htmlStartCloseIndexinitialized) return;
				362
				363	for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
				364	index = 0;
				365	while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
				366	htmlStartCloseIndex[index++] = &htmlStartClose[i];
				367	while (htmlStartClose[i] != NULL) i++;
				368	i++;
				369	}
				370	}
				371
				372	/**
				373	* htmlTagLookup:
				374	* @tag: The tag name
				375	*
				376	* Lookup the HTML tag in the ElementTable
				377	*
				378	* Returns the related htmlElemDescPtr or NULL if not found.
				379	*/
				380	htmlElemDescPtr
				381	htmlTagLookup(const CHAR *tag) {
				382	int i = 0;
				383
				384	for (i = 0; i < (sizeof(html40ElementTable) /
				385	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	386	if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	387	return(&html40ElementTable[i]);
				388	}
				389	return(NULL);
				390	}
				391
				392	/**
				393	* htmlCheckAutoClose:
				394	* @new: The new tag name
				395	* @old: The old tag name
				396	*
				397	* Checks wether the new tag is one of the registered valid tags for closing old.
				398	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				399	*
				400	* Returns 0 if no, 1 if yes.
				401	*/
				402	int
				403	htmlCheckAutoClose(const CHAR new, const CHAR old) {
				404	int i, index;
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	405	char **close;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	406
				407	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				408
				409	/* inefficient, but not a big deal */
				410	for (index = 0; index < 100;index++) {
				411	close = htmlStartCloseIndex[index];
				412	if (close == NULL) return(0);
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	413	if (!xmlStrcmp(BAD_CAST *close, new)) break;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	414	}
				415
				416	i = close - htmlStartClose;
				417	i++;
				418	while (htmlStartClose[i] != NULL) {
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	419	if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	420	return(1);
				421	}
				422	i++;
				423	}
				424	return(0);
				425	}
				426
				427	/**
				428	* htmlAutoClose:
				429	* @ctxt: an HTML parser context
				430	* @new: The new tag name
				431	*
				432	* The HTmL DtD allows a tag to implicitely close other tags.
				433	* The list is kept in htmlStartClose array. This function is
				434	* called when a new tag has been detected and generates the
				435	* appropriates closes if possible/needed.
				436	*/
				437	void
				438	htmlAutoClose(htmlParserCtxtPtr ctxt, const CHAR *new) {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	439
				440	while ((ctxt->node != NULL) &&
				441	(htmlCheckAutoClose(new, ctxt->node->name))) {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	442	#ifdef DEBUG
				443	printf("htmlAutoClose: %s closes %s\n", new, ctxt->node->name);
				444	#endif
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	445	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				446	ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
				447	}
				448	}
				449
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	450	/**
				451	* htmlAutoCloseOnClose:
				452	* @ctxt: an HTML parser context
				453	* @new: The new tag name
				454	*
				455	* The HTmL DtD allows an ending tag to implicitely close other tags.
				456	*/
				457	void
				458	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const CHAR *new) {
				459	htmlElemDescPtr info;
				460
				461	while ((ctxt->node != NULL) &&
				462	(xmlStrcmp(new, ctxt->node->name))) {
				463	info = htmlTagLookup(ctxt->node->name);
				464	if ((info == NULL) \|\| (info->endTag == 1)) {
				465	#ifdef DEBUG
				466	printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->node->name);
				467	#endif
				468	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				469	ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
				470	} else
				471	break;
				472	}
				473	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	474
				475	/************************************************************************
				476	* *
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	477	* The list of HTML predefined entities *
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	478	* *
				479	************************************************************************/
				480
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	481
				482	htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	483	/*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	484	* the 4 absolute ones,
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	485	*/
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	486	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				487	{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard	1566d3a	1999-07-15 14:24:29 +0000	[diff] [blame]	488	{ 39, "apos", "single quote" },
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	489	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				490	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	491
				492	/*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	493	* A bunch still in the 128-255 range
				494	* Replacing them depend really on the charset used.
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	495	*/
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	496	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				497	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				498	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				499	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				500	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				501	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				502	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				503	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				504	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				505	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				506	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				507	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				508	{ 172, "not", "not sign, U+00AC ISOnum" },
				509	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				510	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				511	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				512	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				513	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				514	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				515	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				516	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				517	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				518	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillard	b05deb7	1999-08-10 19:04:08 +0000	[diff] [blame]	519	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	520	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				521	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				522	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillard	b05deb7	1999-08-10 19:04:08 +0000	[diff] [blame]	523	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	524	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				525	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				526	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				527	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				528	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				529	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				530	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				531	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				532	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				533	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				534	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				535	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				536	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				537	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				538	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				539	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				540	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				541	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				542	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				543	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				544	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				545	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				546	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				547	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				548	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				549	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				550	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				551	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillard	b05deb7	1999-08-10 19:04:08 +0000	[diff] [blame]	552	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	553	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				554	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				555	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				556	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				557	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				558	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				559	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				560	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				561	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				562	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				563	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				564	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				565	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				566	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				567	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				568	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				569	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				570	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				571	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				572	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				573	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				574	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				575	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				576	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				577	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				578	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				579	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				580	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				581	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				582	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				583	{ 247, "divide","division sign, U+00F7 ISOnum" },
				584	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				585	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				586	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				587	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				588	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				589	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				590	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				591	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	592
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	593	/*
				594	* Anything below should really be kept as entities references
				595	*/
				596	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	597
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	598	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				599	{ 914, "Beta", "greek capital letter beta, U+0392" },
				600	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				601	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				602	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				603	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				604	{ 919, "Eta", "greek capital letter eta, U+0397" },
				605	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				606	{ 921, "Iota", "greek capital letter iota, U+0399" },
				607	{ 922, "Kappa","greek capital letter kappa, U+039A" },
				608	{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
				609	{ 924, "Mu", "greek capital letter mu, U+039C" },
				610	{ 925, "Nu", "greek capital letter nu, U+039D" },
				611	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				612	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				613	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				614	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				615	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				616	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				617	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				618	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				619	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				620	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				621	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	622
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	623	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				624	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				625	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				626	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				627	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				628	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				629	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				630	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				631	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				632	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				633	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				634	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				635	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				636	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				637	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				638	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				639	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				640	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				641	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				642	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				643	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				644	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				645	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				646	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				647	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				648	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				649	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				650	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	651
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	652	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				653	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				654	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				655	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				656	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				657	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				658
Daniel Veillard	b05deb7	1999-08-10 19:04:08 +0000	[diff] [blame]	659	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	660	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				661	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				662	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				663	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				664	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				665	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				666	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				667	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				668	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				669	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				670	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				671	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				672	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				673	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				674	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				675
				676
				677	{ 8704, "forall","for all, U+2200 ISOtech" },
				678	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				679	{ 8707, "exist","there exists, U+2203 ISOtech" },
				680	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				681	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				682	{ 8712, "isin", "element of, U+2208 ISOtech" },
				683	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				684	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				685	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				686	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				687	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				688	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				689	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				690	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				691	{ 8734, "infin","infinity, U+221E ISOtech" },
				692	{ 8736, "ang", "angle, U+2220 ISOamso" },
				693	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				694	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				695	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				696	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				697	{ 8747, "int", "integral, U+222B ISOtech" },
				698	{ 8756, "there4","therefore, U+2234 ISOtech" },
				699	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				700	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				701	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				702	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				703	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				704	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				705	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				706	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				707	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				708	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				709	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				710	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				711	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				712	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				713	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				714	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				715	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				716	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				717	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				718	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				719	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				720	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				721	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				722
				723	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				724	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				725	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				726	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				727
				728	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				729	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				730	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				731	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				732	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				733	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				734	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				735
				736	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				737	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				738	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				739	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				740	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				741	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				742	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				743	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				744	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				745	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				746	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				747	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				748	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				749	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				750	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				751	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				752	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				753	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				754	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillard	b05deb7	1999-08-10 19:04:08 +0000	[diff] [blame]	755	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	756	{ 8364, "euro", "euro sign, U+20AC NEW" }
				757	};
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	758
				759	/************************************************************************
				760	* *
				761	* Commodity functions to handle entities *
				762	* *
				763	************************************************************************/
				764
				765	/*
				766	* Macro used to grow the current buffer.
				767	*/
				768	#define growBuffer(buffer) { \
				769	buffer##_size *= 2; \
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	770	buffer = (CHAR ) xmlRealloc(buffer, buffer##_size sizeof(CHAR)); \
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	771	if (buffer == NULL) { \
				772	perror("realloc failed"); \
				773	exit(1); \
				774	} \
				775	}
				776
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	777	/**
				778	* htmlEntityLookup:
				779	* @name: the entity name
				780	*
				781	* Lookup the given entity in EntitiesTable
				782	*
				783	* TODO: the linear scan is really ugly, an hash table is really needed.
				784	*
				785	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				786	*/
				787	htmlEntityDescPtr
				788	htmlEntityLookup(const CHAR *name) {
				789	int i;
				790
				791	for (i = 0;i < (sizeof(html40EntitiesTable)/
				792	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	793	if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	794	#ifdef DEBUG
				795	printf("Found entity %s\n", name);
				796	#endif
				797	return(&html40EntitiesTable[i]);
				798	}
				799	}
				800	return(NULL);
				801	}
				802
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	803
				804	/**
				805	* htmlDecodeEntities:
				806	* @ctxt: the parser context
				807	* @len: the len to decode (in bytes !), -1 for no size limit
				808	* @end: an end marker CHAR, 0 if none
				809	* @end2: an end marker CHAR, 0 if none
				810	* @end3: an end marker CHAR, 0 if none
				811	*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	812	* Subtitute the HTML entities by their value
				813	*
				814	* TODO: once the internal representation will be UTF-8, all entities
				815	* will be substituable, in the meantime we only apply the substitution
				816	* to the one with values in the 0-255 UNICODE range
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	817	*
				818	* Returns A newly allocated string with the substitution done. The caller
				819	* must deallocate it !
				820	*/
				821	CHAR *
				822	htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				823	CHAR end, CHAR end2, CHAR end3) {
				824	CHAR *buffer = NULL;
				825	int buffer_size = 0;
				826	CHAR *out = NULL;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	827	CHAR *name = NULL;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	828
				829	CHAR *cur = NULL;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	830	htmlEntityDescPtr ent;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	831	int nbchars = 0;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	832	unsigned int max = (unsigned int) len;
				833
				834	/*
				835	* allocate a translation buffer.
				836	*/
				837	buffer_size = 1000;
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	838	buffer = (CHAR ) xmlMalloc(buffer_size sizeof(CHAR));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	839	if (buffer == NULL) {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	840	perror("htmlDecodeEntities: malloc failed");
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	841	return(NULL);
				842	}
				843	out = buffer;
				844
				845	/*
				846	* Ok loop until we reach one of the ending char or a size limit.
				847	*/
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	848	while ((nbchars < max) && (CUR != end) &&
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	849	(CUR != end2) && (CUR != end3)) {
				850
				851	if (CUR == '&') {
				852	if (NXT(1) == '#') {
				853	int val = htmlParseCharRef(ctxt);
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	854	/* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	855	*out++ = val;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	856	nbchars += 3; /* !!!! */
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	857	} else {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	858	ent = htmlParseEntityRef(ctxt, &name);
				859	if (name != NULL) {
				860	if ((ent == NULL) \|\| (ent->value <= 0) \|\|
				861	(ent->value >= 255)) {
				862	*out++ = '&';
				863	cur = name;
				864	while (*cur != 0) {
				865	if (out - buffer > buffer_size - 100) {
				866	int index = out - buffer;
				867
				868	growBuffer(buffer);
				869	out = &buffer[index];
				870	}
				871	out++ = cur++;
				872	}
				873	*out++ = ';';
				874	} else {
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	875	/* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	876	*out++ = (CHAR)ent->value;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	877	if (out - buffer > buffer_size - 100) {
				878	int index = out - buffer;
				879
				880	growBuffer(buffer);
				881	out = &buffer[index];
				882	}
				883	}
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	884	nbchars += 2 + xmlStrlen(name);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	885	xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	886	}
				887	}
				888	} else {
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	889	/* invalid for UTF-8 , use COPY(out); !!!!! */
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	890	*out++ = CUR;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	891	nbchars++;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	892	if (out - buffer > buffer_size - 100) {
				893	int index = out - buffer;
				894
				895	growBuffer(buffer);
				896	out = &buffer[index];
				897	}
				898	NEXT;
				899	}
				900	}
				901	*out++ = 0;
				902	return(buffer);
				903	}
				904
				905
				906	/************************************************************************
				907	* *
				908	* Commodity functions to handle encodings *
				909	* *
				910	************************************************************************/
				911
				912	/**
				913	* htmlSwitchEncoding:
				914	* @ctxt: the parser context
				915	* @len: the len of @cur
				916	*
				917	* change the input functions when discovering the character encoding
				918	* of a given entity.
				919	*
				920	*/
				921	void
				922	htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
				923	{
				924	switch (enc) {
				925	case XML_CHAR_ENCODING_ERROR:
				926	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				927	ctxt->sax->error(ctxt->userData, "encoding unknown\n");
				928	ctxt->wellFormed = 0;
				929	break;
				930	case XML_CHAR_ENCODING_NONE:
				931	/* let's assume it's UTF-8 without the XML decl */
				932	return;
				933	case XML_CHAR_ENCODING_UTF8:
				934	/* default encoding, no conversion should be needed */
				935	return;
				936	case XML_CHAR_ENCODING_UTF16LE:
				937	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				938	ctxt->sax->error(ctxt->userData,
				939	"char encoding UTF16 little endian not supported\n");
				940	break;
				941	case XML_CHAR_ENCODING_UTF16BE:
				942	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				943	ctxt->sax->error(ctxt->userData,
				944	"char encoding UTF16 big endian not supported\n");
				945	break;
				946	case XML_CHAR_ENCODING_UCS4LE:
				947	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				948	ctxt->sax->error(ctxt->userData,
				949	"char encoding USC4 little endian not supported\n");
				950	break;
				951	case XML_CHAR_ENCODING_UCS4BE:
				952	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				953	ctxt->sax->error(ctxt->userData,
				954	"char encoding USC4 big endian not supported\n");
				955	break;
				956	case XML_CHAR_ENCODING_EBCDIC:
				957	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				958	ctxt->sax->error(ctxt->userData,
				959	"char encoding EBCDIC not supported\n");
				960	break;
				961	case XML_CHAR_ENCODING_UCS4_2143:
				962	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				963	ctxt->sax->error(ctxt->userData,
				964	"char encoding UCS4 2143 not supported\n");
				965	break;
				966	case XML_CHAR_ENCODING_UCS4_3412:
				967	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				968	ctxt->sax->error(ctxt->userData,
				969	"char encoding UCS4 3412 not supported\n");
				970	break;
				971	case XML_CHAR_ENCODING_UCS2:
				972	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				973	ctxt->sax->error(ctxt->userData,
				974	"char encoding UCS2 not supported\n");
				975	break;
				976	case XML_CHAR_ENCODING_8859_1:
				977	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				978	ctxt->sax->error(ctxt->userData,
				979	"char encoding ISO_8859_1 ISO Latin 1 not supported\n");
				980	break;
				981	case XML_CHAR_ENCODING_8859_2:
				982	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				983	ctxt->sax->error(ctxt->userData,
				984	"char encoding ISO_8859_2 ISO Latin 2 not supported\n");
				985	break;
				986	case XML_CHAR_ENCODING_8859_3:
				987	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				988	ctxt->sax->error(ctxt->userData,
				989	"char encoding ISO_8859_3 not supported\n");
				990	break;
				991	case XML_CHAR_ENCODING_8859_4:
				992	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				993	ctxt->sax->error(ctxt->userData,
				994	"char encoding ISO_8859_4 not supported\n");
				995	break;
				996	case XML_CHAR_ENCODING_8859_5:
				997	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				998	ctxt->sax->error(ctxt->userData,
				999	"char encoding ISO_8859_5 not supported\n");
				1000	break;
				1001	case XML_CHAR_ENCODING_8859_6:
				1002	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1003	ctxt->sax->error(ctxt->userData,
				1004	"char encoding ISO_8859_6 not supported\n");
				1005	break;
				1006	case XML_CHAR_ENCODING_8859_7:
				1007	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1008	ctxt->sax->error(ctxt->userData,
				1009	"char encoding ISO_8859_7 not supported\n");
				1010	break;
				1011	case XML_CHAR_ENCODING_8859_8:
				1012	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1013	ctxt->sax->error(ctxt->userData,
				1014	"char encoding ISO_8859_8 not supported\n");
				1015	break;
				1016	case XML_CHAR_ENCODING_8859_9:
				1017	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1018	ctxt->sax->error(ctxt->userData,
				1019	"char encoding ISO_8859_9 not supported\n");
				1020	break;
				1021	case XML_CHAR_ENCODING_2022_JP:
				1022	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1023	ctxt->sax->error(ctxt->userData,
				1024	"char encoding ISO-2022-JPnot supported\n");
				1025	break;
				1026	case XML_CHAR_ENCODING_SHIFT_JIS:
				1027	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1028	ctxt->sax->error(ctxt->userData,
				1029	"char encoding Shift_JISnot supported\n");
				1030	break;
				1031	case XML_CHAR_ENCODING_EUC_JP:
				1032	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1033	ctxt->sax->error(ctxt->userData,
				1034	"char encoding EUC-JPnot supported\n");
				1035	break;
				1036	}
				1037	}
				1038
				1039
				1040	/************************************************************************
				1041	* *
				1042	* Commodity functions, cleanup needed ? *
				1043	* *
				1044	************************************************************************/
				1045
				1046	/**
				1047	* areBlanks:
				1048	* @ctxt: an HTML parser context
				1049	* @str: a CHAR *
				1050	* @len: the size of @str
				1051	*
				1052	* Is this a sequence of blank chars that one can ignore ?
				1053	*
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1054	* Returns 1 if ignorable 0 otherwise.
				1055	*/
				1056
				1057	static int areBlanks(htmlParserCtxtPtr ctxt, const CHAR *str, int len) {
				1058	int i;
				1059	xmlNodePtr lastChild;
				1060
				1061	for (i = 0;i < len;i++)
				1062	if (!(IS_BLANK(str[i]))) return(0);
				1063
				1064	if (CUR != '<') return(0);
				1065	if (ctxt->node == NULL) return(0);
				1066	lastChild = xmlGetLastChild(ctxt->node);
				1067	if (lastChild == NULL) {
				1068	if (ctxt->node->content != NULL) return(0);
				1069	} else if (xmlNodeIsText(lastChild))
				1070	return(0);
				1071	return(1);
				1072	}
				1073
				1074	/**
				1075	* htmlHandleEntity:
				1076	* @ctxt: an HTML parser context
				1077	* @entity: an XML entity pointer.
				1078	*
				1079	* Default handling of an HTML entity, call the parser with the
				1080	* substitution string
				1081	*/
				1082
				1083	void
				1084	htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
				1085	int len;
				1086
				1087	if (entity->content == NULL) {
				1088	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1089	ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
				1090	entity->name);
				1091	ctxt->wellFormed = 0;
				1092	return;
				1093	}
				1094	len = xmlStrlen(entity->content);
				1095
				1096	/*
				1097	* Just handle the content as a set of chars.
				1098	*/
				1099	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				1100	ctxt->sax->characters(ctxt->userData, entity->content, len);
				1101
				1102	}
				1103
				1104	/**
				1105	* htmlNewDoc:
				1106	* @URI: URI for the dtd, or NULL
				1107	* @ExternalID: the external ID of the DTD, or NULL
				1108	*
				1109	* Returns a new document
				1110	*/
				1111	htmlDocPtr
				1112	htmlNewDoc(const CHAR URI, const CHAR ExternalID) {
				1113	xmlDocPtr cur;
				1114
				1115	/*
				1116	* Allocate a new document and fill the fields.
				1117	*/
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1118	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1119	if (cur == NULL) {
				1120	fprintf(stderr, "xmlNewDoc : malloc failed\n");
				1121	return(NULL);
				1122	}
Daniel Veillard	e7a5a77	1999-08-30 13:05:42 +0000	[diff] [blame]	1123	memset(cur, 0, sizeof(xmlDoc));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1124
				1125	cur->type = XML_DOCUMENT_NODE;
				1126	cur->version = NULL;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1127	cur->intSubset = NULL;
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	1128	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1129	cur->name = NULL;
				1130	cur->root = NULL;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1131	cur->extSubset = NULL;
				1132	cur->oldNs = NULL;
				1133	cur->encoding = NULL;
				1134	cur->standalone = 1;
				1135	cur->compression = 0;
Daniel Veillard	c08a2c6	1999-09-08 21:35:25 +0000	[diff] [blame]	1136	cur->ids = NULL;
				1137	cur->refs = NULL;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1138	#ifndef XML_WITHOUT_CORBA
				1139	cur->_private = NULL;
				1140	cur->vepv = NULL;
				1141	#endif
				1142	return(cur);
				1143	}
				1144
				1145
				1146	/************************************************************************
				1147	* *
				1148	* The parser itself *
				1149	* Relates to http://www.w3.org/TR/html40 *
				1150	* *
				1151	************************************************************************/
				1152
				1153	/************************************************************************
				1154	* *
				1155	* The parser itself *
				1156	* *
				1157	************************************************************************/
				1158
				1159	/**
				1160	* htmlParseHTMLName:
				1161	* @ctxt: an HTML parser context
				1162	*
				1163	* parse an HTML tag or attribute name, note that we convert it to uppercase
				1164	* since HTML names are not case-sensitive.
				1165	*
				1166	* Returns the Tag Name parsed or NULL
				1167	*/
				1168
				1169	CHAR *
				1170	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1171	CHAR *ret = NULL;
				1172	int i = 0;
				1173	CHAR loc[100];
				1174
				1175	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1176	(CUR != ':')) return(NULL);
				1177
				1178	while ((i < 100) && ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)))) {
				1179	if ((CUR >= 0x61) && (CUR <= 0x7a)) loc[i] = CUR - 0x20;
				1180	else loc[i] = CUR;
				1181	i++;
				1182
				1183	NEXT;
				1184	}
				1185
				1186	ret = xmlStrndup(loc, i);
				1187
				1188	return(ret);
				1189	}
				1190
				1191	/**
				1192	* htmlParseName:
				1193	* @ctxt: an HTML parser context
				1194	*
				1195	* parse an HTML name, this routine is case sensistive.
				1196	*
				1197	* Returns the Name parsed or NULL
				1198	*/
				1199
				1200	CHAR *
				1201	htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1202	CHAR buf[HTML_MAX_NAMELEN];
				1203	int len = 0;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1204
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1205	GROW;
				1206	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1207	return(NULL);
				1208	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1209
				1210	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1211	(CUR == '.') \|\| (CUR == '-') \|\|
				1212	(CUR == '_') \|\| (CUR == ':') \|\|
				1213	(IS_COMBINING(CUR)) \|\|
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1214	(IS_EXTENDER(CUR))) {
				1215	buf[len++] = CUR;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1216	NEXT;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1217	if (len >= HTML_MAX_NAMELEN) {
				1218	fprintf(stderr,
				1219	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1220	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1221	(CUR == '.') \|\| (CUR == '-') \|\|
				1222	(CUR == '_') \|\| (CUR == ':') \|\|
				1223	(IS_COMBINING(CUR)) \|\|
				1224	(IS_EXTENDER(CUR)))
				1225	NEXT;
				1226	break;
				1227	}
				1228	}
				1229	return(xmlStrndup(buf, len));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1230	}
				1231
				1232	/**
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1233	* htmlParseHTMLAttribute:
				1234	* @ctxt: an HTML parser context
				1235	*
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1236	* parse an HTML attribute value (without quotes).
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1237	*
				1238	* Returns the Nmtoken parsed or NULL
				1239	*/
				1240
				1241	CHAR *
				1242	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt) {
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1243	CHAR buf[HTML_MAX_NAMELEN];
				1244	int len = 0;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1245
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1246	GROW;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1247	while ((!IS_BLANK(CUR)) && (CUR != '<') &&
				1248	(CUR != '&') && (CUR != '>') &&
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1249	(CUR != '\'') && (CUR != '"')) {
				1250	buf[len++] = CUR;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1251	NEXT;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1252	if (len >= HTML_MAX_NAMELEN) {
				1253	fprintf(stderr,
				1254	"htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
				1255	while ((!IS_BLANK(CUR)) && (CUR != '<') &&
				1256	(CUR != '&') && (CUR != '>') &&
				1257	(CUR != '\'') && (CUR != '"'))
				1258	NEXT;
				1259	break;
				1260	}
				1261	}
				1262	return(xmlStrndup(buf, len));
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1263	}
				1264
				1265	/**
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1266	* htmlParseNmtoken:
				1267	* @ctxt: an HTML parser context
				1268	*
				1269	* parse an HTML Nmtoken.
				1270	*
				1271	* Returns the Nmtoken parsed or NULL
				1272	*/
				1273
				1274	CHAR *
				1275	htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1276	CHAR buf[HTML_MAX_NAMELEN];
				1277	int len = 0;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1278
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1279	GROW;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1280	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1281	(CUR == '.') \|\| (CUR == '-') \|\|
				1282	(CUR == '_') \|\| (CUR == ':') \|\|
				1283	(IS_COMBINING(CUR)) \|\|
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1284	(IS_EXTENDER(CUR))) {
				1285	buf[len++] = CUR;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1286	NEXT;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1287	if (len >= HTML_MAX_NAMELEN) {
				1288	fprintf(stderr,
				1289	"htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
				1290	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1291	(CUR == '.') \|\| (CUR == '-') \|\|
				1292	(CUR == '_') \|\| (CUR == ':') \|\|
				1293	(IS_COMBINING(CUR)) \|\|
				1294	(IS_EXTENDER(CUR)))
				1295	NEXT;
				1296	break;
				1297	}
				1298	}
				1299	return(xmlStrndup(buf, len));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1300	}
				1301
				1302	/**
				1303	* htmlParseEntityRef:
				1304	* @ctxt: an HTML parser context
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1305	* @str: location to store the entity name
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1306	*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1307	* parse an HTML ENTITY references
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1308	*
				1309	* [68] EntityRef ::= '&' Name ';'
				1310	*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1311	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				1312	* if non-NULL *str will have to be freed by the caller.
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1313	*/
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1314	htmlEntityDescPtr
				1315	htmlParseEntityRef(htmlParserCtxtPtr ctxt, CHAR **str) {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1316	CHAR *name;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1317	htmlEntityDescPtr ent = NULL;
				1318	*str = NULL;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1319
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1320	if (CUR == '&') {
				1321	NEXT;
				1322	name = htmlParseName(ctxt);
				1323	if (name == NULL) {
				1324	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1325	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				1326	ctxt->wellFormed = 0;
				1327	} else {
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1328	GROW;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1329	if (CUR == ';') {
				1330	NEXT;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1331	*str = name;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1332
				1333	/*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1334	* Lookup the entity in the table.
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1335	*/
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1336	ent = htmlEntityLookup(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1337	} else {
				1338	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1339	ctxt->sax->error(ctxt->userData,
				1340	"htmlParseEntityRef: expecting ';'\n");
				1341	ctxt->wellFormed = 0;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1342	if (ctxt->sax->characters != NULL) {
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	1343	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1344	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				1345	}
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1346	xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1347	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1348	}
				1349	}
				1350	return(ent);
				1351	}
				1352
				1353	/**
				1354	* htmlParseAttValue:
				1355	* @ctxt: an HTML parser context
				1356	*
				1357	* parse a value for an attribute
				1358	* Note: the parser won't do substitution of entities here, this
				1359	* will be handled later in xmlStringGetNodeList, unless it was
				1360	* asked for ctxt->replaceEntities != 0
				1361	*
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1362	* Returns the AttValue parsed or NULL.
				1363	*/
				1364
				1365	CHAR *
				1366	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				1367	CHAR *ret = NULL;
				1368
				1369	if (CUR == '"') {
				1370	NEXT;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1371	ret = htmlDecodeEntities(ctxt, -1, '"', '<', 0);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1372	if (CUR == '<') {
				1373	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1374	ctxt->sax->error(ctxt->userData,
				1375	"Unescaped '<' not allowed in attributes values\n");
				1376	ctxt->wellFormed = 0;
				1377	}
				1378	if (CUR != '"') {
				1379	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1380	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				1381	ctxt->wellFormed = 0;
				1382	} else
				1383	NEXT;
				1384	} else if (CUR == '\'') {
				1385	NEXT;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1386	ret = htmlDecodeEntities(ctxt, -1, '\'', '<', 0);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1387	if (CUR == '<') {
				1388	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1389	ctxt->sax->error(ctxt->userData,
				1390	"Unescaped '<' not allowed in attributes values\n");
				1391	ctxt->wellFormed = 0;
				1392	}
				1393	if (CUR != '\'') {
				1394	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1395	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				1396	ctxt->wellFormed = 0;
				1397	} else
				1398	NEXT;
				1399	} else {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1400	/*
				1401	* That's an HTMLism, the attribute value may not be quoted
				1402	*/
				1403	ret = htmlParseHTMLAttribute(ctxt);
				1404	if (ret == NULL) {
				1405	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1406	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				1407	ctxt->wellFormed = 0;
				1408	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1409	}
				1410
				1411	return(ret);
				1412	}
				1413
				1414	/**
				1415	* htmlParseSystemLiteral:
				1416	* @ctxt: an HTML parser context
				1417	*
				1418	* parse an HTML Literal
				1419	*
				1420	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				1421	*
				1422	* Returns the SystemLiteral parsed or NULL
				1423	*/
				1424
				1425	CHAR *
				1426	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				1427	const CHAR *q;
				1428	CHAR *ret = NULL;
				1429
				1430	if (CUR == '"') {
				1431	NEXT;
				1432	q = CUR_PTR;
				1433	while ((IS_CHAR(CUR)) && (CUR != '"'))
				1434	NEXT;
				1435	if (!IS_CHAR(CUR)) {
				1436	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1437	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				1438	ctxt->wellFormed = 0;
				1439	} else {
				1440	ret = xmlStrndup(q, CUR_PTR - q);
				1441	NEXT;
				1442	}
				1443	} else if (CUR == '\'') {
				1444	NEXT;
				1445	q = CUR_PTR;
				1446	while ((IS_CHAR(CUR)) && (CUR != '\''))
				1447	NEXT;
				1448	if (!IS_CHAR(CUR)) {
				1449	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1450	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				1451	ctxt->wellFormed = 0;
				1452	} else {
				1453	ret = xmlStrndup(q, CUR_PTR - q);
				1454	NEXT;
				1455	}
				1456	} else {
				1457	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1458	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				1459	ctxt->wellFormed = 0;
				1460	}
				1461
				1462	return(ret);
				1463	}
				1464
				1465	/**
				1466	* htmlParsePubidLiteral:
				1467	* @ctxt: an HTML parser context
				1468	*
				1469	* parse an HTML public literal
				1470	*
				1471	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				1472	*
				1473	* Returns the PubidLiteral parsed or NULL.
				1474	*/
				1475
				1476	CHAR *
				1477	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				1478	const CHAR *q;
				1479	CHAR *ret = NULL;
				1480	/*
				1481	* Name ::= (Letter \| '_') (NameChar)*
				1482	*/
				1483	if (CUR == '"') {
				1484	NEXT;
				1485	q = CUR_PTR;
				1486	while (IS_PUBIDCHAR(CUR)) NEXT;
				1487	if (CUR != '"') {
				1488	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1489	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				1490	ctxt->wellFormed = 0;
				1491	} else {
				1492	ret = xmlStrndup(q, CUR_PTR - q);
				1493	NEXT;
				1494	}
				1495	} else if (CUR == '\'') {
				1496	NEXT;
				1497	q = CUR_PTR;
				1498	while ((IS_LETTER(CUR)) && (CUR != '\''))
				1499	NEXT;
				1500	if (!IS_LETTER(CUR)) {
				1501	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1502	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				1503	ctxt->wellFormed = 0;
				1504	} else {
				1505	ret = xmlStrndup(q, CUR_PTR - q);
				1506	NEXT;
				1507	}
				1508	} else {
				1509	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1510	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				1511	ctxt->wellFormed = 0;
				1512	}
				1513
				1514	return(ret);
				1515	}
				1516
				1517	/**
				1518	* htmlParseCharData:
				1519	* @ctxt: an HTML parser context
				1520	* @cdata: int indicating whether we are within a CDATA section
				1521	*
				1522	* parse a CharData section.
				1523	* if we are within a CDATA section ']]>' marks an end of section.
				1524	*
				1525	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				1526	*/
				1527
				1528	void
				1529	htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
				1530	const CHAR *q;
				1531
				1532	q = CUR_PTR;
				1533	while ((IS_CHAR(CUR)) && (CUR != '<') &&
				1534	(CUR != '&')) {
				1535	if ((CUR == ']') && (NXT(1) == ']') &&
				1536	(NXT(2) == '>')) {
				1537	if (cdata) break;
				1538	else {
				1539	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1540	ctxt->sax->error(ctxt->userData,
				1541	"Sequence ']]>' not allowed in content\n");
				1542	ctxt->wellFormed = 0;
				1543	}
				1544	}
				1545	NEXT;
				1546	}
				1547	if (q == CUR_PTR) return;
				1548
				1549	/*
				1550	* Ok the segment [q CUR_PTR] is to be consumed as chars.
				1551	*/
				1552	if (ctxt->sax != NULL) {
				1553	if (areBlanks(ctxt, q, CUR_PTR - q)) {
				1554	if (ctxt->sax->ignorableWhitespace != NULL)
				1555	ctxt->sax->ignorableWhitespace(ctxt->userData, q, CUR_PTR - q);
				1556	} else {
				1557	if (ctxt->sax->characters != NULL)
				1558	ctxt->sax->characters(ctxt->userData, q, CUR_PTR - q);
				1559	}
				1560	}
				1561	}
				1562
				1563	/**
				1564	* htmlParseExternalID:
				1565	* @ctxt: an HTML parser context
				1566	* @publicID: a CHAR** receiving PubidLiteral
				1567	* @strict: indicate whether we should restrict parsing to only
				1568	* production [75], see NOTE below
				1569	*
				1570	* Parse an External ID or a Public ID
				1571	*
				1572	* NOTE: Productions [75] and [83] interract badly since [75] can generate
				1573	* 'PUBLIC' S PubidLiteral S SystemLiteral
				1574	*
				1575	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				1576	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				1577	*
				1578	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				1579	*
				1580	* Returns the function returns SystemLiteral and in the second
				1581	* case publicID receives PubidLiteral, is strict is off
				1582	* it is possible to return NULL and have publicID set.
				1583	*/
				1584
				1585	CHAR *
				1586	htmlParseExternalID(htmlParserCtxtPtr ctxt, CHAR **publicID, int strict) {
				1587	CHAR *URI = NULL;
				1588
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1589	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				1590	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				1591	(UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1592	SKIP(6);
				1593	if (!IS_BLANK(CUR)) {
				1594	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1595	ctxt->sax->error(ctxt->userData,
				1596	"Space required after 'SYSTEM'\n");
				1597	ctxt->wellFormed = 0;
				1598	}
				1599	SKIP_BLANKS;
				1600	URI = htmlParseSystemLiteral(ctxt);
				1601	if (URI == NULL) {
				1602	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1603	ctxt->sax->error(ctxt->userData,
				1604	"htmlParseExternalID: SYSTEM, no URI\n");
				1605	ctxt->wellFormed = 0;
				1606	}
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1607	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				1608	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				1609	(UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1610	SKIP(6);
				1611	if (!IS_BLANK(CUR)) {
				1612	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1613	ctxt->sax->error(ctxt->userData,
				1614	"Space required after 'PUBLIC'\n");
				1615	ctxt->wellFormed = 0;
				1616	}
				1617	SKIP_BLANKS;
				1618	*publicID = htmlParsePubidLiteral(ctxt);
				1619	if (*publicID == NULL) {
				1620	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1621	ctxt->sax->error(ctxt->userData,
				1622	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				1623	ctxt->wellFormed = 0;
				1624	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1625	SKIP_BLANKS;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1626	if ((CUR == '"') \|\| (CUR == '\'')) {
				1627	URI = htmlParseSystemLiteral(ctxt);
				1628	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1629	}
				1630	return(URI);
				1631	}
				1632
				1633	/**
				1634	* htmlParseComment:
				1635	* @ctxt: an HTML parser context
				1636	* @create: should we create a node, or just skip the content
				1637	*
				1638	* Parse an XML (SGML) comment <!-- .... -->
				1639	*
				1640	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				1641	*/
				1642	void
				1643	htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
				1644	const CHAR q, start;
				1645	const CHAR *r;
				1646	CHAR *val;
				1647
				1648	/*
				1649	* Check that there is a comment right here.
				1650	*/
				1651	if ((CUR != '<') \|\| (NXT(1) != '!') \|\|
				1652	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				1653
				1654	SKIP(4);
				1655	start = q = CUR_PTR;
				1656	NEXT;
				1657	r = CUR_PTR;
				1658	NEXT;
				1659	while (IS_CHAR(CUR) &&
				1660	((CUR == ':') \|\| (CUR != '>') \|\|
				1661	(r != '-') \|\| (q != '-'))) {
				1662	if ((r == '-') && (q == '-')) {
				1663	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1664	ctxt->sax->error(ctxt->userData,
				1665	"Comment must not contain '--' (double-hyphen)`\n");
				1666	ctxt->wellFormed = 0;
				1667	}
				1668	NEXT;r++;q++;
				1669	}
				1670	if (!IS_CHAR(CUR)) {
				1671	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1672	ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", start);
				1673	ctxt->wellFormed = 0;
				1674	} else {
				1675	NEXT;
				1676	if (create) {
				1677	val = xmlStrndup(start, q - start);
				1678	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL))
				1679	ctxt->sax->comment(ctxt->userData, val);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1680	xmlFree(val);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1681	}
				1682	}
				1683	}
				1684
				1685	/**
				1686	* htmlParseCharRef:
				1687	* @ctxt: an HTML parser context
				1688	*
				1689	* parse Reference declarations
				1690	*
				1691	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				1692	* '&#x' [0-9a-fA-F]+ ';'
				1693	*
				1694	* Returns the value parsed (as an int)
				1695	*/
				1696	int
				1697	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				1698	int val = 0;
				1699
				1700	if ((CUR == '&') && (NXT(1) == '#') &&
				1701	(NXT(2) == 'x')) {
				1702	SKIP(3);
				1703	while (CUR != ';') {
				1704	if ((CUR >= '0') && (CUR <= '9'))
				1705	val = val * 16 + (CUR - '0');
				1706	else if ((CUR >= 'a') && (CUR <= 'f'))
				1707	val = val * 16 + (CUR - 'a') + 10;
				1708	else if ((CUR >= 'A') && (CUR <= 'F'))
				1709	val = val * 16 + (CUR - 'A') + 10;
				1710	else {
				1711	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1712	ctxt->sax->error(ctxt->userData,
				1713	"htmlParseCharRef: invalid hexadecimal value\n");
				1714	ctxt->wellFormed = 0;
				1715	val = 0;
				1716	break;
				1717	}
				1718	NEXT;
				1719	}
				1720	if (CUR == ';')
				1721	NEXT;
				1722	} else if ((CUR == '&') && (NXT(1) == '#')) {
				1723	SKIP(2);
				1724	while (CUR != ';') {
				1725	if ((CUR >= '0') && (CUR <= '9'))
				1726	val = val * 10 + (CUR - '0');
				1727	else {
				1728	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1729	ctxt->sax->error(ctxt->userData,
				1730	"htmlParseCharRef: invalid decimal value\n");
				1731	ctxt->wellFormed = 0;
				1732	val = 0;
				1733	break;
				1734	}
				1735	NEXT;
				1736	}
				1737	if (CUR == ';')
				1738	NEXT;
				1739	} else {
				1740	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1741	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				1742	ctxt->wellFormed = 0;
				1743	}
				1744	/*
				1745	* Check the value IS_CHAR ...
				1746	*/
				1747	if (IS_CHAR(val)) {
				1748	return(val);
				1749	} else {
				1750	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1751	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid CHAR value %d\n",
				1752	val);
				1753	ctxt->wellFormed = 0;
				1754	}
				1755	return(0);
				1756	}
				1757
				1758
				1759	/**
				1760	* htmlParseDocTypeDecl :
				1761	* @ctxt: an HTML parser context
				1762	*
				1763	* parse a DOCTYPE declaration
				1764	*
				1765	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				1766	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				1767	*/
				1768
				1769	void
				1770	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				1771	CHAR *name;
				1772	CHAR *ExternalID = NULL;
				1773	CHAR *URI = NULL;
				1774
				1775	/*
				1776	* We know that '<!DOCTYPE' has been detected.
				1777	*/
				1778	SKIP(9);
				1779
				1780	SKIP_BLANKS;
				1781
				1782	/*
				1783	* Parse the DOCTYPE name.
				1784	*/
				1785	name = htmlParseName(ctxt);
				1786	if (name == NULL) {
				1787	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1788	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				1789	ctxt->wellFormed = 0;
				1790	}
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1791	/*
				1792	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				1793	*/
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1794
				1795	SKIP_BLANKS;
				1796
				1797	/*
				1798	* Check for SystemID and ExternalID
				1799	*/
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	1800	URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1801	SKIP_BLANKS;
				1802
				1803	/*
				1804	* We should be at the end of the DOCTYPE declaration.
				1805	*/
				1806	if (CUR != '>') {
				1807	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1808	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				1809	ctxt->wellFormed = 0;
				1810	/* We shouldn't try to resynchronize ... */
				1811	} else {
				1812	}
				1813	NEXT;
				1814
				1815	/*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	1816	* Create the document accordingly to the DOCTYPE
				1817	*/
				1818	ctxt->myDoc = htmlNewDoc(URI, ExternalID);
				1819
				1820	/*
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1821	* Cleanup, since we don't use all those identifiers
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1822	*/
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1823	if (URI != NULL) xmlFree(URI);
				1824	if (ExternalID != NULL) xmlFree(ExternalID);
				1825	if (name != NULL) xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1826	}
				1827
				1828	/**
				1829	* htmlParseAttribute:
				1830	* @ctxt: an HTML parser context
				1831	* @value: a CHAR ** used to store the value of the attribute
				1832	*
				1833	* parse an attribute
				1834	*
				1835	* [41] Attribute ::= Name Eq AttValue
				1836	*
				1837	* [25] Eq ::= S? '=' S?
				1838	*
				1839	* With namespace:
				1840	*
				1841	* [NS 11] Attribute ::= QName Eq AttValue
				1842	*
				1843	* Also the case QName == xmlns:??? is handled independently as a namespace
				1844	* definition.
				1845	*
				1846	* Returns the attribute name, and the value in *value.
				1847	*/
				1848
				1849	CHAR *
				1850	htmlParseAttribute(htmlParserCtxtPtr ctxt, CHAR **value) {
				1851	CHAR name, val;
				1852
				1853	*value = NULL;
				1854	name = htmlParseName(ctxt);
				1855	if (name == NULL) {
				1856	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1857	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				1858	ctxt->wellFormed = 0;
				1859	return(NULL);
				1860	}
				1861
				1862	/*
				1863	* read the value
				1864	*/
				1865	SKIP_BLANKS;
				1866	if (CUR == '=') {
				1867	NEXT;
				1868	SKIP_BLANKS;
				1869	val = htmlParseAttValue(ctxt);
				1870	} else {
				1871	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1872	ctxt->sax->error(ctxt->userData,
				1873	"Specification mandate value for attribute %s\n", name);
				1874	ctxt->wellFormed = 0;
				1875	return(NULL);
				1876	}
				1877
				1878	*value = val;
				1879	return(name);
				1880	}
				1881
				1882	/**
				1883	* htmlParseStartTag:
				1884	* @ctxt: an HTML parser context
				1885	*
				1886	* parse a start of tag either for rule element or
				1887	* EmptyElement. In both case we don't parse the tag closing chars.
				1888	*
				1889	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				1890	*
				1891	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				1892	*
				1893	* With namespace:
				1894	*
				1895	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				1896	*
				1897	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				1898	*
				1899	* Returns the element name parsed
				1900	*/
				1901
				1902	CHAR *
				1903	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				1904	CHAR *name;
				1905	CHAR *attname;
				1906	CHAR *attvalue;
				1907	const CHAR **atts = NULL;
				1908	int nbatts = 0;
				1909	int maxatts = 0;
				1910	int i;
				1911
				1912	if (CUR != '<') return(NULL);
				1913	NEXT;
				1914
				1915	name = htmlParseHTMLName(ctxt);
				1916	if (name == NULL) {
				1917	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1918	ctxt->sax->error(ctxt->userData,
				1919	"htmlParseStartTag: invalid element name\n");
				1920	ctxt->wellFormed = 0;
				1921	return(NULL);
				1922	}
				1923
				1924	/*
				1925	* Check for auto-closure of HTML elements.
				1926	*/
				1927	htmlAutoClose(ctxt, name);
				1928
				1929	/*
				1930	* Now parse the attributes, it ends up with the ending
				1931	*
				1932	* (S Attribute)* S?
				1933	*/
				1934	SKIP_BLANKS;
				1935	while ((IS_CHAR(CUR)) &&
				1936	(CUR != '>') &&
				1937	((CUR != '/') \|\| (NXT(1) != '>'))) {
				1938	const CHAR *q = CUR_PTR;
				1939
				1940	attname = htmlParseAttribute(ctxt, &attvalue);
				1941	if ((attname != NULL) && (attvalue != NULL)) {
				1942	/*
				1943	* Well formedness requires at most one declaration of an attribute
				1944	*/
				1945	for (i = 0; i < nbatts;i += 2) {
				1946	if (!xmlStrcmp(atts[i], attname)) {
				1947	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1948	ctxt->sax->error(ctxt->userData, "Attribute %s redefined\n",
				1949	name);
				1950	ctxt->wellFormed = 0;
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1951	xmlFree(attname);
				1952	xmlFree(attvalue);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1953	break;
				1954	}
				1955	}
				1956
				1957	/*
				1958	* Add the pair to atts
				1959	*/
				1960	if (atts == NULL) {
				1961	maxatts = 10;
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1962	atts = (const CHAR *) xmlMalloc(maxatts sizeof(CHAR *));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1963	if (atts == NULL) {
				1964	fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillard	82150d8	1999-07-07 07:32:15 +0000	[diff] [blame]	1965	maxatts * (long)sizeof(CHAR *));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1966	return(NULL);
				1967	}
				1968	} else if (nbatts + 2 < maxatts) {
				1969	maxatts *= 2;
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1970	atts = (const CHAR *) xmlRealloc(atts, maxatts sizeof(CHAR *));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1971	if (atts == NULL) {
				1972	fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillard	82150d8	1999-07-07 07:32:15 +0000	[diff] [blame]	1973	maxatts * (long)sizeof(CHAR *));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	1974	return(NULL);
				1975	}
				1976	}
				1977	atts[nbatts++] = attname;
				1978	atts[nbatts++] = attvalue;
				1979	atts[nbatts] = NULL;
				1980	atts[nbatts + 1] = NULL;
				1981	}
				1982
				1983	SKIP_BLANKS;
				1984	if (q == CUR_PTR) {
				1985	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1986	ctxt->sax->error(ctxt->userData,
				1987	"htmlParseStartTag: problem parsing attributes\n");
				1988	ctxt->wellFormed = 0;
				1989	break;
				1990	}
				1991	}
				1992
				1993	/*
				1994	* SAX: Start of Element !
				1995	*/
				1996	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1997	ctxt->sax->startElement(ctxt->userData, name, atts);
				1998
				1999	if (atts != NULL) {
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2000	for (i = 0;i < nbatts;i++) xmlFree((CHAR *) atts[i]);
				2001	xmlFree(atts);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2002	}
				2003	return(name);
				2004	}
				2005
				2006	/**
				2007	* htmlParseEndTag:
				2008	* @ctxt: an HTML parser context
				2009	* @tagname: the tag name as parsed in the opening tag.
				2010	*
				2011	* parse an end of tag
				2012	*
				2013	* [42] ETag ::= '</' Name S? '>'
				2014	*
				2015	* With namespace
				2016	*
				2017	* [NS 9] ETag ::= '</' QName S? '>'
				2018	*/
				2019
				2020	void
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2021	htmlParseEndTag(htmlParserCtxtPtr ctxt, const CHAR *tagname) {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2022	CHAR *name;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2023	int i;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2024
				2025	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				2026	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2027	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				2028	ctxt->wellFormed = 0;
				2029	return;
				2030	}
				2031	SKIP(2);
				2032
				2033	name = htmlParseHTMLName(ctxt);
				2034
				2035	/*
				2036	* We should definitely be at the ending "S? '>'" part
				2037	*/
				2038	SKIP_BLANKS;
				2039	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				2040	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2041	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				2042	ctxt->wellFormed = 0;
				2043	} else
				2044	NEXT;
				2045
				2046	/*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2047	* Check that we are not closing an already closed tag,
				2048	* <p><b>...</p></b> is a really common error !
				2049	*/
				2050	for (i = ctxt->nodeNr - 1;i >= 0;i--) {
				2051	if ((ctxt->nodeTab[i] != NULL) &&
				2052	(!xmlStrcmp(tagname, ctxt->nodeTab[i]->name)))
				2053	break;
				2054	}
				2055	if (i < 0) {
				2056	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2057	ctxt->sax->error(ctxt->userData,
				2058	"htmlParseEndTag: unexpected close for tag %s\n",
				2059	tagname);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2060	xmlFree(name);
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2061	ctxt->wellFormed = 0;
				2062	return;
				2063	}
				2064
				2065	/*
				2066	* Check for auto-closure of HTML elements.
				2067	*/
				2068	htmlAutoCloseOnClose(ctxt, name);
				2069
				2070	/*
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2071	* Well formedness constraints, opening and closing must match.
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2072	* With the exception that the autoclose may have popped stuff out
				2073	* of the stack.
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2074	*/
				2075	if (xmlStrcmp(name, tagname)) {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2076	if ((ctxt->node != NULL) &&
				2077	(xmlStrcmp(ctxt->node->name, name))) {
				2078	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2079	ctxt->sax->error(ctxt->userData,
				2080	"Opening and ending tag mismatch: %s and %s\n",
				2081	name, ctxt->node->name);
				2082	ctxt->wellFormed = 0;
				2083	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2084	}
				2085
				2086	/*
				2087	* SAX: End of Tag
				2088	*/
				2089	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				2090	ctxt->sax->endElement(ctxt->userData, name);
				2091
				2092	if (name != NULL)
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2093	xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2094
				2095	return;
				2096	}
				2097
				2098
				2099	/**
				2100	* htmlParseReference:
				2101	* @ctxt: an HTML parser context
				2102	*
				2103	* parse and handle entity references in content,
				2104	* this will end-up in a call to character() since this is either a
				2105	* CharRef, or a predefined entity.
				2106	*/
				2107	void
				2108	htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2109	htmlEntityDescPtr ent;
				2110	CHAR out[2];
				2111	CHAR *name;
				2112	int val;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2113	if (CUR != '&') return;
				2114
				2115	if (NXT(1) == '#') {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2116	val = htmlParseCharRef(ctxt);
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	2117	/* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2118	out[0] = val;
				2119	out[1] = 0;
				2120	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				2121	ctxt->sax->characters(ctxt->userData, out, 1);
				2122	} else {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2123	ent = htmlParseEntityRef(ctxt, &name);
				2124	if (name == NULL) return; /* Shall we output & anyway ? */
				2125	if ((ent == NULL) \|\| (ent->value <= 0) \|\| (ent->value >= 255)) {
				2126	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	2127	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2128	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	2129	ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1);
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2130	}
				2131	} else {
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	2132	/* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2133	out[0] = ent->value;
				2134	out[1] = 0;
				2135	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				2136	ctxt->sax->characters(ctxt->userData, out, 1);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2137	}
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2138	xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2139	}
				2140	}
				2141
				2142	/**
				2143	* htmlParseContent:
				2144	* @ctxt: an HTML parser context
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2145	* @name: the node name
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2146	*
				2147	* Parse a content: comment, sub-element, reference or text.
				2148	*
				2149	*/
				2150
				2151	void
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2152	htmlParseContent(htmlParserCtxtPtr ctxt, const CHAR *name) {
				2153	htmlNodePtr currentNode;
				2154
				2155	currentNode = ctxt->node;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2156	while ((CUR != '<') \|\| (NXT(1) != '/')) {
				2157	const CHAR *test = CUR_PTR;
				2158
				2159	/*
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2160	* Has this node been popped out during parsing of
				2161	* the next element
				2162	*/
				2163	if (currentNode != ctxt->node) return;
				2164
				2165	/*
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2166	* First case : a comment
				2167	*/
				2168	if ((CUR == '<') && (NXT(1) == '!') &&
				2169	(NXT(2) == '-') && (NXT(3) == '-')) {
				2170	htmlParseComment(ctxt, 1);
				2171	}
				2172
				2173	/*
				2174	* Second case : a sub-element.
				2175	*/
				2176	else if (CUR == '<') {
				2177	htmlParseElement(ctxt);
				2178	}
				2179
				2180	/*
				2181	* Third case : a reference. If if has not been resolved,
				2182	* parsing returns it's Name, create the node
				2183	*/
				2184	else if (CUR == '&') {
				2185	htmlParseReference(ctxt);
				2186	}
				2187
				2188	/*
				2189	* Last case, text. Note that References are handled directly.
				2190	*/
				2191	else {
				2192	htmlParseCharData(ctxt, 0);
				2193	}
				2194
				2195	if (test == CUR_PTR) {
				2196	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2197	ctxt->sax->error(ctxt->userData,
				2198	"detected an error in element content\n");
				2199	ctxt->wellFormed = 0;
				2200	break;
				2201	}
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	2202	GROW;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2203	}
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2204
				2205	/*
				2206	* parse the end of tag: '</' should be here.
				2207	*/
				2208	htmlParseEndTag(ctxt, name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2209	}
				2210
				2211	/**
				2212	* htmlParseElement:
				2213	* @ctxt: an HTML parser context
				2214	*
				2215	* parse an HTML element, this is highly recursive
				2216	*
				2217	* [39] element ::= EmptyElemTag \| STag content ETag
				2218	*
				2219	* [41] Attribute ::= Name Eq AttValue
				2220	*/
				2221
				2222	void
				2223	htmlParseElement(htmlParserCtxtPtr ctxt) {
				2224	const CHAR *openTag = CUR_PTR;
				2225	CHAR *name;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2226	htmlNodePtr currentNode;
				2227	htmlElemDescPtr info;
Daniel Veillard	1ff7ae3	1999-09-01 12:19:13 +0000	[diff] [blame]	2228	htmlParserNodeInfo node_info;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2229
				2230	/* Capture start position */
Daniel Veillard	1ff7ae3	1999-09-01 12:19:13 +0000	[diff] [blame]	2231	if (ctxt->record_info) {
				2232	node_info.begin_pos = ctxt->input->consumed +
				2233	(CUR_PTR - ctxt->input->base);
				2234	node_info.begin_line = ctxt->input->line;
				2235	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2236
				2237	name = htmlParseStartTag(ctxt);
				2238	if (name == NULL) {
				2239	return;
				2240	}
Daniel Veillard	1ff7ae3	1999-09-01 12:19:13 +0000	[diff] [blame]	2241	currentNode = ctxt->node;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2242
				2243	/*
				2244	* Lookup the info for that element.
				2245	*/
				2246	info = htmlTagLookup(name);
				2247	if (info == NULL) {
				2248	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2249	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				2250	name);
				2251	ctxt->wellFormed = 0;
				2252	} else if (info->depr) {
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2253	/***************************
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2254	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				2255	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				2256	name);
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2257	***************************/
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2258	}
				2259
				2260	/*
				2261	* Check for an Empty Element labelled the XML/SGML way
				2262	*/
				2263	if ((CUR == '/') && (NXT(1) == '>')) {
				2264	SKIP(2);
				2265	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				2266	ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2267	xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2268	return;
				2269	}
				2270
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	2271	if (CUR == '>') {
				2272	NEXT;
				2273	} else {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2274	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2275	ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
				2276	openTag);
				2277	ctxt->wellFormed = 0;
				2278
				2279	/*
				2280	* end of parsing of this node.
				2281	*/
				2282	nodePop(ctxt);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2283	xmlFree(name);
Daniel Veillard	1ff7ae3	1999-09-01 12:19:13 +0000	[diff] [blame]	2284
				2285	/*
				2286	* Capture end position and add node
				2287	*/
				2288	if ( currentNode != NULL && ctxt->record_info ) {
				2289	node_info.end_pos = ctxt->input->consumed +
				2290	(CUR_PTR - ctxt->input->base);
				2291	node_info.end_line = ctxt->input->line;
				2292	node_info.node = currentNode;
				2293	xmlParserAddNodeInfo(ctxt, &node_info);
				2294	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2295	return;
				2296	}
				2297
				2298	/*
				2299	* Check for an Empty Element from DTD definition
				2300	*/
				2301	if ((info != NULL) && (info->empty)) {
				2302	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				2303	ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2304	xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2305	return;
				2306	}
				2307
				2308	/*
				2309	* Parse the content of the element:
				2310	*/
				2311	currentNode = ctxt->node;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2312	htmlParseContent(ctxt, name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2313
				2314	/*
				2315	* check whether the element get popped due to auto closure
				2316	* on start tag
				2317	*/
				2318	if (currentNode != ctxt->node) {
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2319	xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2320	return;
				2321	}
				2322
				2323	if (!IS_CHAR(CUR)) {
				2324	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2325	ctxt->sax->error(ctxt->userData,
				2326	"Premature end of data in tag %.30s\n", openTag);
				2327	ctxt->wellFormed = 0;
				2328
				2329	/*
				2330	* end of parsing of this node.
				2331	*/
				2332	nodePop(ctxt);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2333	xmlFree(name);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2334	return;
				2335	}
				2336
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2337	xmlFree(name);
Daniel Veillard	1ff7ae3	1999-09-01 12:19:13 +0000	[diff] [blame]	2338
				2339	/*
				2340	* Capture end position and add node
				2341	*/
				2342	if ( currentNode != NULL && ctxt->record_info ) {
				2343	node_info.end_pos = ctxt->input->consumed +
				2344	(CUR_PTR - ctxt->input->base);
				2345	node_info.end_line = ctxt->input->line;
				2346	node_info.node = currentNode;
				2347	xmlParserAddNodeInfo(ctxt, &node_info);
				2348	}
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2349	}
				2350
				2351	/**
				2352	* htmlParseDocument :
				2353	* @ctxt: an HTML parser context
				2354	*
				2355	* parse an HTML document (and build a tree if using the standard SAX
				2356	* interface).
				2357	*
				2358	* Returns 0, -1 in case of error. the parser context is augmented
				2359	* as a result of the parsing.
				2360	*/
				2361
				2362	int
				2363	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				2364	htmlDefaultSAXHandlerInit();
				2365	ctxt->html = 1;
				2366
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	2367	GROW;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2368	/*
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	2369	* SAX: beginning of the document processing.
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2370	*/
				2371	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				2372	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				2373
				2374	/*
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2375	* Wipe out everything which is before the first '<'
				2376	*/
				2377	if (IS_BLANK(CUR)) {
				2378	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2379	ctxt->sax->error(ctxt->userData,
				2380	"Extra spaces at the beginning of the document are not allowed\n");
				2381	ctxt->wellFormed = 0;
				2382	SKIP_BLANKS;
				2383	}
				2384
				2385	if (CUR == 0) {
				2386	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2387	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				2388	ctxt->wellFormed = 0;
				2389	}
				2390
				2391
				2392	/*
				2393	* Then possibly doc type declaration(s) and more Misc
				2394	* (doctypedecl Misc*)?
				2395	*/
				2396	if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2397	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				2398	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				2399	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				2400	(UPP(8) == 'E')) {
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2401	htmlParseDocTypeDecl(ctxt);
				2402	}
				2403	SKIP_BLANKS;
				2404
				2405	/*
				2406	* Create the document if not done already.
				2407	*/
				2408	if (ctxt->myDoc == NULL) {
				2409	ctxt->myDoc = htmlNewDoc(NULL, NULL);
				2410	}
				2411
				2412	/*
				2413	* Time to start parsing the tree itself
				2414	*/
				2415	htmlParseElement(ctxt);
				2416
				2417	/*
				2418	* SAX: end of the document processing.
				2419	*/
				2420	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				2421	ctxt->sax->endDocument(ctxt->userData);
				2422	if (! ctxt->wellFormed) return(-1);
				2423	return(0);
				2424	}
				2425
				2426
				2427	/********************************************************************************
				2428	* *
				2429	* Parser contexts handling *
				2430	* *
				2431	********************************************************************************/
				2432
				2433	/**
				2434	* xmlInitParserCtxt:
				2435	* @ctxt: an HTML parser context
				2436	*
				2437	* Initialize a parser context
				2438	*/
				2439
				2440	void
				2441	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				2442	{
				2443	htmlSAXHandler *sax;
				2444
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2445	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2446	if (sax == NULL) {
				2447	fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
				2448	}
				2449
				2450	/* Allocate the Input stack */
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2451	ctxt->inputTab = (htmlParserInputPtr ) xmlMalloc(5 sizeof(htmlParserInputPtr));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2452	ctxt->inputNr = 0;
				2453	ctxt->inputMax = 5;
				2454	ctxt->input = NULL;
				2455	ctxt->version = NULL;
				2456	ctxt->encoding = NULL;
				2457	ctxt->standalone = -1;
				2458
				2459	/* Allocate the Node stack */
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2460	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2461	ctxt->nodeNr = 0;
				2462	ctxt->nodeMax = 10;
				2463	ctxt->node = NULL;
				2464
				2465	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				2466	else {
				2467	ctxt->sax = sax;
				2468	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				2469	}
				2470	ctxt->userData = ctxt;
				2471	ctxt->myDoc = NULL;
				2472	ctxt->wellFormed = 1;
Daniel Veillard	5233ffc	1999-07-06 22:25:25 +0000	[diff] [blame]	2473	ctxt->replaceEntities = 0;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2474	ctxt->html = 1;
				2475	ctxt->record_info = 0;
				2476	xmlInitNodeInfoSeq(&ctxt->node_seq);
				2477	}
				2478
				2479	/**
				2480	* htmlFreeParserCtxt:
				2481	* @ctxt: an HTML parser context
				2482	*
				2483	* Free all the memory used by a parser context. However the parsed
				2484	* document in ctxt->myDoc is not freed.
				2485	*/
				2486
				2487	void
				2488	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				2489	{
				2490	htmlParserInputPtr input;
				2491
				2492	if (ctxt == NULL) return;
				2493
				2494	while ((input = inputPop(ctxt)) != NULL) {
				2495	xmlFreeInputStream(input);
				2496	}
				2497
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2498	if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
				2499	if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
				2500	if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2501	if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2502	xmlFree(ctxt->sax);
				2503	xmlFree(ctxt);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2504	}
				2505
				2506	/**
				2507	* htmlCreateDocParserCtxt :
				2508	* @cur: a pointer to an array of CHAR
				2509	* @encoding: a free form C string describing the HTML document encoding, or NULL
				2510	*
				2511	* Create a parser context for an HTML document.
				2512	*
				2513	* Returns the new parser context or NULL
				2514	*/
				2515	htmlParserCtxtPtr
				2516	htmlCreateDocParserCtxt(CHAR cur, const char encoding) {
				2517	htmlParserCtxtPtr ctxt;
				2518	htmlParserInputPtr input;
				2519	/* htmlCharEncoding enc; */
				2520
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2521	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2522	if (ctxt == NULL) {
				2523	perror("malloc");
				2524	return(NULL);
				2525	}
				2526	htmlInitParserCtxt(ctxt);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2527	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2528	if (input == NULL) {
				2529	perror("malloc");
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2530	xmlFree(ctxt);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2531	return(NULL);
				2532	}
				2533
				2534	/*
				2535	* plug some encoding conversion routines here. !!!
				2536	if (encoding != NULL) {
				2537	enc = htmlDetectCharEncoding(cur);
				2538	htmlSwitchEncoding(ctxt, enc);
				2539	}
				2540	*/
				2541
				2542	input->filename = NULL;
				2543	input->line = 1;
				2544	input->col = 1;
				2545	input->base = cur;
				2546	input->cur = cur;
				2547	input->free = NULL;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	2548	input->buf = NULL;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2549
				2550	inputPush(ctxt, input);
				2551	return(ctxt);
				2552	}
				2553
				2554	/********************************************************************************
				2555	* *
				2556	* User entry points *
				2557	* *
				2558	********************************************************************************/
				2559
				2560	/**
				2561	* htmlSAXParseDoc :
				2562	* @cur: a pointer to an array of CHAR
				2563	* @encoding: a free form C string describing the HTML document encoding, or NULL
				2564	* @sax: the SAX handler block
				2565	* @userData: if using SAX, this pointer will be provided on callbacks.
				2566	*
				2567	* parse an HTML in-memory document and build a tree.
				2568	* It use the given SAX function block to handle the parsing callback.
				2569	* If sax is NULL, fallback to the default DOM tree building routines.
				2570	*
				2571	* Returns the resulting document tree
				2572	*/
				2573
				2574	htmlDocPtr
				2575	htmlSAXParseDoc(CHAR cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				2576	htmlDocPtr ret;
				2577	htmlParserCtxtPtr ctxt;
				2578
				2579	if (cur == NULL) return(NULL);
				2580
				2581
				2582	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				2583	if (ctxt == NULL) return(NULL);
				2584	if (sax != NULL) {
				2585	ctxt->sax = sax;
				2586	ctxt->userData = userData;
				2587	}
				2588
				2589	htmlParseDocument(ctxt);
				2590	ret = ctxt->myDoc;
				2591	if (sax != NULL) {
				2592	ctxt->sax = NULL;
				2593	ctxt->userData = NULL;
				2594	}
				2595	htmlFreeParserCtxt(ctxt);
				2596
				2597	return(ret);
				2598	}
				2599
				2600	/**
				2601	* htmlParseDoc :
				2602	* @cur: a pointer to an array of CHAR
				2603	* @encoding: a free form C string describing the HTML document encoding, or NULL
				2604	*
				2605	* parse an HTML in-memory document and build a tree.
				2606	*
				2607	* Returns the resulting document tree
				2608	*/
				2609
				2610	htmlDocPtr
				2611	htmlParseDoc(CHAR cur, const char encoding) {
				2612	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				2613	}
				2614
				2615
				2616	/**
				2617	* htmlCreateFileParserCtxt :
				2618	* @filename: the filename
				2619	* @encoding: a free form C string describing the HTML document encoding, or NULL
				2620	*
				2621	* Create a parser context for a file content.
				2622	* Automatic support for ZLIB/Compress compressed document is provided
				2623	* by default if found at compile-time.
				2624	*
				2625	* Returns the new parser context or NULL
				2626	*/
				2627	htmlParserCtxtPtr
				2628	htmlCreateFileParserCtxt(const char filename, const char encoding)
				2629	{
				2630	htmlParserCtxtPtr ctxt;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2631	htmlParserInputPtr inputStream;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	2632	xmlParserInputBufferPtr buf;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2633	/* htmlCharEncoding enc; */
				2634
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	2635	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				2636	if (buf == NULL) return(NULL);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2637
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2638	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2639	if (ctxt == NULL) {
				2640	perror("malloc");
				2641	return(NULL);
				2642	}
				2643	htmlInitParserCtxt(ctxt);
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2644	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2645	if (inputStream == NULL) {
				2646	perror("malloc");
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2647	xmlFree(ctxt);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2648	return(NULL);
				2649	}
				2650
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	2651	inputStream->filename = xmlMemStrdup(filename);
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2652	inputStream->line = 1;
				2653	inputStream->col = 1;
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	2654	inputStream->buf = buf;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2655
Daniel Veillard	e2d034d	1999-07-27 19:52:06 +0000	[diff] [blame]	2656	inputStream->base = inputStream->buf->buffer->content;
				2657	inputStream->cur = inputStream->buf->buffer->content;
				2658	inputStream->free = NULL;
Daniel Veillard	be70ff7	1999-07-05 16:50:46 +0000	[diff] [blame]	2659
				2660	inputPush(ctxt, inputStream);
				2661	return(ctxt);
				2662	}
				2663
				2664	/**
				2665	* htmlSAXParseFile :
				2666	* @filename: the filename
				2667	* @encoding: a free form C string describing the HTML document encoding, or NULL
				2668	* @sax: the SAX handler block
				2669	* @userData: if using SAX, this pointer will be provided on callbacks.
				2670	*
				2671	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				2672	* compressed document is provided by default if found at compile-time.
				2673	* It use the given SAX function block to handle the parsing callback.
				2674	* If sax is NULL, fallback to the default DOM tree building routines.
				2675	*
				2676	* Returns the resulting document tree
				2677	*/
				2678
				2679	htmlDocPtr
				2680	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				2681	void *userData) {
				2682	htmlDocPtr ret;
				2683	htmlParserCtxtPtr ctxt;
				2684
				2685	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				2686	if (ctxt == NULL) return(NULL);
				2687	if (sax != NULL) {
				2688	ctxt->sax = sax;
				2689	ctxt->userData = userData;
				2690	}
				2691
				2692	htmlParseDocument(ctxt);
				2693
				2694	ret = ctxt->myDoc;
				2695	if (sax != NULL) {
				2696	ctxt->sax = NULL;
				2697	ctxt->userData = NULL;
				2698	}
				2699	htmlFreeParserCtxt(ctxt);
				2700
				2701	return(ret);
				2702	}
				2703
				2704	/**
				2705	* htmlParseFile :
				2706	* @filename: the filename
				2707	* @encoding: a free form C string describing the HTML document encoding, or NULL
				2708	*
				2709	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				2710	* compressed document is provided by default if found at compile-time.
				2711	*
				2712	* Returns the resulting document tree
				2713	*/
				2714
				2715	htmlDocPtr
				2716	htmlParseFile(const char filename, const char encoding) {
				2717	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				2718	}