blob: 3f4c16cb226291d0f78571d9fcf21dab51550773 [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#define HAVE_FCNTL_H
11#include <io.h>
12#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000013#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000014#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000015
Daniel Veillardbe70ff71999-07-05 16:50:46 +000016#include <stdio.h>
Daniel Veillardbe70ff71999-07-05 16:50:46 +000017#include <string.h> /* for memset() only */
Daniel Veillard7f7d1111999-09-22 09:46:25 +000018#ifdef HAVE_CTYPE_H
19#include <ctype.h>
20#endif
21#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000022#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000023#endif
24#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000025#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000026#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000027#ifdef HAVE_FCNTL_H
28#include <fcntl.h>
29#endif
30#ifdef HAVE_UNISTD_H
31#include <unistd.h>
32#endif
33#ifdef HAVE_ZLIB_H
34#include <zlib.h>
35#endif
36
Daniel Veillard6454aec1999-09-02 22:04:43 +000037#include "xmlmemory.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000038#include "tree.h"
39#include "HTMLparser.h"
40#include "entities.h"
41#include "encoding.h"
42#include "valid.h"
43#include "parserInternals.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000044#include "xmlIO.h"
45
46#define HTML_MAX_NAMELEN 1000
47#define INPUT_CHUNK 50
Daniel Veillardbe70ff71999-07-05 16:50:46 +000048
Daniel Veillard82150d81999-07-07 07:32:15 +000049/* #define DEBUG */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000050
51/************************************************************************
52 * *
53 * Parser stacks related functions and macros *
54 * *
55 ************************************************************************/
56
57/*
58 * Generic function for accessing stacks in the Parser Context
59 */
60
61#define PUSH_AND_POP(type, name) \
62int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
63 if (ctxt->name##Nr >= ctxt->name##Max) { \
64 ctxt->name##Max *= 2; \
Daniel Veillard6454aec1999-09-02 22:04:43 +000065 ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000066 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
67 if (ctxt->name##Tab == NULL) { \
68 fprintf(stderr, "realloc failed !\n"); \
69 exit(1); \
70 } \
71 } \
72 ctxt->name##Tab[ctxt->name##Nr] = value; \
73 ctxt->name = value; \
74 return(ctxt->name##Nr++); \
75} \
76type html##name##Pop(htmlParserCtxtPtr ctxt) { \
77 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000078 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000079 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000080 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000081 if (ctxt->name##Nr > 0) \
82 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
83 else \
84 ctxt->name = NULL; \
85 ret = ctxt->name##Tab[ctxt->name##Nr]; \
86 ctxt->name##Tab[ctxt->name##Nr] = 0; \
87 return(ret); \
88} \
89
90PUSH_AND_POP(xmlNodePtr, node)
Daniel Veillard2673d3c1999-10-08 14:37:09 +000091PUSH_AND_POP(xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000092
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) skips n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT  returns the current char value, with the full decoding of
 *            UTF-8 if we are using this mode. It returns an int.
 *   NEXT     skips to the next character, this does the proper decoding
 *            in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copies one char to *to, and increments CUR_PTR and to accordingly.
 */
120
/*
 * Raw accessors on the current input of the parser context. They all
 * expect a variable named ctxt to be in scope.
 *
 * Expansions that are expressions are wrapped in parentheses so they can
 * be used as operands. This matters for SKIP, whose body is a comma
 * expression: unparenthesized, "p = SKIP(n)" would assign the character
 * counter instead of the advanced pointer.
 *
 *   CUR      current xmlChar
 *   UPPER    toupper() of the current xmlChar
 *   SKIP(n)  add n to ctxt->nbChars and to the input pointer; the value of
 *            the expression is the advanced pointer
 *   NXT(n)   xmlChar located n positions after the current one
 *   UPP(n)   toupper() of NXT(n)
 *   CUR_PTR  the current input pointer itself
 *   SHRINK   call xmlParserInputShrink() on the current input
 *   GROW     call xmlParserInputGrow() on the current input, asking for
 *            INPUT_CHUNK
 */
#define CUR (*ctxt->input->cur)
#define UPPER (toupper(*ctxt->input->cur))
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))
#define NXT(val) (ctxt->input->cur[(val)])
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
#define CUR_PTR ctxt->input->cur
#define SHRINK xmlParserInputShrink(ctxt->input)
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000129
/*
 * SKIP_BLANKS: apply NEXT for as long as the current xmlChar satisfies
 * IS_BLANK.
 */
#define SKIP_BLANKS 							\
    while (IS_BLANK(ctxt->input->cur[0])) NEXT

#ifndef USE_UTF_8
/* CURRENT: the xmlChar at the current input position. */
#define CURRENT (ctxt->input->cur[0])

/*
 * NEXT: move to the following xmlChar of the input.
 *
 * When the current xmlChar is 0 and xmlParserInputGrow() reports nothing
 * more (result <= 0), the input is popped with xmlPopInput() and nothing
 * else is done. Otherwise the line/column bookkeeping is updated (a '\n'
 * starts a new line at column 1), the pointer and ctxt->nbChars are
 * incremented, and more input is requested if the new current xmlChar
 * is 0.
 *
 * The expansion is a brace-enclosed block, not an expression.
 */
#define NEXT {								\
    if ((ctxt->input->cur[0] == 0) &&					\
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {		\
        xmlPopInput(ctxt);						\
    } else {								\
        if (ctxt->input->cur[0] != '\n') {				\
            ctxt->input->col++;						\
        } else {							\
            ctxt->input->line++;					\
            ctxt->input->col = 1;					\
        }								\
        ctxt->input->cur++;						\
        ctxt->nbChars++;						\
        if (ctxt->input->cur[0] == 0) {					\
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		\
        }								\
    }									\
}

#else
/* No CURRENT / NEXT definitions are provided when USE_UTF_8 is defined. */
#endif
165
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000166
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000167
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000168/************************************************************************
169 * *
170 * The list of HTML elements and their properties *
171 * *
172 ************************************************************************/
173
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000174/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000175 * Start Tag: 1 means the start tag can be ommited
176 * End Tag: 1 means the end tag can be ommited
177 * 2 means it's forbidden (empty elements)
178 * Depr: this element is deprecated
179 * DTD: 1 means that this element is valid only in the Loose DTD
180 * 2 means that this element is valid only in the Frameset DTD
181 *
182 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000183 */
184htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000185{ "a", 0, 0, 0, 0, 0, "anchor " },
186{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
187{ "acronym", 0, 0, 0, 0, 0, "" },
188{ "address", 0, 0, 0, 0, 0, "information on author " },
189{ "applet", 0, 0, 0, 1, 1, "java applet " },
190{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
191{ "b", 0, 0, 0, 0, 0, "bold text style" },
192{ "base", 0, 2, 1, 0, 0, "document base uri " },
193{ "basefont", 0, 2, 1, 1, 1, "base font size " },
194{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
195{ "big", 0, 0, 0, 0, 0, "large text style" },
196{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
197{ "body", 1, 1, 0, 0, 0, "document body " },
198{ "br", 0, 2, 1, 0, 0, "forced line break " },
199{ "button", 0, 0, 0, 0, 0, "push button " },
200{ "caption", 0, 0, 0, 0, 0, "table caption " },
201{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
202{ "cite", 0, 0, 0, 0, 0, "citation" },
203{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
204{ "col", 0, 2, 1, 0, 0, "table column " },
205{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
206{ "dd", 0, 1, 0, 0, 0, "definition description " },
207{ "del", 0, 0, 0, 0, 0, "deleted text " },
208{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
209{ "dir", 0, 0, 0, 1, 1, "directory list" },
210{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
211{ "dl", 0, 0, 0, 0, 0, "definition list " },
212{ "dt", 0, 1, 0, 0, 0, "definition term " },
213{ "em", 0, 0, 0, 0, 0, "emphasis" },
214{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
215{ "font", 0, 0, 0, 1, 1, "local change to font " },
216{ "form", 0, 0, 0, 0, 0, "interactive form " },
217{ "frame", 0, 2, 1, 0, 2, "subwindow " },
218{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
219{ "h1", 0, 0, 0, 0, 0, "heading " },
220{ "h2", 0, 0, 0, 0, 0, "heading " },
221{ "h3", 0, 0, 0, 0, 0, "heading " },
222{ "h4", 0, 0, 0, 0, 0, "heading " },
223{ "h5", 0, 0, 0, 0, 0, "heading " },
224{ "h6", 0, 0, 0, 0, 0, "heading " },
225{ "head", 1, 1, 0, 0, 0, "document head " },
226{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
227{ "html", 1, 1, 0, 0, 0, "document root element " },
228{ "i", 0, 0, 0, 0, 0, "italic text style" },
229{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
230{ "img", 0, 2, 1, 0, 0, "embedded image " },
231{ "input", 0, 2, 1, 0, 0, "form control " },
232{ "ins", 0, 0, 0, 0, 0, "inserted text" },
233{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
234{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
235{ "label", 0, 0, 0, 0, 0, "form field label text " },
236{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
237{ "li", 0, 1, 0, 0, 0, "list item " },
238{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
239{ "map", 0, 0, 0, 0, 0, "client-side image map " },
240{ "menu", 0, 0, 0, 1, 1, "menu list " },
241{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
242{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
243{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
244{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
245{ "ol", 0, 0, 0, 0, 0, "ordered list " },
246{ "optgroup", 0, 0, 0, 0, 0, "option group " },
247{ "option", 0, 1, 0, 0, 0, "selectable choice " },
248{ "p", 0, 1, 0, 0, 0, "paragraph " },
249{ "param", 0, 2, 1, 0, 0, "named property value " },
250{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
251{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
252{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
253{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
254{ "script", 0, 0, 0, 0, 0, "script statements " },
255{ "select", 0, 0, 0, 0, 0, "option selector " },
256{ "small", 0, 0, 0, 0, 0, "small text style" },
257{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
258{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
259{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
260{ "style", 0, 0, 0, 0, 0, "style info " },
261{ "sub", 0, 0, 0, 0, 0, "subscript" },
262{ "sup", 0, 0, 0, 0, 0, "superscript " },
263{ "table", 0, 0, 0, 0, 0, "&#160;" },
264{ "tbody", 1, 1, 0, 0, 0, "table body " },
265{ "td", 0, 1, 0, 0, 0, "table data cell" },
266{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
267{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
268{ "th", 0, 1, 0, 0, 0, "table header cell" },
269{ "thead", 0, 1, 0, 0, 0, "table header " },
270{ "title", 0, 0, 0, 0, 0, "document title " },
271{ "tr", 0, 1, 0, 0, 0, "table row " },
272{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
273{ "u", 0, 0, 0, 1, 1, "underlined text style" },
274{ "ul", 0, 0, 0, 0, 0, "unordered list " },
275{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000276};
277
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of groups, each closed by a NULL entry, with one
 * more NULL closing the whole list. Any tag of a group implies the end of
 * the current element if the type of that element is in the same group.
 */
char *htmlEquEnd[] = {
    /* list-like items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of list */
    NULL
};
/*
 * According to the HTML DTD, HR should be added to the headings group
 * above, as it is not allowed within a H1, H2, H3, etc. But we should
 * tolerate that case because many documents contain rules in headings...
 */
294
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups, closed by one extra
 * NULL. In each group the first entry is a start tag and the following
 * entries are the names of the open elements that this start tag closes
 * (see htmlCheckAutoClose(), which compares its "new" argument with the
 * first entry and its "old" argument with the others).
 */
char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", NULL,
    "td",
        "th", "td", "p", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    /* end of list */
    NULL
};
353
Daniel Veillardb96e6431999-08-29 21:02:19 +0000354static char** htmlStartCloseIndex[100];
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000355static int htmlStartCloseIndexinitialized = 0;
356
357/************************************************************************
358 * *
359 * functions to handle HTML specific data *
360 * *
361 ************************************************************************/
362
363/**
364 * htmlInitAutoClose:
365 *
366 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
367 *
368 */
369void
370htmlInitAutoClose(void) {
371 int index, i = 0;
372
373 if (htmlStartCloseIndexinitialized) return;
374
375 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
376 index = 0;
377 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
378 htmlStartCloseIndex[index++] = &htmlStartClose[i];
379 while (htmlStartClose[i] != NULL) i++;
380 i++;
381 }
382}
383
384/**
385 * htmlTagLookup:
386 * @tag: The tag name
387 *
388 * Lookup the HTML tag in the ElementTable
389 *
390 * Returns the related htmlElemDescPtr or NULL if not found.
391 */
392htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000393htmlTagLookup(const xmlChar *tag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000394 int i = 0;
395
396 for (i = 0; i < (sizeof(html40ElementTable) /
397 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000398 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000399 return(&html40ElementTable[i]);
400 }
401 return(NULL);
402}
403
404/**
405 * htmlCheckAutoClose:
406 * @new: The new tag name
407 * @old: The old tag name
408 *
409 * Checks wether the new tag is one of the registered valid tags for closing old.
410 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
411 *
412 * Returns 0 if no, 1 if yes.
413 */
414int
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000415htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000416 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000417 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000418
419 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
420
421 /* inefficient, but not a big deal */
422 for (index = 0; index < 100;index++) {
423 close = htmlStartCloseIndex[index];
424 if (close == NULL) return(0);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000425 if (!xmlStrcmp(BAD_CAST *close, new)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000426 }
427
428 i = close - htmlStartClose;
429 i++;
430 while (htmlStartClose[i] != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000431 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000432 return(1);
433 }
434 i++;
435 }
436 return(0);
437}
438
439/**
440 * htmlAutoClose:
441 * @ctxt: an HTML parser context
442 * @new: The new tag name
443 *
444 * The HTmL DtD allows a tag to implicitely close other tags.
445 * The list is kept in htmlStartClose array. This function is
446 * called when a new tag has been detected and generates the
447 * appropriates closes if possible/needed.
448 */
449void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000450htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000451 xmlChar *oldname;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000452 while ((ctxt->name != NULL) &&
453 (htmlCheckAutoClose(new, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000454#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000455 fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000456#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000457 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000458 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000459 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000460 if (oldname != NULL) {
461#ifdef DEBUG
462 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
463#endif
464 xmlFree(oldname);
465 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000466 }
467}
468
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000469/**
470 * htmlAutoCloseOnClose:
471 * @ctxt: an HTML parser context
472 * @new: The new tag name
473 *
474 * The HTmL DtD allows an ending tag to implicitely close other tags.
475 */
476void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000477htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000478 htmlElemDescPtr info;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000479 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000480 int i;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000481
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000482#ifdef DEBUG
483 fprintf(stderr,"Close of %s stack: %d elements\n", new, ctxt->nameNr);
484 for (i = 0;i < ctxt->nameNr;i++)
485 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
486#endif
487
488 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
489 if (!xmlStrcmp(new, ctxt->nameTab[i])) break;
490 }
491 if (i < 0) return;
492
493 while (xmlStrcmp(new, ctxt->name)) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000494 info = htmlTagLookup(ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000495 if ((info == NULL) || (info->endTag == 1)) {
496#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000497 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000498#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000499 } else {
500 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
501 ctxt->sax->error(ctxt->userData,
502 "Opening and ending tag mismatch: %s and %s\n",
503 new, ctxt->name);
504 ctxt->wellFormed = 0;
505 }
506 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
507 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000508 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000509 if (oldname != NULL) {
510#ifdef DEBUG
511 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
512#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000513 xmlFree(oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000514 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000515 }
516}
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000517
518/************************************************************************
519 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000520 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000521 * *
522 ************************************************************************/
523
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000524
525htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000526/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000527 * the 4 absolute ones,
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000528 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000529{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
530{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard1566d3a1999-07-15 14:24:29 +0000531{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000532{ 60, "lt", "less-than sign, U+003C ISOnum" },
533{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000534
535/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000536 * A bunch still in the 128-255 range
537 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000538 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000539{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
540{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
541{ 162, "cent", "cent sign, U+00A2 ISOnum" },
542{ 163, "pound","pound sign, U+00A3 ISOnum" },
543{ 164, "curren","currency sign, U+00A4 ISOnum" },
544{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
545{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
546{ 167, "sect", "section sign, U+00A7 ISOnum" },
547{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
548{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
549{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
550{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
551{ 172, "not", "not sign, U+00AC ISOnum" },
552{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
553{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
554{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
555{ 176, "deg", "degree sign, U+00B0 ISOnum" },
556{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
557{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
558{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
559{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
560{ 181, "micro","micro sign, U+00B5 ISOnum" },
561{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000562{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000563{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
564{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
565{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000566{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000567{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
568{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
569{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
570{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
571{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
572{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
573{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
574{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
575{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
576{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
577{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
578{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
579{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
580{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
581{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
582{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
583{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
584{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
585{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
586{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
587{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
588{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
589{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
590{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
591{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
592{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
593{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
594{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000595{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000596{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
597{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
598{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
599{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
600{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
601{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
602{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
603{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
604{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
605{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
606{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
607{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
608{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
609{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
610{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
611{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
612{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
613{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
614{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
615{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
616{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
617{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
618{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
619{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
620{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
621{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
622{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
623{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
624{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
625{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
626{ 247, "divide","division sign, U+00F7 ISOnum" },
627{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
628{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
629{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
630{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
631{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
632{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
633{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
634{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000635
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000636/*
637 * Anything below should really be kept as entities references
638 */
639{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000640
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000641{ 913, "Alpha","greek capital letter alpha, U+0391" },
642{ 914, "Beta", "greek capital letter beta, U+0392" },
643{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
644{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
645{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
646{ 918, "Zeta", "greek capital letter zeta, U+0396" },
647{ 919, "Eta", "greek capital letter eta, U+0397" },
648{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
649{ 921, "Iota", "greek capital letter iota, U+0399" },
650{ 922, "Kappa","greek capital letter kappa, U+039A" },
651{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
652{ 924, "Mu", "greek capital letter mu, U+039C" },
653{ 925, "Nu", "greek capital letter nu, U+039D" },
654{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
655{ 927, "Omicron","greek capital letter omicron, U+039F" },
656{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
657{ 929, "Rho", "greek capital letter rho, U+03A1" },
658{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
659{ 932, "Tau", "greek capital letter tau, U+03A4" },
660{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
661{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
662{ 935, "Chi", "greek capital letter chi, U+03A7" },
663{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
664{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000665
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000666{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
667{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
668{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
669{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
670{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
671{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
672{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
673{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
674{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
675{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
676{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
677{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
678{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
679{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
680{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
681{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
682{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
683{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
684{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
685{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
686{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
687{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
688{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
689{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
690{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
691{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
692{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
693{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000694
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000695{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
696{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
697{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
698{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
699{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
700{ 8260, "frasl","fraction slash, U+2044 NEW" },
701
Daniel Veillardb05deb71999-08-10 19:04:08 +0000702{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000703{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
704{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
705{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
706{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
707{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
708{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
709{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
710{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
711{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
712{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
713{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
714{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
715{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
716{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
717{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
718
719
720{ 8704, "forall","for all, U+2200 ISOtech" },
721{ 8706, "part", "partial differential, U+2202 ISOtech" },
722{ 8707, "exist","there exists, U+2203 ISOtech" },
723{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
724{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
725{ 8712, "isin", "element of, U+2208 ISOtech" },
726{ 8713, "notin","not an element of, U+2209 ISOtech" },
727{ 8715, "ni", "contains as member, U+220B ISOtech" },
728{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
729{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
730{ 8722, "minus","minus sign, U+2212 ISOtech" },
731{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
732{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
733{ 8733, "prop", "proportional to, U+221D ISOtech" },
734{ 8734, "infin","infinity, U+221E ISOtech" },
735{ 8736, "ang", "angle, U+2220 ISOamso" },
736{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
737{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
738{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
739{ 8746, "cup", "union = cup, U+222A ISOtech" },
740{ 8747, "int", "integral, U+222B ISOtech" },
741{ 8756, "there4","therefore, U+2234 ISOtech" },
742{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
743{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
744{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
745{ 8800, "ne", "not equal to, U+2260 ISOtech" },
746{ 8801, "equiv","identical to, U+2261 ISOtech" },
747{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
748{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
749{ 8834, "sub", "subset of, U+2282 ISOtech" },
750{ 8835, "sup", "superset of, U+2283 ISOtech" },
751{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
752{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
753{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
754{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
755{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
756{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
757{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
758{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
759{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
760{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
761{ 8971, "rfloor","right floor, U+230B ISOamsc" },
762{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
763{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
764{ 9674, "loz", "lozenge, U+25CA ISOpub" },
765
766{ 9824, "spades","black spade suit, U+2660 ISOpub" },
767{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
768{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
769{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
770
771{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
772{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
773{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
774{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
775{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
776{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
777{ 732, "tilde","small tilde, U+02DC ISOdia" },
778
779{ 8194, "ensp", "en space, U+2002 ISOpub" },
780{ 8195, "emsp", "em space, U+2003 ISOpub" },
781{ 8201, "thinsp","thin space, U+2009 ISOpub" },
782{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
783{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
784{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
785{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
786{ 8211, "ndash","en dash, U+2013 ISOpub" },
787{ 8212, "mdash","em dash, U+2014 ISOpub" },
788{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
789{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
790{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
791{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
792{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
793{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
794{ 8224, "dagger","dagger, U+2020 ISOpub" },
795{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
796{ 8240, "permil","per mille sign, U+2030 ISOtech" },
797{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000798{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000799{ 8364, "euro", "euro sign, U+20AC NEW" }
800};
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000801
802/************************************************************************
803 * *
804 * Commodity functions to handle entities *
805 * *
806 ************************************************************************/
807
/*
 * Macro used to grow the current buffer.
 *
 * Doubles the integer variable <buffer>_size and reallocates @buffer to
 * that many xmlChar through xmlRealloc().  If the reallocation fails the
 * error is reported with perror() and the process is terminated with
 * exit(1).
 *
 * @buffer must be a plain identifier (it is token-pasted to form the name
 * of the size variable).  The buffer may move: callers in this file save
 * the write offset before the call and recompute their pointer afterwards.
 *
 * The body is wrapped in do { } while (0) so that "growBuffer(x);" is a
 * single statement and stays correct inside an unbraced if/else.
 */
#define growBuffer(buffer) do {                                         \
    buffer##_size *= 2;                                                 \
    buffer = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (buffer == NULL) {                                               \
        perror("realloc failed");                                       \
        exit(1);                                                        \
    }                                                                   \
} while (0)
819
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000820/**
821 * htmlEntityLookup:
822 * @name: the entity name
823 *
824 * Lookup the given entity in EntitiesTable
825 *
826 * TODO: the linear scan is really ugly, an hash table is really needed.
827 *
828 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
829 */
830htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000831htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000832 int i;
833
834 for (i = 0;i < (sizeof(html40EntitiesTable)/
835 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000836 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000837#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000838 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000839#endif
840 return(&html40EntitiesTable[i]);
841 }
842 }
843 return(NULL);
844}
845
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000846
847/**
848 * htmlDecodeEntities:
849 * @ctxt: the parser context
850 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000851 * @end: an end marker xmlChar, 0 if none
852 * @end2: an end marker xmlChar, 0 if none
853 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000854 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000855 * Subtitute the HTML entities by their value
856 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000857 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000858 *
859 * Returns A newly allocated string with the substitution done. The caller
860 * must deallocate it !
861 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000862xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000863htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000864 xmlChar end, xmlChar end2, xmlChar end3) {
865 xmlChar *buffer = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000866 int buffer_size = 0;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000867 xmlChar *out = NULL;
868 xmlChar *name = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000869
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000870 xmlChar *cur = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000871 htmlEntityDescPtr ent;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000872 int nbchars = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000873 unsigned int max = (unsigned int) len;
874
875 /*
876 * allocate a translation buffer.
877 */
878 buffer_size = 1000;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000879 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000880 if (buffer == NULL) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000881 perror("htmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000882 return(NULL);
883 }
884 out = buffer;
885
886 /*
887 * Ok loop until we reach one of the ending char or a size limit.
888 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000889 while ((nbchars < max) && (CUR != end) &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000890 (CUR != end2) && (CUR != end3)) {
891
892 if (CUR == '&') {
893 if (NXT(1) == '#') {
894 int val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000895 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000896 *out++ = val;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000897 nbchars += 3; /* !!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000898 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000899 ent = htmlParseEntityRef(ctxt, &name);
900 if (name != NULL) {
901 if ((ent == NULL) || (ent->value <= 0) ||
902 (ent->value >= 255)) {
903 *out++ = '&';
904 cur = name;
905 while (*cur != 0) {
906 if (out - buffer > buffer_size - 100) {
907 int index = out - buffer;
908
909 growBuffer(buffer);
910 out = &buffer[index];
911 }
912 *out++ = *cur++;
913 }
914 *out++ = ';';
915 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000916 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000917 *out++ = (xmlChar)ent->value;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000918 if (out - buffer > buffer_size - 100) {
919 int index = out - buffer;
920
921 growBuffer(buffer);
922 out = &buffer[index];
923 }
924 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000925 nbchars += 2 + xmlStrlen(name);
Daniel Veillard6454aec1999-09-02 22:04:43 +0000926 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000927 }
928 }
929 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000930 /* invalid for UTF-8 , use COPY(out); !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000931 *out++ = CUR;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000932 nbchars++;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000933 if (out - buffer > buffer_size - 100) {
934 int index = out - buffer;
935
936 growBuffer(buffer);
937 out = &buffer[index];
938 }
939 NEXT;
940 }
941 }
942 *out++ = 0;
943 return(buffer);
944}
945
946
947/************************************************************************
948 * *
949 * Commodity functions to handle encodings *
950 * *
951 ************************************************************************/
952
953/**
954 * htmlSwitchEncoding:
955 * @ctxt: the parser context
956 * @len: the len of @cur
957 *
958 * change the input functions when discovering the character encoding
959 * of a given entity.
960 *
961 */
962void
963htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
964{
965 switch (enc) {
966 case XML_CHAR_ENCODING_ERROR:
967 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
968 ctxt->sax->error(ctxt->userData, "encoding unknown\n");
969 ctxt->wellFormed = 0;
970 break;
971 case XML_CHAR_ENCODING_NONE:
972 /* let's assume it's UTF-8 without the XML decl */
973 return;
974 case XML_CHAR_ENCODING_UTF8:
975 /* default encoding, no conversion should be needed */
976 return;
977 case XML_CHAR_ENCODING_UTF16LE:
978 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
979 ctxt->sax->error(ctxt->userData,
980 "char encoding UTF16 little endian not supported\n");
981 break;
982 case XML_CHAR_ENCODING_UTF16BE:
983 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
984 ctxt->sax->error(ctxt->userData,
985 "char encoding UTF16 big endian not supported\n");
986 break;
987 case XML_CHAR_ENCODING_UCS4LE:
988 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
989 ctxt->sax->error(ctxt->userData,
990 "char encoding USC4 little endian not supported\n");
991 break;
992 case XML_CHAR_ENCODING_UCS4BE:
993 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
994 ctxt->sax->error(ctxt->userData,
995 "char encoding USC4 big endian not supported\n");
996 break;
997 case XML_CHAR_ENCODING_EBCDIC:
998 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
999 ctxt->sax->error(ctxt->userData,
1000 "char encoding EBCDIC not supported\n");
1001 break;
1002 case XML_CHAR_ENCODING_UCS4_2143:
1003 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1004 ctxt->sax->error(ctxt->userData,
1005 "char encoding UCS4 2143 not supported\n");
1006 break;
1007 case XML_CHAR_ENCODING_UCS4_3412:
1008 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1009 ctxt->sax->error(ctxt->userData,
1010 "char encoding UCS4 3412 not supported\n");
1011 break;
1012 case XML_CHAR_ENCODING_UCS2:
1013 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1014 ctxt->sax->error(ctxt->userData,
1015 "char encoding UCS2 not supported\n");
1016 break;
1017 case XML_CHAR_ENCODING_8859_1:
1018 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1019 ctxt->sax->error(ctxt->userData,
1020 "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
1021 break;
1022 case XML_CHAR_ENCODING_8859_2:
1023 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1024 ctxt->sax->error(ctxt->userData,
1025 "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
1026 break;
1027 case XML_CHAR_ENCODING_8859_3:
1028 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1029 ctxt->sax->error(ctxt->userData,
1030 "char encoding ISO_8859_3 not supported\n");
1031 break;
1032 case XML_CHAR_ENCODING_8859_4:
1033 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1034 ctxt->sax->error(ctxt->userData,
1035 "char encoding ISO_8859_4 not supported\n");
1036 break;
1037 case XML_CHAR_ENCODING_8859_5:
1038 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1039 ctxt->sax->error(ctxt->userData,
1040 "char encoding ISO_8859_5 not supported\n");
1041 break;
1042 case XML_CHAR_ENCODING_8859_6:
1043 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1044 ctxt->sax->error(ctxt->userData,
1045 "char encoding ISO_8859_6 not supported\n");
1046 break;
1047 case XML_CHAR_ENCODING_8859_7:
1048 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1049 ctxt->sax->error(ctxt->userData,
1050 "char encoding ISO_8859_7 not supported\n");
1051 break;
1052 case XML_CHAR_ENCODING_8859_8:
1053 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1054 ctxt->sax->error(ctxt->userData,
1055 "char encoding ISO_8859_8 not supported\n");
1056 break;
1057 case XML_CHAR_ENCODING_8859_9:
1058 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1059 ctxt->sax->error(ctxt->userData,
1060 "char encoding ISO_8859_9 not supported\n");
1061 break;
1062 case XML_CHAR_ENCODING_2022_JP:
1063 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1064 ctxt->sax->error(ctxt->userData,
1065 "char encoding ISO-2022-JPnot supported\n");
1066 break;
1067 case XML_CHAR_ENCODING_SHIFT_JIS:
1068 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1069 ctxt->sax->error(ctxt->userData,
1070 "char encoding Shift_JISnot supported\n");
1071 break;
1072 case XML_CHAR_ENCODING_EUC_JP:
1073 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1074 ctxt->sax->error(ctxt->userData,
1075 "char encoding EUC-JPnot supported\n");
1076 break;
1077 }
1078}
1079
1080
1081/************************************************************************
1082 * *
1083 * Commodity functions, cleanup needed ? *
1084 * *
1085 ************************************************************************/
1086
1087/**
1088 * areBlanks:
1089 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001090 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001091 * @len: the size of @str
1092 *
1093 * Is this a sequence of blank chars that one can ignore ?
1094 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001095 * Returns 1 if ignorable 0 otherwise.
1096 */
1097
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001098static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001099 int i;
1100 xmlNodePtr lastChild;
1101
1102 for (i = 0;i < len;i++)
1103 if (!(IS_BLANK(str[i]))) return(0);
1104
1105 if (CUR != '<') return(0);
1106 if (ctxt->node == NULL) return(0);
1107 lastChild = xmlGetLastChild(ctxt->node);
1108 if (lastChild == NULL) {
1109 if (ctxt->node->content != NULL) return(0);
1110 } else if (xmlNodeIsText(lastChild))
1111 return(0);
1112 return(1);
1113}
1114
1115/**
1116 * htmlHandleEntity:
1117 * @ctxt: an HTML parser context
1118 * @entity: an XML entity pointer.
1119 *
1120 * Default handling of an HTML entity, call the parser with the
1121 * substitution string
1122 */
1123
1124void
1125htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1126 int len;
1127
1128 if (entity->content == NULL) {
1129 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1130 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1131 entity->name);
1132 ctxt->wellFormed = 0;
1133 return;
1134 }
1135 len = xmlStrlen(entity->content);
1136
1137 /*
1138 * Just handle the content as a set of chars.
1139 */
1140 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1141 ctxt->sax->characters(ctxt->userData, entity->content, len);
1142
1143}
1144
1145/**
1146 * htmlNewDoc:
1147 * @URI: URI for the dtd, or NULL
1148 * @ExternalID: the external ID of the DTD, or NULL
1149 *
1150 * Returns a new document
1151 */
1152htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001153htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001154 xmlDocPtr cur;
1155
1156 /*
1157 * Allocate a new document and fill the fields.
1158 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001159 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001160 if (cur == NULL) {
1161 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1162 return(NULL);
1163 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001164 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001165
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001166 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001167 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001168 cur->intSubset = NULL;
Daniel Veillardb96e6431999-08-29 21:02:19 +00001169 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001170 cur->name = NULL;
1171 cur->root = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001172 cur->extSubset = NULL;
1173 cur->oldNs = NULL;
1174 cur->encoding = NULL;
1175 cur->standalone = 1;
1176 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001177 cur->ids = NULL;
1178 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001179#ifndef XML_WITHOUT_CORBA
1180 cur->_private = NULL;
1181 cur->vepv = NULL;
1182#endif
1183 return(cur);
1184}
1185
1186
1187/************************************************************************
1188 * *
1189 * The parser itself *
1190 * Relates to http://www.w3.org/TR/html40 *
1191 * *
1192 ************************************************************************/
1193
1194/************************************************************************
1195 * *
1196 * The parser itself *
1197 * *
1198 ************************************************************************/
1199
1200/**
1201 * htmlParseHTMLName:
1202 * @ctxt: an HTML parser context
1203 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001204 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001205 * since HTML names are not case-sensitive.
1206 *
1207 * Returns the Tag Name parsed or NULL
1208 */
1209
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001210xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001211htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001212 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001213 int i = 0;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001214 xmlChar loc[100];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001215
1216 if (!IS_LETTER(CUR) && (CUR != '_') &&
1217 (CUR != ':')) return(NULL);
1218
1219 while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001220 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001221 else loc[i] = CUR;
1222 i++;
1223
1224 NEXT;
1225 }
1226
1227 ret = xmlStrndup(loc, i);
1228
1229 return(ret);
1230}
1231
1232/**
1233 * htmlParseName:
1234 * @ctxt: an HTML parser context
1235 *
1236 * parse an HTML name, this routine is case sensistive.
1237 *
1238 * Returns the Name parsed or NULL
1239 */
1240
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001241xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001242htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001243 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001244 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001245
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001246 GROW;
1247 if (!IS_LETTER(CUR) && (CUR != '_')) {
1248 return(NULL);
1249 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001250
1251 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1252 (CUR == '.') || (CUR == '-') ||
1253 (CUR == '_') || (CUR == ':') ||
1254 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001255 (IS_EXTENDER(CUR))) {
1256 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001257 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001258 if (len >= HTML_MAX_NAMELEN) {
1259 fprintf(stderr,
1260 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1261 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1262 (CUR == '.') || (CUR == '-') ||
1263 (CUR == '_') || (CUR == ':') ||
1264 (IS_COMBINING(CUR)) ||
1265 (IS_EXTENDER(CUR)))
1266 NEXT;
1267 break;
1268 }
1269 }
1270 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001271}
1272
1273/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001274 * htmlParseHTMLAttribute:
1275 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001276 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001277 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001278 * parse an HTML attribute value till the stop (quote), if
1279 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001280 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001281 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001282 */
1283
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001284xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001285htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001286 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001287 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001288
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001289 GROW;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001290 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1291 if ((stop == 0) && (IS_BLANK(CUR))) break;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001292 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001293 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001294 if (len >= HTML_MAX_NAMELEN) {
1295 fprintf(stderr,
1296 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1297 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001298 (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001299 (CUR != '\'') && (CUR != '"'))
1300 NEXT;
1301 break;
1302 }
1303 }
1304 return(xmlStrndup(buf, len));
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001305}
1306
1307/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001308 * htmlParseNmtoken:
1309 * @ctxt: an HTML parser context
1310 *
1311 * parse an HTML Nmtoken.
1312 *
1313 * Returns the Nmtoken parsed or NULL
1314 */
1315
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001316xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001317htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001318 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001319 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001320
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001321 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001322 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1323 (CUR == '.') || (CUR == '-') ||
1324 (CUR == '_') || (CUR == ':') ||
1325 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001326 (IS_EXTENDER(CUR))) {
1327 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001328 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001329 if (len >= HTML_MAX_NAMELEN) {
1330 fprintf(stderr,
1331 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1332 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1333 (CUR == '.') || (CUR == '-') ||
1334 (CUR == '_') || (CUR == ':') ||
1335 (IS_COMBINING(CUR)) ||
1336 (IS_EXTENDER(CUR)))
1337 NEXT;
1338 break;
1339 }
1340 }
1341 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001342}
1343
1344/**
1345 * htmlParseEntityRef:
1346 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001347 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001348 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001349 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001350 *
1351 * [68] EntityRef ::= '&' Name ';'
1352 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001353 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1354 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001355 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001356htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001357htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1358 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001359 htmlEntityDescPtr ent = NULL;
1360 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001361
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001362 if (CUR == '&') {
1363 NEXT;
1364 name = htmlParseName(ctxt);
1365 if (name == NULL) {
1366 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1367 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1368 ctxt->wellFormed = 0;
1369 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001370 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001371 if (CUR == ';') {
1372 NEXT;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001373 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001374
1375 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001376 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001377 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001378 ent = htmlEntityLookup(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001379 } else {
1380 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1381 ctxt->sax->error(ctxt->userData,
1382 "htmlParseEntityRef: expecting ';'\n");
1383 ctxt->wellFormed = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001384 if (ctxt->sax->characters != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001385 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001386 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1387 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00001388 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001389 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001390 }
1391 }
1392 return(ent);
1393}
1394
1395/**
1396 * htmlParseAttValue:
1397 * @ctxt: an HTML parser context
1398 *
1399 * parse a value for an attribute
1400 * Note: the parser won't do substitution of entities here, this
1401 * will be handled later in xmlStringGetNodeList, unless it was
1402 * asked for ctxt->replaceEntities != 0
1403 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001404 * Returns the AttValue parsed or NULL.
1405 */
1406
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001407xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001408htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001409 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001410
1411 if (CUR == '"') {
1412 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001413 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001414 if (CUR != '"') {
1415 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1416 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1417 ctxt->wellFormed = 0;
1418 } else
1419 NEXT;
1420 } else if (CUR == '\'') {
1421 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001422 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001423 if (CUR != '\'') {
1424 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1425 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1426 ctxt->wellFormed = 0;
1427 } else
1428 NEXT;
1429 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001430 /*
1431 * That's an HTMLism, the attribute value may not be quoted
1432 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001433 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001434 if (ret == NULL) {
1435 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1436 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1437 ctxt->wellFormed = 0;
1438 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001439 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001440 return(ret);
1441}
1442
1443/**
1444 * htmlParseSystemLiteral:
1445 * @ctxt: an HTML parser context
1446 *
1447 * parse an HTML Literal
1448 *
1449 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1450 *
1451 * Returns the SystemLiteral parsed or NULL
1452 */
1453
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001454xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001455htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001456 const xmlChar *q;
1457 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001458
1459 if (CUR == '"') {
1460 NEXT;
1461 q = CUR_PTR;
1462 while ((IS_CHAR(CUR)) && (CUR != '"'))
1463 NEXT;
1464 if (!IS_CHAR(CUR)) {
1465 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1466 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1467 ctxt->wellFormed = 0;
1468 } else {
1469 ret = xmlStrndup(q, CUR_PTR - q);
1470 NEXT;
1471 }
1472 } else if (CUR == '\'') {
1473 NEXT;
1474 q = CUR_PTR;
1475 while ((IS_CHAR(CUR)) && (CUR != '\''))
1476 NEXT;
1477 if (!IS_CHAR(CUR)) {
1478 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1479 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1480 ctxt->wellFormed = 0;
1481 } else {
1482 ret = xmlStrndup(q, CUR_PTR - q);
1483 NEXT;
1484 }
1485 } else {
1486 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1487 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1488 ctxt->wellFormed = 0;
1489 }
1490
1491 return(ret);
1492}
1493
1494/**
1495 * htmlParsePubidLiteral:
1496 * @ctxt: an HTML parser context
1497 *
1498 * parse an HTML public literal
1499 *
1500 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1501 *
1502 * Returns the PubidLiteral parsed or NULL.
1503 */
1504
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001505xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001506htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001507 const xmlChar *q;
1508 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001509 /*
1510 * Name ::= (Letter | '_') (NameChar)*
1511 */
1512 if (CUR == '"') {
1513 NEXT;
1514 q = CUR_PTR;
1515 while (IS_PUBIDCHAR(CUR)) NEXT;
1516 if (CUR != '"') {
1517 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1518 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1519 ctxt->wellFormed = 0;
1520 } else {
1521 ret = xmlStrndup(q, CUR_PTR - q);
1522 NEXT;
1523 }
1524 } else if (CUR == '\'') {
1525 NEXT;
1526 q = CUR_PTR;
1527 while ((IS_LETTER(CUR)) && (CUR != '\''))
1528 NEXT;
1529 if (!IS_LETTER(CUR)) {
1530 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1531 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1532 ctxt->wellFormed = 0;
1533 } else {
1534 ret = xmlStrndup(q, CUR_PTR - q);
1535 NEXT;
1536 }
1537 } else {
1538 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1539 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1540 ctxt->wellFormed = 0;
1541 }
1542
1543 return(ret);
1544}
1545
1546/**
1547 * htmlParseCharData:
1548 * @ctxt: an HTML parser context
1549 * @cdata: int indicating whether we are within a CDATA section
1550 *
1551 * parse a CharData section.
1552 * if we are within a CDATA section ']]>' marks an end of section.
1553 *
1554 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1555 */
1556
1557void
1558htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001559 xmlChar *buf = NULL;
1560 int len = 0;
1561 int size = 100;
1562 xmlChar q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001563
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001564 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1565 if (buf == NULL) {
1566 fprintf(stderr, "malloc of %d byte failed\n", size);
1567 return;
1568 }
1569
1570 q = CUR;
1571 while ((IS_CHAR(q)) && (q != '<') &&
1572 (q != '&')) {
1573 if ((q == ']') && (NXT(1) == ']') &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001574 (NXT(2) == '>')) {
1575 if (cdata) break;
1576 else {
1577 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1578 ctxt->sax->error(ctxt->userData,
1579 "Sequence ']]>' not allowed in content\n");
1580 ctxt->wellFormed = 0;
1581 }
1582 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001583 if (len + 1 >= size) {
1584 size *= 2;
1585 buf = xmlRealloc(buf, size * sizeof(xmlChar));
1586 if (buf == NULL) {
1587 fprintf(stderr, "realloc of %d byte failed\n", size);
1588 return;
1589 }
1590 }
1591 buf[len++] = q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001592 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001593 q = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001594 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001595 if (len == 0) {
1596 xmlFree(buf);
1597 return;
1598 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001599
1600 /*
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001601 * Ok the buffer is to be consumed as chars.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001602 */
1603 if (ctxt->sax != NULL) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001604 if (areBlanks(ctxt, buf, len)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001605 if (ctxt->sax->ignorableWhitespace != NULL)
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001606 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001607 } else {
1608 if (ctxt->sax->characters != NULL)
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001609 ctxt->sax->characters(ctxt->userData, buf, len);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001610 }
1611 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001612 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001613}
1614
1615/**
1616 * htmlParseExternalID:
1617 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001618 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001619 * @strict: indicate whether we should restrict parsing to only
1620 * production [75], see NOTE below
1621 *
1622 * Parse an External ID or a Public ID
1623 *
1624 * NOTE: Productions [75] and [83] interract badly since [75] can generate
1625 * 'PUBLIC' S PubidLiteral S SystemLiteral
1626 *
1627 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1628 * | 'PUBLIC' S PubidLiteral S SystemLiteral
1629 *
1630 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1631 *
1632 * Returns the function returns SystemLiteral and in the second
1633 * case publicID receives PubidLiteral, is strict is off
1634 * it is possible to return NULL and have publicID set.
1635 */
1636
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001637xmlChar *
1638htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1639 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001640
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001641 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1642 (UPP(2) == 'S') && (UPP(3) == 'T') &&
1643 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001644 SKIP(6);
1645 if (!IS_BLANK(CUR)) {
1646 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1647 ctxt->sax->error(ctxt->userData,
1648 "Space required after 'SYSTEM'\n");
1649 ctxt->wellFormed = 0;
1650 }
1651 SKIP_BLANKS;
1652 URI = htmlParseSystemLiteral(ctxt);
1653 if (URI == NULL) {
1654 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1655 ctxt->sax->error(ctxt->userData,
1656 "htmlParseExternalID: SYSTEM, no URI\n");
1657 ctxt->wellFormed = 0;
1658 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001659 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1660 (UPP(2) == 'B') && (UPP(3) == 'L') &&
1661 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001662 SKIP(6);
1663 if (!IS_BLANK(CUR)) {
1664 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1665 ctxt->sax->error(ctxt->userData,
1666 "Space required after 'PUBLIC'\n");
1667 ctxt->wellFormed = 0;
1668 }
1669 SKIP_BLANKS;
1670 *publicID = htmlParsePubidLiteral(ctxt);
1671 if (*publicID == NULL) {
1672 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1673 ctxt->sax->error(ctxt->userData,
1674 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1675 ctxt->wellFormed = 0;
1676 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001677 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001678 if ((CUR == '"') || (CUR == '\'')) {
1679 URI = htmlParseSystemLiteral(ctxt);
1680 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001681 }
1682 return(URI);
1683}
1684
1685/**
1686 * htmlParseComment:
1687 * @ctxt: an HTML parser context
1688 * @create: should we create a node, or just skip the content
1689 *
1690 * Parse an XML (SGML) comment <!-- .... -->
1691 *
1692 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1693 */
1694void
1695htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001696 xmlChar *buf = NULL;
1697 int len = 0;
1698 int size = 100;
1699 register xmlChar s, r, q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001700
1701 /*
1702 * Check that there is a comment right here.
1703 */
1704 if ((CUR != '<') || (NXT(1) != '!') ||
1705 (NXT(2) != '-') || (NXT(3) != '-')) return;
1706
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001707 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1708 if (buf == NULL) {
1709 fprintf(stderr, "malloc of %d byte failed\n", size);
1710 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001711 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001712 q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
1713 SKIP(4);
1714 s = CUR;
1715
1716 while (IS_CHAR(s) &&
1717 ((s != '>') || (r != '-') || (q != '-'))) {
1718 if (len + 1 >= size) {
1719 size *= 2;
1720 buf = xmlRealloc(buf, size * sizeof(xmlChar));
1721 if (buf == NULL) {
1722 fprintf(stderr, "realloc of %d byte failed\n", size);
1723 return;
1724 }
1725 }
1726 buf[len++] = s;
1727 NEXT;
1728 q = r;
1729 r = s;
1730 s = CUR;
1731 }
1732 buf[len - 2] = 0;
1733 if (!IS_CHAR(s)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001734 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001735 ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001736 ctxt->wellFormed = 0;
1737 } else {
1738 NEXT;
1739 if (create) {
Daniel Veillard4c3a2031999-11-19 17:46:26 +00001740 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001741 ctxt->sax->comment(ctxt->userData, buf);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00001742 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001743 }
1744 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001745 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001746}
1747
1748/**
1749 * htmlParseCharRef:
1750 * @ctxt: an HTML parser context
1751 *
1752 * parse Reference declarations
1753 *
1754 * [66] CharRef ::= '&#' [0-9]+ ';' |
1755 * '&#x' [0-9a-fA-F]+ ';'
1756 *
1757 * Returns the value parsed (as an int)
1758 */
1759int
1760htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1761 int val = 0;
1762
1763 if ((CUR == '&') && (NXT(1) == '#') &&
1764 (NXT(2) == 'x')) {
1765 SKIP(3);
1766 while (CUR != ';') {
1767 if ((CUR >= '0') && (CUR <= '9'))
1768 val = val * 16 + (CUR - '0');
1769 else if ((CUR >= 'a') && (CUR <= 'f'))
1770 val = val * 16 + (CUR - 'a') + 10;
1771 else if ((CUR >= 'A') && (CUR <= 'F'))
1772 val = val * 16 + (CUR - 'A') + 10;
1773 else {
1774 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1775 ctxt->sax->error(ctxt->userData,
1776 "htmlParseCharRef: invalid hexadecimal value\n");
1777 ctxt->wellFormed = 0;
1778 val = 0;
1779 break;
1780 }
1781 NEXT;
1782 }
1783 if (CUR == ';')
1784 NEXT;
1785 } else if ((CUR == '&') && (NXT(1) == '#')) {
1786 SKIP(2);
1787 while (CUR != ';') {
1788 if ((CUR >= '0') && (CUR <= '9'))
1789 val = val * 10 + (CUR - '0');
1790 else {
1791 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1792 ctxt->sax->error(ctxt->userData,
1793 "htmlParseCharRef: invalid decimal value\n");
1794 ctxt->wellFormed = 0;
1795 val = 0;
1796 break;
1797 }
1798 NEXT;
1799 }
1800 if (CUR == ';')
1801 NEXT;
1802 } else {
1803 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1804 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
1805 ctxt->wellFormed = 0;
1806 }
1807 /*
1808 * Check the value IS_CHAR ...
1809 */
1810 if (IS_CHAR(val)) {
1811 return(val);
1812 } else {
1813 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001814 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001815 val);
1816 ctxt->wellFormed = 0;
1817 }
1818 return(0);
1819}
1820
1821
1822/**
1823 * htmlParseDocTypeDecl :
1824 * @ctxt: an HTML parser context
1825 *
1826 * parse a DOCTYPE declaration
1827 *
1828 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
1829 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
1830 */
1831
1832void
1833htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001834 xmlChar *name;
1835 xmlChar *ExternalID = NULL;
1836 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001837
1838 /*
1839 * We know that '<!DOCTYPE' has been detected.
1840 */
1841 SKIP(9);
1842
1843 SKIP_BLANKS;
1844
1845 /*
1846 * Parse the DOCTYPE name.
1847 */
1848 name = htmlParseName(ctxt);
1849 if (name == NULL) {
1850 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1851 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
1852 ctxt->wellFormed = 0;
1853 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001854 /*
1855 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
1856 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001857
1858 SKIP_BLANKS;
1859
1860 /*
1861 * Check for SystemID and ExternalID
1862 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001863 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001864 SKIP_BLANKS;
1865
1866 /*
1867 * We should be at the end of the DOCTYPE declaration.
1868 */
1869 if (CUR != '>') {
1870 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1871 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
1872 ctxt->wellFormed = 0;
1873 /* We shouldn't try to resynchronize ... */
1874 } else {
1875 }
1876 NEXT;
1877
1878 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001879 * Create the document accordingly to the DOCTYPE
1880 */
1881 ctxt->myDoc = htmlNewDoc(URI, ExternalID);
1882
1883 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001884 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001885 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001886 if (URI != NULL) xmlFree(URI);
1887 if (ExternalID != NULL) xmlFree(ExternalID);
1888 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001889}
1890
1891/**
1892 * htmlParseAttribute:
1893 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001894 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001895 *
1896 * parse an attribute
1897 *
1898 * [41] Attribute ::= Name Eq AttValue
1899 *
1900 * [25] Eq ::= S? '=' S?
1901 *
1902 * With namespace:
1903 *
1904 * [NS 11] Attribute ::= QName Eq AttValue
1905 *
1906 * Also the case QName == xmlns:??? is handled independently as a namespace
1907 * definition.
1908 *
1909 * Returns the attribute name, and the value in *value.
1910 */
1911
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001912xmlChar *
1913htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1914 xmlChar *name, *val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001915
1916 *value = NULL;
1917 name = htmlParseName(ctxt);
1918 if (name == NULL) {
1919 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1920 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
1921 ctxt->wellFormed = 0;
1922 return(NULL);
1923 }
1924
1925 /*
1926 * read the value
1927 */
1928 SKIP_BLANKS;
1929 if (CUR == '=') {
1930 NEXT;
1931 SKIP_BLANKS;
1932 val = htmlParseAttValue(ctxt);
1933 } else {
1934 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1935 ctxt->sax->error(ctxt->userData,
1936 "Specification mandate value for attribute %s\n", name);
1937 ctxt->wellFormed = 0;
1938 return(NULL);
1939 }
1940
1941 *value = val;
1942 return(name);
1943}
1944
1945/**
1946 * htmlParseStartTag:
1947 * @ctxt: an HTML parser context
1948 *
1949 * parse a start of tag either for rule element or
1950 * EmptyElement. In both case we don't parse the tag closing chars.
1951 *
1952 * [40] STag ::= '<' Name (S Attribute)* S? '>'
1953 *
1954 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
1955 *
1956 * With namespace:
1957 *
1958 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
1959 *
1960 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
1961 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001962 */
1963
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001964void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001965htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001966 xmlChar *name;
1967 xmlChar *attname;
1968 xmlChar *attvalue;
1969 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001970 int nbatts = 0;
1971 int maxatts = 0;
1972 int i;
1973
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001974 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001975 NEXT;
1976
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001977 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001978 name = htmlParseHTMLName(ctxt);
1979 if (name == NULL) {
1980 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1981 ctxt->sax->error(ctxt->userData,
1982 "htmlParseStartTag: invalid element name\n");
1983 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001984 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001985 }
1986
1987 /*
1988 * Check for auto-closure of HTML elements.
1989 */
1990 htmlAutoClose(ctxt, name);
1991
1992 /*
1993 * Now parse the attributes, it ends up with the ending
1994 *
1995 * (S Attribute)* S?
1996 */
1997 SKIP_BLANKS;
1998 while ((IS_CHAR(CUR)) &&
1999 (CUR != '>') &&
2000 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002001 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002002
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002003 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002004 attname = htmlParseAttribute(ctxt, &attvalue);
2005 if ((attname != NULL) && (attvalue != NULL)) {
2006 /*
2007 * Well formedness requires at most one declaration of an attribute
2008 */
2009 for (i = 0; i < nbatts;i += 2) {
2010 if (!xmlStrcmp(atts[i], attname)) {
2011 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002012 ctxt->sax->error(ctxt->userData,
2013 "Attribute %s redefined\n",
2014 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002015 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002016 xmlFree(attname);
2017 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002018 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002019 }
2020 }
2021
2022 /*
2023 * Add the pair to atts
2024 */
2025 if (atts == NULL) {
2026 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002027 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002028 if (atts == NULL) {
2029 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002030 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002031 if (name != NULL) xmlFree(name);
2032 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002033 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002034 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002035 maxatts *= 2;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002036 atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002037 if (atts == NULL) {
2038 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002039 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002040 if (name != NULL) xmlFree(name);
2041 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002042 }
2043 }
2044 atts[nbatts++] = attname;
2045 atts[nbatts++] = attvalue;
2046 atts[nbatts] = NULL;
2047 atts[nbatts + 1] = NULL;
2048 }
2049
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002050failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002051 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002052 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002053 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2054 ctxt->sax->error(ctxt->userData,
2055 "htmlParseStartTag: problem parsing attributes\n");
2056 ctxt->wellFormed = 0;
2057 break;
2058 }
2059 }
2060
2061 /*
2062 * SAX: Start of Element !
2063 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002064 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002065#ifdef DEBUG
2066 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2067#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002068 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2069 ctxt->sax->startElement(ctxt->userData, name, atts);
2070
2071 if (atts != NULL) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002072 for (i = 0;i < nbatts;i++) xmlFree((xmlChar *) atts[i]);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002073 xmlFree(atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002074 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002075 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002076}
2077
2078/**
2079 * htmlParseEndTag:
2080 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002081 *
2082 * parse an end of tag
2083 *
2084 * [42] ETag ::= '</' Name S? '>'
2085 *
2086 * With namespace
2087 *
2088 * [NS 9] ETag ::= '</' QName S? '>'
2089 */
2090
2091void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002092htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002093 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002094 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002095 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002096
2097 if ((CUR != '<') || (NXT(1) != '/')) {
2098 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2099 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2100 ctxt->wellFormed = 0;
2101 return;
2102 }
2103 SKIP(2);
2104
2105 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002106 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002107
2108 /*
2109 * We should definitely be at the ending "S? '>'" part
2110 */
2111 SKIP_BLANKS;
2112 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2113 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2114 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2115 ctxt->wellFormed = 0;
2116 } else
2117 NEXT;
2118
2119 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002120 * If the name read is not one of the element in the parsing stack
2121 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002122 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002123 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2124 if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002125 }
2126 if (i < 0) {
2127 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002128 ctxt->sax->error(ctxt->userData,
2129 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002130 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002131 ctxt->wellFormed = 0;
2132 return;
2133 }
2134
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002135
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002136 /*
2137 * Check for auto-closure of HTML elements.
2138 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002139
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002140 htmlAutoCloseOnClose(ctxt, name);
2141
2142 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002143 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002144 * With the exception that the autoclose may have popped stuff out
2145 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002146 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002147 if (xmlStrcmp(name, ctxt->name)) {
2148#ifdef DEBUG
2149 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2150#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002151 if ((ctxt->name != NULL) &&
2152 (xmlStrcmp(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002153 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2154 ctxt->sax->error(ctxt->userData,
2155 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002156 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002157 ctxt->wellFormed = 0;
2158 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002159 }
2160
2161 /*
2162 * SAX: End of Tag
2163 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002164 oldname = ctxt->name;
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002165 if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002166 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2167 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002168 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002169 if (oldname != NULL) {
2170#ifdef DEBUG
2171 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2172#endif
2173 xmlFree(oldname);
2174#ifdef DEBUG
2175 } else {
2176 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2177#endif
2178 }
2179 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002180
2181 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00002182 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002183
2184 return;
2185}
2186
2187
2188/**
2189 * htmlParseReference:
2190 * @ctxt: an HTML parser context
2191 *
2192 * parse and handle entity references in content,
2193 * this will end-up in a call to character() since this is either a
2194 * CharRef, or a predefined entity.
2195 */
2196void
2197htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002198 htmlEntityDescPtr ent;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002199 xmlChar out[2];
2200 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002201 int val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002202 if (CUR != '&') return;
2203
2204 if (NXT(1) == '#') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002205 val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +00002206 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002207 out[0] = val;
2208 out[1] = 0;
2209 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2210 ctxt->sax->characters(ctxt->userData, out, 1);
2211 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002212 ent = htmlParseEntityRef(ctxt, &name);
2213 if (name == NULL) return; /* Shall we output & anyway ? */
2214 if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2215 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002216 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002217 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillardb96e6431999-08-29 21:02:19 +00002218 ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002219 }
2220 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002221 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002222 out[0] = ent->value;
2223 out[1] = 0;
2224 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2225 ctxt->sax->characters(ctxt->userData, out, 1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002226 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002227 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002228 }
2229}
2230
2231/**
2232 * htmlParseContent:
2233 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002234 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002235 *
2236 * Parse a content: comment, sub-element, reference or text.
2237 *
2238 */
2239
2240void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002241htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002242 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002243 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002244
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002245 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002246 depth = ctxt->nameNr;
2247 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002248 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002249
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002250 GROW;
2251 /*
2252 * Our tag or one of it's parent or children is ending.
2253 */
2254 if ((CUR == '<') && (NXT(1) == '/')) {
2255 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002256 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002257 return;
2258 }
2259
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002260 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002261 * Has this node been popped out during parsing of
2262 * the next element
2263 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002264 if ((xmlStrcmp(currentNode, ctxt->name)) &&
2265 (depth >= ctxt->nameNr)) {
2266 if (currentNode != NULL) xmlFree(currentNode);
2267 return;
2268 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002269
2270 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002271 * First case : a comment
2272 */
2273 if ((CUR == '<') && (NXT(1) == '!') &&
2274 (NXT(2) == '-') && (NXT(3) == '-')) {
2275 htmlParseComment(ctxt, 1);
2276 }
2277
2278 /*
2279 * Second case : a sub-element.
2280 */
2281 else if (CUR == '<') {
2282 htmlParseElement(ctxt);
2283 }
2284
2285 /*
2286 * Third case : a reference. If if has not been resolved,
2287 * parsing returns it's Name, create the node
2288 */
2289 else if (CUR == '&') {
2290 htmlParseReference(ctxt);
2291 }
2292
2293 /*
2294 * Last case, text. Note that References are handled directly.
2295 */
2296 else {
2297 htmlParseCharData(ctxt, 0);
2298 }
2299
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002300 if (cons == ctxt->nbChars) {
Daniel Veillard35008381999-10-25 13:15:52 +00002301 if (ctxt->node != NULL) {
2302 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2303 ctxt->sax->error(ctxt->userData,
2304 "detected an error in element content\n");
2305 ctxt->wellFormed = 0;
2306 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002307 break;
2308 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002309
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002310 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002311 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002312 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002313}
2314
2315/**
2316 * htmlParseElement:
2317 * @ctxt: an HTML parser context
2318 *
2319 * parse an HTML element, this is highly recursive
2320 *
2321 * [39] element ::= EmptyElemTag | STag content ETag
2322 *
2323 * [41] Attribute ::= Name Eq AttValue
2324 */
2325
2326void
2327htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002328 const xmlChar *openTag = CUR_PTR;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002329 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002330 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00002331 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002332 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002333 htmlParserNodeInfo node_info;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002334 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002335
2336 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002337 if (ctxt->record_info) {
2338 node_info.begin_pos = ctxt->input->consumed +
2339 (CUR_PTR - ctxt->input->base);
2340 node_info.begin_line = ctxt->input->line;
2341 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002342
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002343 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002344 htmlParseStartTag(ctxt);
2345 name = ctxt->name;
2346#ifdef DEBUG
2347 if (oldname == NULL)
2348 fprintf(stderr, "Start of element %s\n", name);
2349 else if (name == NULL)
2350 fprintf(stderr, "Start of element failed, was %s\n", oldname);
2351 else
2352 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2353#endif
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002354 if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002355 (name == NULL)) {
2356 if (CUR == '>')
2357 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002358 if (oldname != NULL)
2359 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002360 return;
2361 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002362 if (oldname != NULL)
2363 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002364
2365 /*
2366 * Lookup the info for that element.
2367 */
2368 info = htmlTagLookup(name);
2369 if (info == NULL) {
2370 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2371 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2372 name);
2373 ctxt->wellFormed = 0;
2374 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002375/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002376 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2377 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2378 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002379 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002380 }
2381
2382 /*
2383 * Check for an Empty Element labelled the XML/SGML way
2384 */
2385 if ((CUR == '/') && (NXT(1) == '>')) {
2386 SKIP(2);
2387 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2388 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002389 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002390#ifdef DEBUG
2391 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2392#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002393 if (oldname != NULL)
2394 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002395 return;
2396 }
2397
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002398 if (CUR == '>') {
2399 NEXT;
2400 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002401 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2402 ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2403 openTag);
2404 ctxt->wellFormed = 0;
2405
2406 /*
2407 * end of parsing of this node.
2408 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002409 if (!xmlStrcmp(name, ctxt->name)) {
2410 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002411 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002412#ifdef DEBUG
2413 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2414#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002415 if (oldname != NULL)
2416 xmlFree(oldname);
2417 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002418
2419 /*
2420 * Capture end position and add node
2421 */
2422 if ( currentNode != NULL && ctxt->record_info ) {
2423 node_info.end_pos = ctxt->input->consumed +
2424 (CUR_PTR - ctxt->input->base);
2425 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002426 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002427 xmlParserAddNodeInfo(ctxt, &node_info);
2428 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002429 return;
2430 }
2431
2432 /*
2433 * Check for an Empty Element from DTD definition
2434 */
2435 if ((info != NULL) && (info->empty)) {
2436 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2437 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002438 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002439#ifdef DEBUG
2440 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2441#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002442 if (oldname != NULL)
2443 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002444 return;
2445 }
2446
2447 /*
2448 * Parse the content of the element:
2449 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002450 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002451 depth = ctxt->nameNr;
2452 while (IS_CHAR(CUR)) {
2453 htmlParseContent(ctxt);
2454 if (ctxt->nameNr < depth) break;
2455 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002456
2457 if (!IS_CHAR(CUR)) {
2458 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2459 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002460 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002461 ctxt->wellFormed = 0;
2462
2463 /*
2464 * end of parsing of this node.
2465 */
2466 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002467 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002468#ifdef DEBUG
2469 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2470#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002471 if (oldname != NULL)
2472 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002473 if (currentNode != NULL)
2474 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002475 return;
2476 }
2477
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002478 /*
2479 * Capture end position and add node
2480 */
2481 if ( currentNode != NULL && ctxt->record_info ) {
2482 node_info.end_pos = ctxt->input->consumed +
2483 (CUR_PTR - ctxt->input->base);
2484 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002485 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002486 xmlParserAddNodeInfo(ctxt, &node_info);
2487 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002488 if (currentNode != NULL)
2489 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002490}
2491
2492/**
2493 * htmlParseDocument :
2494 * @ctxt: an HTML parser context
2495 *
2496 * parse an HTML document (and build a tree if using the standard SAX
2497 * interface).
2498 *
2499 * Returns 0, -1 in case of error. the parser context is augmented
2500 * as a result of the parsing.
2501 */
2502
2503int
2504htmlParseDocument(htmlParserCtxtPtr ctxt) {
2505 htmlDefaultSAXHandlerInit();
2506 ctxt->html = 1;
2507
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002508 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002509 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00002510 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002511 */
2512 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2513 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2514
2515 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002516 * Wipe out everything which is before the first '<'
2517 */
Daniel Veillard35008381999-10-25 13:15:52 +00002518 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002519 if (CUR == 0) {
2520 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2521 ctxt->sax->error(ctxt->userData, "Document is empty\n");
2522 ctxt->wellFormed = 0;
2523 }
2524
Daniel Veillard35008381999-10-25 13:15:52 +00002525 /*
2526 * Parse possible comments before any content
2527 */
2528 while ((CUR == '<') && (NXT(1) == '!') &&
2529 (NXT(2) == '-') && (NXT(3) == '-')) {
2530 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2531 htmlParseComment(ctxt, 1);
2532 SKIP_BLANKS;
2533 }
2534
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002535
2536 /*
2537 * Then possibly doc type declaration(s) and more Misc
2538 * (doctypedecl Misc*)?
2539 */
2540 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002541 (UPP(2) == 'D') && (UPP(3) == 'O') &&
2542 (UPP(4) == 'C') && (UPP(5) == 'T') &&
2543 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2544 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002545 htmlParseDocTypeDecl(ctxt);
2546 }
2547 SKIP_BLANKS;
2548
2549 /*
2550 * Create the document if not done already.
2551 */
2552 if (ctxt->myDoc == NULL) {
2553 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2554 }
2555
2556 /*
2557 * Time to start parsing the tree itself
2558 */
Daniel Veillard35008381999-10-25 13:15:52 +00002559 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002560
2561 /*
2562 * SAX: end of the document processing.
2563 */
2564 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2565 ctxt->sax->endDocument(ctxt->userData);
2566 if (! ctxt->wellFormed) return(-1);
2567 return(0);
2568}
2569
2570
2571/********************************************************************************
2572 * *
2573 * Parser contexts handling *
2574 * *
2575 ********************************************************************************/
2576
2577/**
2578 * xmlInitParserCtxt:
2579 * @ctxt: an HTML parser context
2580 *
2581 * Initialize a parser context
2582 */
2583
2584void
2585htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2586{
2587 htmlSAXHandler *sax;
2588
Daniel Veillard35008381999-10-25 13:15:52 +00002589 if (ctxt == NULL) return;
2590 memset(ctxt, 0, sizeof(htmlParserCtxt));
2591
Daniel Veillard6454aec1999-09-02 22:04:43 +00002592 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002593 if (sax == NULL) {
2594 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2595 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002596 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002597
2598 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002599 ctxt->inputTab = (htmlParserInputPtr *)
2600 xmlMalloc(5 * sizeof(htmlParserInputPtr));
2601 if (ctxt->inputTab == NULL) {
2602 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2603 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002604 ctxt->inputNr = 0;
2605 ctxt->inputMax = 5;
2606 ctxt->input = NULL;
2607 ctxt->version = NULL;
2608 ctxt->encoding = NULL;
2609 ctxt->standalone = -1;
2610
2611 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002612 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002613 ctxt->nodeNr = 0;
2614 ctxt->nodeMax = 10;
2615 ctxt->node = NULL;
2616
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002617 /* Allocate the Name stack */
2618 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
2619 ctxt->nameNr = 0;
2620 ctxt->nameMax = 10;
2621 ctxt->name = NULL;
2622
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002623 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2624 else {
2625 ctxt->sax = sax;
2626 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2627 }
2628 ctxt->userData = ctxt;
2629 ctxt->myDoc = NULL;
2630 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002631 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002632 ctxt->html = 1;
2633 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00002634 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002635 ctxt->nbChars = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002636 xmlInitNodeInfoSeq(&ctxt->node_seq);
2637}
2638
2639/**
2640 * htmlFreeParserCtxt:
2641 * @ctxt: an HTML parser context
2642 *
2643 * Free all the memory used by a parser context. However the parsed
2644 * document in ctxt->myDoc is not freed.
2645 */
2646
2647void
2648htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2649{
2650 htmlParserInputPtr input;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002651 xmlChar *oldname;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002652
2653 if (ctxt == NULL) return;
2654
2655 while ((input = inputPop(ctxt)) != NULL) {
2656 xmlFreeInputStream(input);
2657 }
2658
Daniel Veillard6454aec1999-09-02 22:04:43 +00002659 if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002660 while ((oldname = htmlnamePop(ctxt)) != NULL) {
2661 xmlFree(oldname);
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002662 }
2663 if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002664 if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
2665 if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002666 if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
Daniel Veillard6454aec1999-09-02 22:04:43 +00002667 xmlFree(ctxt->sax);
2668 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002669}
2670
2671/**
2672 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002673 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002674 * @encoding: a free form C string describing the HTML document encoding, or NULL
2675 *
2676 * Create a parser context for an HTML document.
2677 *
2678 * Returns the new parser context or NULL
2679 */
2680htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002681htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002682 htmlParserCtxtPtr ctxt;
2683 htmlParserInputPtr input;
2684 /* htmlCharEncoding enc; */
2685
Daniel Veillard6454aec1999-09-02 22:04:43 +00002686 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002687 if (ctxt == NULL) {
2688 perror("malloc");
2689 return(NULL);
2690 }
2691 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002692 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002693 if (input == NULL) {
2694 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002695 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002696 return(NULL);
2697 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002698 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002699
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002700 input->line = 1;
2701 input->col = 1;
2702 input->base = cur;
2703 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002704
2705 inputPush(ctxt, input);
2706 return(ctxt);
2707}
2708
2709/********************************************************************************
2710 * *
2711 * User entry points *
2712 * *
2713 ********************************************************************************/
2714
2715/**
2716 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002717 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002718 * @encoding: a free form C string describing the HTML document encoding, or NULL
2719 * @sax: the SAX handler block
2720 * @userData: if using SAX, this pointer will be provided on callbacks.
2721 *
2722 * parse an HTML in-memory document and build a tree.
2723 * It use the given SAX function block to handle the parsing callback.
2724 * If sax is NULL, fallback to the default DOM tree building routines.
2725 *
2726 * Returns the resulting document tree
2727 */
2728
2729htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002730htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002731 htmlDocPtr ret;
2732 htmlParserCtxtPtr ctxt;
2733
2734 if (cur == NULL) return(NULL);
2735
2736
2737 ctxt = htmlCreateDocParserCtxt(cur, encoding);
2738 if (ctxt == NULL) return(NULL);
2739 if (sax != NULL) {
2740 ctxt->sax = sax;
2741 ctxt->userData = userData;
2742 }
2743
2744 htmlParseDocument(ctxt);
2745 ret = ctxt->myDoc;
2746 if (sax != NULL) {
2747 ctxt->sax = NULL;
2748 ctxt->userData = NULL;
2749 }
2750 htmlFreeParserCtxt(ctxt);
2751
2752 return(ret);
2753}
2754
2755/**
2756 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002757 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002758 * @encoding: a free form C string describing the HTML document encoding, or NULL
2759 *
2760 * parse an HTML in-memory document and build a tree.
2761 *
2762 * Returns the resulting document tree
2763 */
2764
2765htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002766htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002767 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
2768}
2769
2770
2771/**
2772 * htmlCreateFileParserCtxt :
2773 * @filename: the filename
2774 * @encoding: a free form C string describing the HTML document encoding, or NULL
2775 *
2776 * Create a parser context for a file content.
2777 * Automatic support for ZLIB/Compress compressed document is provided
2778 * by default if found at compile-time.
2779 *
2780 * Returns the new parser context or NULL
2781 */
2782htmlParserCtxtPtr
2783htmlCreateFileParserCtxt(const char *filename, const char *encoding)
2784{
2785 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002786 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002787 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002788 /* htmlCharEncoding enc; */
2789
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002790 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
2791 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002792
Daniel Veillard6454aec1999-09-02 22:04:43 +00002793 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002794 if (ctxt == NULL) {
2795 perror("malloc");
2796 return(NULL);
2797 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002798 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002799 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002800 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002801 if (inputStream == NULL) {
2802 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002803 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002804 return(NULL);
2805 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002806 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002807
Daniel Veillard6454aec1999-09-02 22:04:43 +00002808 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002809 inputStream->line = 1;
2810 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002811 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00002812 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002813
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002814 inputStream->base = inputStream->buf->buffer->content;
2815 inputStream->cur = inputStream->buf->buffer->content;
2816 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002817
2818 inputPush(ctxt, inputStream);
2819 return(ctxt);
2820}
2821
2822/**
2823 * htmlSAXParseFile :
2824 * @filename: the filename
2825 * @encoding: a free form C string describing the HTML document encoding, or NULL
2826 * @sax: the SAX handler block
2827 * @userData: if using SAX, this pointer will be provided on callbacks.
2828 *
2829 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2830 * compressed document is provided by default if found at compile-time.
2831 * It use the given SAX function block to handle the parsing callback.
2832 * If sax is NULL, fallback to the default DOM tree building routines.
2833 *
2834 * Returns the resulting document tree
2835 */
2836
2837htmlDocPtr
2838htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
2839 void *userData) {
2840 htmlDocPtr ret;
2841 htmlParserCtxtPtr ctxt;
2842
2843 ctxt = htmlCreateFileParserCtxt(filename, encoding);
2844 if (ctxt == NULL) return(NULL);
2845 if (sax != NULL) {
2846 ctxt->sax = sax;
2847 ctxt->userData = userData;
2848 }
2849
2850 htmlParseDocument(ctxt);
2851
2852 ret = ctxt->myDoc;
2853 if (sax != NULL) {
2854 ctxt->sax = NULL;
2855 ctxt->userData = NULL;
2856 }
2857 htmlFreeParserCtxt(ctxt);
2858
2859 return(ret);
2860}
2861
2862/**
2863 * htmlParseFile :
2864 * @filename: the filename
2865 * @encoding: a free form C string describing the HTML document encoding, or NULL
2866 *
2867 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2868 * compressed document is provided by default if found at compile-time.
2869 *
2870 * Returns the resulting document tree
2871 */
2872
2873htmlDocPtr
2874htmlParseFile(const char *filename, const char *encoding) {
2875 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
2876}