blob: c2177c82894644f1f09aef8f51e168923f75fdfe [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#define HAVE_FCNTL_H
11#include <io.h>
12#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000013#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000014#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000015
Daniel Veillardbe70ff71999-07-05 16:50:46 +000016#include <stdio.h>
Daniel Veillardbe70ff71999-07-05 16:50:46 +000017#include <string.h> /* for memset() only */
Daniel Veillard7f7d1111999-09-22 09:46:25 +000018#ifdef HAVE_CTYPE_H
19#include <ctype.h>
20#endif
21#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000022#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000023#endif
24#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000025#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000026#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000027#ifdef HAVE_FCNTL_H
28#include <fcntl.h>
29#endif
30#ifdef HAVE_UNISTD_H
31#include <unistd.h>
32#endif
33#ifdef HAVE_ZLIB_H
34#include <zlib.h>
35#endif
36
Daniel Veillard6454aec1999-09-02 22:04:43 +000037#include "xmlmemory.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000038#include "tree.h"
39#include "HTMLparser.h"
40#include "entities.h"
41#include "encoding.h"
42#include "valid.h"
43#include "parserInternals.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000044#include "xmlIO.h"
45
46#define HTML_MAX_NAMELEN 1000
47#define INPUT_CHUNK 50
Daniel Veillardbe70ff71999-07-05 16:50:46 +000048
Daniel Veillard82150d81999-07-07 07:32:15 +000049/* #define DEBUG */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000050
51/************************************************************************
52 * *
53 * Parser stacks related functions and macros *
54 * *
55 ************************************************************************/
56
57/*
58 * Generic function for accessing stacks in the Parser Context
59 */
60
61#define PUSH_AND_POP(type, name) \
62int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
63 if (ctxt->name##Nr >= ctxt->name##Max) { \
64 ctxt->name##Max *= 2; \
Daniel Veillard6454aec1999-09-02 22:04:43 +000065 ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000066 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
67 if (ctxt->name##Tab == NULL) { \
68 fprintf(stderr, "realloc failed !\n"); \
69 exit(1); \
70 } \
71 } \
72 ctxt->name##Tab[ctxt->name##Nr] = value; \
73 ctxt->name = value; \
74 return(ctxt->name##Nr++); \
75} \
76type html##name##Pop(htmlParserCtxtPtr ctxt) { \
77 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000078 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000079 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000080 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000081 if (ctxt->name##Nr > 0) \
82 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
83 else \
84 ctxt->name = NULL; \
85 ret = ctxt->name##Tab[ctxt->name##Nr]; \
86 ctxt->name##Tab[ctxt->name##Nr] = 0; \
87 return(ret); \
88} \
89
90PUSH_AND_POP(xmlNodePtr, node)
Daniel Veillard2673d3c1999-10-08 14:37:09 +000091PUSH_AND_POP(xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000092
93/*
94 * Macros for accessing the content. Those should be used only by the parser,
95 * and not exported.
96 *
97 * Dirty macros, i.e. one need to make assumption on the context to use them
98 *
Daniel Veillarddd6b3671999-09-23 22:19:22 +000099 * CUR_PTR return the current pointer to the xmlChar to be parsed.
100 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000101 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
102 * in UNICODE mode. This should be used internally by the parser
103 * only to compare to ASCII values otherwise it would break when
104 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000107 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000108 * it should be used only to compare on ASCII based substring.
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000109 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000110 * strings within the parser.
111 *
112 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
113 *
114 * CURRENT Returns the current char value, with the full decoding of
115 * UTF-8 if we are using this mode. It returns an int.
116 * NEXT Skip to the next character, this does the proper decoding
117 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000118 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
119 */
120
/*
 * All of these macros expect a parser context pointer named `ctxt' to be
 * in scope at the point of use.
 */

/* The xmlChar under the input cursor. */
#define CUR (*ctxt->input->cur)
/* The xmlChar under the input cursor, passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))
/*
 * Advance the input cursor and the nbChars counter by val. The expansion is
 * fully parenthesized so that it behaves as one expression wherever it is
 * used; its value is the updated cursor.
 */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))
/* The xmlChar located val positions after the input cursor. */
#define NXT(val) (ctxt->input->cur[(val)])
/* Same as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
/* The input cursor itself. */
#define CUR_PTR ctxt->input->cur
/* Call xmlParserInputShrink() on the current input. */
#define SHRINK xmlParserInputShrink(ctxt->input)
/* Call xmlParserInputGrow() on the current input, asking for INPUT_CHUNK. */
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Apply NEXT for as long as the xmlChar under the cursor satisfies IS_BLANK. */
#define SKIP_BLANKS 							\
    while (IS_BLANK(*(ctxt->input->cur))) NEXT
132
#ifndef USE_UTF_8
/* The xmlChar under the input cursor. */
#define CURRENT (*ctxt->input->cur)

/*
 * NEXT moves the input cursor forward by one xmlChar.
 *
 * If the cursor sits on a 0 and xmlParserInputGrow() reports nothing more
 * (a result <= 0), xmlPopInput() is called and nothing else happens.
 * Otherwise the line/col bookkeeping is updated for the xmlChar being left
 * (a '\n' starts a new line at column 1), the cursor and nbChars are
 * incremented, and xmlParserInputGrow() is called once more if the cursor
 * has landed on a 0.
 *
 * The expansion is a braced compound statement, not an expression.
 */
#define NEXT {								\
    if ((*ctxt->input->cur != 0) ||					\
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) > 0)) {		\
        if (*(ctxt->input->cur) != '\n') {				\
            ctxt->input->col++;						\
        } else {							\
            ctxt->input->line++;					\
            ctxt->input->col = 1;					\
        }								\
        ctxt->input->cur++;						\
        ctxt->nbChars++;						\
        if (*ctxt->input->cur == 0)					\
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		\
    } else {								\
        xmlPopInput(ctxt);						\
    }}

#else
/* Nothing is defined here when USE_UTF_8 is set. */
#endif
165
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000166
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000167
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000168/************************************************************************
169 * *
170 * The list of HTML elements and their properties *
171 * *
172 ************************************************************************/
173
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000174/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000175 * Start Tag: 1 means the start tag can be ommited
176 * End Tag: 1 means the end tag can be ommited
177 * 2 means it's forbidden (empty elements)
178 * Depr: this element is deprecated
179 * DTD: 1 means that this element is valid only in the Loose DTD
180 * 2 means that this element is valid only in the Frameset DTD
181 *
182 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000183 */
184htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000185{ "a", 0, 0, 0, 0, 0, "anchor " },
186{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
187{ "acronym", 0, 0, 0, 0, 0, "" },
188{ "address", 0, 0, 0, 0, 0, "information on author " },
189{ "applet", 0, 0, 0, 1, 1, "java applet " },
190{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
191{ "b", 0, 0, 0, 0, 0, "bold text style" },
192{ "base", 0, 2, 1, 0, 0, "document base uri " },
193{ "basefont", 0, 2, 1, 1, 1, "base font size " },
194{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
195{ "big", 0, 0, 0, 0, 0, "large text style" },
196{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
197{ "body", 1, 1, 0, 0, 0, "document body " },
198{ "br", 0, 2, 1, 0, 0, "forced line break " },
199{ "button", 0, 0, 0, 0, 0, "push button " },
200{ "caption", 0, 0, 0, 0, 0, "table caption " },
201{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
202{ "cite", 0, 0, 0, 0, 0, "citation" },
203{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
204{ "col", 0, 2, 1, 0, 0, "table column " },
205{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
206{ "dd", 0, 1, 0, 0, 0, "definition description " },
207{ "del", 0, 0, 0, 0, 0, "deleted text " },
208{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
209{ "dir", 0, 0, 0, 1, 1, "directory list" },
210{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
211{ "dl", 0, 0, 0, 0, 0, "definition list " },
212{ "dt", 0, 1, 0, 0, 0, "definition term " },
213{ "em", 0, 0, 0, 0, 0, "emphasis" },
214{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
215{ "font", 0, 0, 0, 1, 1, "local change to font " },
216{ "form", 0, 0, 0, 0, 0, "interactive form " },
217{ "frame", 0, 2, 1, 0, 2, "subwindow " },
218{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
219{ "h1", 0, 0, 0, 0, 0, "heading " },
220{ "h2", 0, 0, 0, 0, 0, "heading " },
221{ "h3", 0, 0, 0, 0, 0, "heading " },
222{ "h4", 0, 0, 0, 0, 0, "heading " },
223{ "h5", 0, 0, 0, 0, 0, "heading " },
224{ "h6", 0, 0, 0, 0, 0, "heading " },
225{ "head", 1, 1, 0, 0, 0, "document head " },
226{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
227{ "html", 1, 1, 0, 0, 0, "document root element " },
228{ "i", 0, 0, 0, 0, 0, "italic text style" },
229{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
230{ "img", 0, 2, 1, 0, 0, "embedded image " },
231{ "input", 0, 2, 1, 0, 0, "form control " },
232{ "ins", 0, 0, 0, 0, 0, "inserted text" },
233{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
234{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
235{ "label", 0, 0, 0, 0, 0, "form field label text " },
236{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
237{ "li", 0, 1, 0, 0, 0, "list item " },
238{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
239{ "map", 0, 0, 0, 0, 0, "client-side image map " },
240{ "menu", 0, 0, 0, 1, 1, "menu list " },
241{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
242{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
243{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
244{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
245{ "ol", 0, 0, 0, 0, 0, "ordered list " },
246{ "optgroup", 0, 0, 0, 0, 0, "option group " },
247{ "option", 0, 1, 0, 0, 0, "selectable choice " },
248{ "p", 0, 1, 0, 0, 0, "paragraph " },
249{ "param", 0, 2, 1, 0, 0, "named property value " },
250{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
251{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
252{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
253{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
254{ "script", 0, 0, 0, 0, 0, "script statements " },
255{ "select", 0, 0, 0, 0, 0, "option selector " },
256{ "small", 0, 0, 0, 0, 0, "small text style" },
257{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
258{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
259{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
260{ "style", 0, 0, 0, 0, 0, "style info " },
261{ "sub", 0, 0, 0, 0, 0, "subscript" },
262{ "sup", 0, 0, 0, 0, 0, "superscript " },
263{ "table", 0, 0, 0, 0, 0, "&#160;" },
264{ "tbody", 1, 1, 0, 0, 0, "table body " },
265{ "td", 0, 1, 0, 0, 0, "table data cell" },
266{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
267{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
268{ "th", 0, 1, 0, 0, 0, "table header cell" },
269{ "thead", 0, 1, 0, 0, 0, "table header " },
270{ "title", 0, 0, 0, 0, 0, "document title " },
271{ "tr", 0, 1, 0, 0, 0, "table row " },
272{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
273{ "u", 0, 0, 0, 1, 1, "underlined text style" },
274{ "ul", 0, 0, 0, 0, 0, "unordered list " },
275{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000276};
277
/*
 * Start tags that imply the end of a current element.
 * The table is a sequence of NULL-terminated groups, itself closed by one
 * more NULL: any tag of a group implies the end of the current element when
 * the type of that element belongs to the same group.
 */
char *htmlEquEnd[] = {
    "dt", "dd", "li", "option",
    NULL,
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    NULL
};
289/*
 * According to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
293 */
294
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups, itself closed by one
 * more NULL. The first name of a group is a start tag; the names that
 * follow it are the open elements which that start tag closes.
 */
char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", NULL,
    "td",
        "th", "td", "p", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL
};
353
/*
 * Lookup index over htmlStartClose (100 slots), filled in by
 * htmlInitAutoClose(), and the flag consulted before building it.
 */
static char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized;
356
357/************************************************************************
358 * *
359 * functions to handle HTML specific data *
360 * *
361 ************************************************************************/
362
363/**
364 * htmlInitAutoClose:
365 *
366 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
367 *
368 */
369void
370htmlInitAutoClose(void) {
371 int index, i = 0;
372
373 if (htmlStartCloseIndexinitialized) return;
374
375 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
376 index = 0;
377 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
378 htmlStartCloseIndex[index++] = &htmlStartClose[i];
379 while (htmlStartClose[i] != NULL) i++;
380 i++;
381 }
382}
383
384/**
385 * htmlTagLookup:
386 * @tag: The tag name
387 *
388 * Lookup the HTML tag in the ElementTable
389 *
390 * Returns the related htmlElemDescPtr or NULL if not found.
391 */
392htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000393htmlTagLookup(const xmlChar *tag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000394 int i = 0;
395
396 for (i = 0; i < (sizeof(html40ElementTable) /
397 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000398 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000399 return(&html40ElementTable[i]);
400 }
401 return(NULL);
402}
403
404/**
405 * htmlCheckAutoClose:
406 * @new: The new tag name
407 * @old: The old tag name
408 *
409 * Checks wether the new tag is one of the registered valid tags for closing old.
410 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
411 *
412 * Returns 0 if no, 1 if yes.
413 */
414int
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000415htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000416 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000417 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000418
419 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
420
421 /* inefficient, but not a big deal */
422 for (index = 0; index < 100;index++) {
423 close = htmlStartCloseIndex[index];
424 if (close == NULL) return(0);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000425 if (!xmlStrcmp(BAD_CAST *close, new)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000426 }
427
428 i = close - htmlStartClose;
429 i++;
430 while (htmlStartClose[i] != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000431 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000432 return(1);
433 }
434 i++;
435 }
436 return(0);
437}
438
439/**
440 * htmlAutoClose:
441 * @ctxt: an HTML parser context
442 * @new: The new tag name
443 *
444 * The HTmL DtD allows a tag to implicitely close other tags.
445 * The list is kept in htmlStartClose array. This function is
446 * called when a new tag has been detected and generates the
447 * appropriates closes if possible/needed.
448 */
449void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000450htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000451 xmlChar *oldname;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000452 while ((ctxt->name != NULL) &&
453 (htmlCheckAutoClose(new, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000454#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000455 fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000456#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000457 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000458 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000459 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000460 if (oldname != NULL) {
461#ifdef DEBUG
462 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
463#endif
464 xmlFree(oldname);
465 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000466 }
467}
468
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000469/**
470 * htmlAutoCloseOnClose:
471 * @ctxt: an HTML parser context
472 * @new: The new tag name
473 *
474 * The HTmL DtD allows an ending tag to implicitely close other tags.
475 */
476void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000477htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000478 htmlElemDescPtr info;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000479 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000480 int i;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000481
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000482#ifdef DEBUG
483 fprintf(stderr,"Close of %s stack: %d elements\n", new, ctxt->nameNr);
484 for (i = 0;i < ctxt->nameNr;i++)
485 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
486#endif
487
488 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
489 if (!xmlStrcmp(new, ctxt->nameTab[i])) break;
490 }
491 if (i < 0) return;
492
493 while (xmlStrcmp(new, ctxt->name)) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000494 info = htmlTagLookup(ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000495 if ((info == NULL) || (info->endTag == 1)) {
496#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000497 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000498#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000499 } else {
500 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
501 ctxt->sax->error(ctxt->userData,
502 "Opening and ending tag mismatch: %s and %s\n",
503 new, ctxt->name);
504 ctxt->wellFormed = 0;
505 }
506 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
507 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000508 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000509 if (oldname != NULL) {
510#ifdef DEBUG
511 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
512#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000513 xmlFree(oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000514 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000515 }
516}
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000517
518/************************************************************************
519 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000520 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000521 * *
522 ************************************************************************/
523
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000524
525htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000526/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000527 * the 4 absolute ones,
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000528 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000529{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
530{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard1566d3a1999-07-15 14:24:29 +0000531{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000532{ 60, "lt", "less-than sign, U+003C ISOnum" },
533{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000534
535/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000536 * A bunch still in the 128-255 range
537 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000538 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000539{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
540{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
541{ 162, "cent", "cent sign, U+00A2 ISOnum" },
542{ 163, "pound","pound sign, U+00A3 ISOnum" },
543{ 164, "curren","currency sign, U+00A4 ISOnum" },
544{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
545{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
546{ 167, "sect", "section sign, U+00A7 ISOnum" },
547{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
548{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
549{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
550{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
551{ 172, "not", "not sign, U+00AC ISOnum" },
552{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
553{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
554{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
555{ 176, "deg", "degree sign, U+00B0 ISOnum" },
556{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
557{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
558{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
559{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
560{ 181, "micro","micro sign, U+00B5 ISOnum" },
561{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000562{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000563{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
564{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
565{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000566{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000567{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
568{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
569{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
570{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
571{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
572{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
573{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
574{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
575{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
576{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
577{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
578{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
579{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
580{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
581{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
582{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
583{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
584{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
585{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
586{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
587{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
588{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
589{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
590{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
591{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
592{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
593{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
594{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000595{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000596{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
597{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
598{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
599{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
600{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
601{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
602{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
603{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
604{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
605{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
606{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
607{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
608{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
609{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
610{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
611{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
612{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
613{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
614{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
615{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
616{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
617{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
618{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
619{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
620{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
621{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
622{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
623{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
624{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
625{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
626{ 247, "divide","division sign, U+00F7 ISOnum" },
627{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
628{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
629{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
630{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
631{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
632{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
633{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
634{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000635
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000636/*
637 * Anything below should really be kept as entities references
638 */
639{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000640
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000641{ 913, "Alpha","greek capital letter alpha, U+0391" },
642{ 914, "Beta", "greek capital letter beta, U+0392" },
643{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
644{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
645{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
646{ 918, "Zeta", "greek capital letter zeta, U+0396" },
647{ 919, "Eta", "greek capital letter eta, U+0397" },
648{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
649{ 921, "Iota", "greek capital letter iota, U+0399" },
650{ 922, "Kappa","greek capital letter kappa, U+039A" },
651{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
652{ 924, "Mu", "greek capital letter mu, U+039C" },
653{ 925, "Nu", "greek capital letter nu, U+039D" },
654{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
655{ 927, "Omicron","greek capital letter omicron, U+039F" },
656{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
657{ 929, "Rho", "greek capital letter rho, U+03A1" },
658{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
659{ 932, "Tau", "greek capital letter tau, U+03A4" },
660{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
661{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
662{ 935, "Chi", "greek capital letter chi, U+03A7" },
663{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
664{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000665
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000666{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
667{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
668{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
669{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
670{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
671{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
672{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
673{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
674{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
675{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
676{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
677{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
678{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
679{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
680{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
681{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
682{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
683{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
684{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
685{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
686{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
687{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
688{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
689{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
690{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
691{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
692{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
693{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000694
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000695{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
696{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
697{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
698{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
699{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
700{ 8260, "frasl","fraction slash, U+2044 NEW" },
701
Daniel Veillardb05deb71999-08-10 19:04:08 +0000702{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000703{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
704{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
705{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
706{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
707{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
708{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
709{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
710{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
711{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
712{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
713{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
714{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
715{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
716{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
717{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
718
719
720{ 8704, "forall","for all, U+2200 ISOtech" },
721{ 8706, "part", "partial differential, U+2202 ISOtech" },
722{ 8707, "exist","there exists, U+2203 ISOtech" },
723{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
724{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
725{ 8712, "isin", "element of, U+2208 ISOtech" },
726{ 8713, "notin","not an element of, U+2209 ISOtech" },
727{ 8715, "ni", "contains as member, U+220B ISOtech" },
728{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
729{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
730{ 8722, "minus","minus sign, U+2212 ISOtech" },
731{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
732{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
733{ 8733, "prop", "proportional to, U+221D ISOtech" },
734{ 8734, "infin","infinity, U+221E ISOtech" },
735{ 8736, "ang", "angle, U+2220 ISOamso" },
736{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
737{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
738{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
739{ 8746, "cup", "union = cup, U+222A ISOtech" },
740{ 8747, "int", "integral, U+222B ISOtech" },
741{ 8756, "there4","therefore, U+2234 ISOtech" },
742{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
743{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
744{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
745{ 8800, "ne", "not equal to, U+2260 ISOtech" },
746{ 8801, "equiv","identical to, U+2261 ISOtech" },
747{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
748{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
749{ 8834, "sub", "subset of, U+2282 ISOtech" },
750{ 8835, "sup", "superset of, U+2283 ISOtech" },
751{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
752{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
753{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
754{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
755{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
756{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
757{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
758{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
759{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
760{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
761{ 8971, "rfloor","right floor, U+230B ISOamsc" },
762{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
763{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
764{ 9674, "loz", "lozenge, U+25CA ISOpub" },
765
766{ 9824, "spades","black spade suit, U+2660 ISOpub" },
767{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
768{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
769{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
770
771{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
772{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
773{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
774{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
775{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
776{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
777{ 732, "tilde","small tilde, U+02DC ISOdia" },
778
779{ 8194, "ensp", "en space, U+2002 ISOpub" },
780{ 8195, "emsp", "em space, U+2003 ISOpub" },
781{ 8201, "thinsp","thin space, U+2009 ISOpub" },
782{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
783{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
784{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
785{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
786{ 8211, "ndash","en dash, U+2013 ISOpub" },
787{ 8212, "mdash","em dash, U+2014 ISOpub" },
788{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
789{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
790{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
791{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
792{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
793{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
794{ 8224, "dagger","dagger, U+2020 ISOpub" },
795{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
796{ 8240, "permil","per mille sign, U+2030 ISOtech" },
797{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000798{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000799{ 8364, "euro", "euro sign, U+20AC NEW" }
800};
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000801
802/************************************************************************
803 * *
804 * Commodity functions to handle entities *
805 * *
806 ************************************************************************/
807
/*
 * growBuffer:
 * @buffer:  name of an xmlChar * variable; a companion integer variable
 *           called <buffer>_size must be in scope at the expansion site.
 *
 * Doubles <buffer>_size and reallocates @buffer to hold that many
 * xmlChar.  When the reallocation fails the error is reported through
 * perror() and the process exits with status 1.
 * The storage may move: pointers into the old buffer have to be
 * recomputed by the caller after the expansion.
 */
#define growBuffer(buffer) {						\
    buffer##_size = buffer##_size * 2;					\
    buffer = (xmlChar *)						\
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));		\
    if (buffer == NULL) {						\
        perror("realloc failed");					\
        exit(1);							\
    }									\
}
819
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000820/**
821 * htmlEntityLookup:
822 * @name: the entity name
823 *
824 * Lookup the given entity in EntitiesTable
825 *
826 * TODO: the linear scan is really ugly, an hash table is really needed.
827 *
828 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
829 */
830htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000831htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000832 int i;
833
834 for (i = 0;i < (sizeof(html40EntitiesTable)/
835 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000836 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000837#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000838 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000839#endif
840 return(&html40EntitiesTable[i]);
841 }
842 }
843 return(NULL);
844}
845
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000846
847/**
848 * htmlDecodeEntities:
849 * @ctxt: the parser context
850 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000851 * @end: an end marker xmlChar, 0 if none
852 * @end2: an end marker xmlChar, 0 if none
853 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000854 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000855 * Subtitute the HTML entities by their value
856 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000857 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000858 *
859 * Returns A newly allocated string with the substitution done. The caller
860 * must deallocate it !
861 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000862xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000863htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000864 xmlChar end, xmlChar end2, xmlChar end3) {
865 xmlChar *buffer = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000866 int buffer_size = 0;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000867 xmlChar *out = NULL;
868 xmlChar *name = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000869
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000870 xmlChar *cur = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000871 htmlEntityDescPtr ent;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000872 int nbchars = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000873 unsigned int max = (unsigned int) len;
874
875 /*
876 * allocate a translation buffer.
877 */
878 buffer_size = 1000;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000879 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000880 if (buffer == NULL) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000881 perror("htmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000882 return(NULL);
883 }
884 out = buffer;
885
886 /*
887 * Ok loop until we reach one of the ending char or a size limit.
888 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000889 while ((nbchars < max) && (CUR != end) &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000890 (CUR != end2) && (CUR != end3)) {
891
892 if (CUR == '&') {
893 if (NXT(1) == '#') {
894 int val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000895 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000896 *out++ = val;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000897 nbchars += 3; /* !!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000898 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000899 ent = htmlParseEntityRef(ctxt, &name);
900 if (name != NULL) {
901 if ((ent == NULL) || (ent->value <= 0) ||
902 (ent->value >= 255)) {
903 *out++ = '&';
904 cur = name;
905 while (*cur != 0) {
906 if (out - buffer > buffer_size - 100) {
907 int index = out - buffer;
908
909 growBuffer(buffer);
910 out = &buffer[index];
911 }
912 *out++ = *cur++;
913 }
914 *out++ = ';';
915 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000916 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000917 *out++ = (xmlChar)ent->value;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000918 if (out - buffer > buffer_size - 100) {
919 int index = out - buffer;
920
921 growBuffer(buffer);
922 out = &buffer[index];
923 }
924 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000925 nbchars += 2 + xmlStrlen(name);
Daniel Veillard6454aec1999-09-02 22:04:43 +0000926 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000927 }
928 }
929 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000930 /* invalid for UTF-8 , use COPY(out); !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000931 *out++ = CUR;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000932 nbchars++;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000933 if (out - buffer > buffer_size - 100) {
934 int index = out - buffer;
935
936 growBuffer(buffer);
937 out = &buffer[index];
938 }
939 NEXT;
940 }
941 }
942 *out++ = 0;
943 return(buffer);
944}
945
946
947/************************************************************************
948 * *
949 * Commodity functions to handle encodings *
950 * *
951 ************************************************************************/
952
953/**
954 * htmlSwitchEncoding:
955 * @ctxt: the parser context
956 * @len: the len of @cur
957 *
958 * change the input functions when discovering the character encoding
959 * of a given entity.
960 *
961 */
962void
963htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
964{
965 switch (enc) {
966 case XML_CHAR_ENCODING_ERROR:
967 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
968 ctxt->sax->error(ctxt->userData, "encoding unknown\n");
969 ctxt->wellFormed = 0;
970 break;
971 case XML_CHAR_ENCODING_NONE:
972 /* let's assume it's UTF-8 without the XML decl */
973 return;
974 case XML_CHAR_ENCODING_UTF8:
975 /* default encoding, no conversion should be needed */
976 return;
977 case XML_CHAR_ENCODING_UTF16LE:
978 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
979 ctxt->sax->error(ctxt->userData,
980 "char encoding UTF16 little endian not supported\n");
981 break;
982 case XML_CHAR_ENCODING_UTF16BE:
983 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
984 ctxt->sax->error(ctxt->userData,
985 "char encoding UTF16 big endian not supported\n");
986 break;
987 case XML_CHAR_ENCODING_UCS4LE:
988 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
989 ctxt->sax->error(ctxt->userData,
990 "char encoding USC4 little endian not supported\n");
991 break;
992 case XML_CHAR_ENCODING_UCS4BE:
993 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
994 ctxt->sax->error(ctxt->userData,
995 "char encoding USC4 big endian not supported\n");
996 break;
997 case XML_CHAR_ENCODING_EBCDIC:
998 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
999 ctxt->sax->error(ctxt->userData,
1000 "char encoding EBCDIC not supported\n");
1001 break;
1002 case XML_CHAR_ENCODING_UCS4_2143:
1003 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1004 ctxt->sax->error(ctxt->userData,
1005 "char encoding UCS4 2143 not supported\n");
1006 break;
1007 case XML_CHAR_ENCODING_UCS4_3412:
1008 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1009 ctxt->sax->error(ctxt->userData,
1010 "char encoding UCS4 3412 not supported\n");
1011 break;
1012 case XML_CHAR_ENCODING_UCS2:
1013 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1014 ctxt->sax->error(ctxt->userData,
1015 "char encoding UCS2 not supported\n");
1016 break;
1017 case XML_CHAR_ENCODING_8859_1:
1018 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1019 ctxt->sax->error(ctxt->userData,
1020 "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
1021 break;
1022 case XML_CHAR_ENCODING_8859_2:
1023 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1024 ctxt->sax->error(ctxt->userData,
1025 "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
1026 break;
1027 case XML_CHAR_ENCODING_8859_3:
1028 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1029 ctxt->sax->error(ctxt->userData,
1030 "char encoding ISO_8859_3 not supported\n");
1031 break;
1032 case XML_CHAR_ENCODING_8859_4:
1033 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1034 ctxt->sax->error(ctxt->userData,
1035 "char encoding ISO_8859_4 not supported\n");
1036 break;
1037 case XML_CHAR_ENCODING_8859_5:
1038 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1039 ctxt->sax->error(ctxt->userData,
1040 "char encoding ISO_8859_5 not supported\n");
1041 break;
1042 case XML_CHAR_ENCODING_8859_6:
1043 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1044 ctxt->sax->error(ctxt->userData,
1045 "char encoding ISO_8859_6 not supported\n");
1046 break;
1047 case XML_CHAR_ENCODING_8859_7:
1048 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1049 ctxt->sax->error(ctxt->userData,
1050 "char encoding ISO_8859_7 not supported\n");
1051 break;
1052 case XML_CHAR_ENCODING_8859_8:
1053 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1054 ctxt->sax->error(ctxt->userData,
1055 "char encoding ISO_8859_8 not supported\n");
1056 break;
1057 case XML_CHAR_ENCODING_8859_9:
1058 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1059 ctxt->sax->error(ctxt->userData,
1060 "char encoding ISO_8859_9 not supported\n");
1061 break;
1062 case XML_CHAR_ENCODING_2022_JP:
1063 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1064 ctxt->sax->error(ctxt->userData,
1065 "char encoding ISO-2022-JPnot supported\n");
1066 break;
1067 case XML_CHAR_ENCODING_SHIFT_JIS:
1068 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1069 ctxt->sax->error(ctxt->userData,
1070 "char encoding Shift_JISnot supported\n");
1071 break;
1072 case XML_CHAR_ENCODING_EUC_JP:
1073 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1074 ctxt->sax->error(ctxt->userData,
1075 "char encoding EUC-JPnot supported\n");
1076 break;
1077 }
1078}
1079
1080
1081/************************************************************************
1082 * *
1083 * Commodity functions, cleanup needed ? *
1084 * *
1085 ************************************************************************/
1086
1087/**
1088 * areBlanks:
1089 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001090 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001091 * @len: the size of @str
1092 *
1093 * Is this a sequence of blank chars that one can ignore ?
1094 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001095 * Returns 1 if ignorable 0 otherwise.
1096 */
1097
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001098static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001099 int i;
1100 xmlNodePtr lastChild;
1101
1102 for (i = 0;i < len;i++)
1103 if (!(IS_BLANK(str[i]))) return(0);
1104
1105 if (CUR != '<') return(0);
1106 if (ctxt->node == NULL) return(0);
1107 lastChild = xmlGetLastChild(ctxt->node);
1108 if (lastChild == NULL) {
1109 if (ctxt->node->content != NULL) return(0);
1110 } else if (xmlNodeIsText(lastChild))
1111 return(0);
1112 return(1);
1113}
1114
1115/**
1116 * htmlHandleEntity:
1117 * @ctxt: an HTML parser context
1118 * @entity: an XML entity pointer.
1119 *
1120 * Default handling of an HTML entity, call the parser with the
1121 * substitution string
1122 */
1123
1124void
1125htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1126 int len;
1127
1128 if (entity->content == NULL) {
1129 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1130 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1131 entity->name);
1132 ctxt->wellFormed = 0;
1133 return;
1134 }
1135 len = xmlStrlen(entity->content);
1136
1137 /*
1138 * Just handle the content as a set of chars.
1139 */
1140 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1141 ctxt->sax->characters(ctxt->userData, entity->content, len);
1142
1143}
1144
1145/**
1146 * htmlNewDoc:
1147 * @URI: URI for the dtd, or NULL
1148 * @ExternalID: the external ID of the DTD, or NULL
1149 *
1150 * Returns a new document
1151 */
1152htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001153htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001154 xmlDocPtr cur;
1155
1156 /*
1157 * Allocate a new document and fill the fields.
1158 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001159 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001160 if (cur == NULL) {
1161 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1162 return(NULL);
1163 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001164 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001165
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001166 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001167 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001168 cur->intSubset = NULL;
Daniel Veillardb96e6431999-08-29 21:02:19 +00001169 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001170 cur->name = NULL;
1171 cur->root = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001172 cur->extSubset = NULL;
1173 cur->oldNs = NULL;
1174 cur->encoding = NULL;
1175 cur->standalone = 1;
1176 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001177 cur->ids = NULL;
1178 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001179#ifndef XML_WITHOUT_CORBA
1180 cur->_private = NULL;
1181 cur->vepv = NULL;
1182#endif
1183 return(cur);
1184}
1185
1186
1187/************************************************************************
1188 * *
1189 * The parser itself *
1190 * Relates to http://www.w3.org/TR/html40 *
1191 * *
1192 ************************************************************************/
1193
1194/************************************************************************
1195 * *
1196 * The parser itself *
1197 * *
1198 ************************************************************************/
1199
1200/**
1201 * htmlParseHTMLName:
1202 * @ctxt: an HTML parser context
1203 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001204 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001205 * since HTML names are not case-sensitive.
1206 *
1207 * Returns the Tag Name parsed or NULL
1208 */
1209
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001210xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001211htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001212 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001213 int i = 0;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001214 xmlChar loc[100];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001215
1216 if (!IS_LETTER(CUR) && (CUR != '_') &&
1217 (CUR != ':')) return(NULL);
1218
1219 while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001220 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001221 else loc[i] = CUR;
1222 i++;
1223
1224 NEXT;
1225 }
1226
1227 ret = xmlStrndup(loc, i);
1228
1229 return(ret);
1230}
1231
1232/**
1233 * htmlParseName:
1234 * @ctxt: an HTML parser context
1235 *
1236 * parse an HTML name, this routine is case sensistive.
1237 *
1238 * Returns the Name parsed or NULL
1239 */
1240
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001241xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001242htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001243 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001244 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001245
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001246 GROW;
1247 if (!IS_LETTER(CUR) && (CUR != '_')) {
1248 return(NULL);
1249 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001250
1251 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1252 (CUR == '.') || (CUR == '-') ||
1253 (CUR == '_') || (CUR == ':') ||
1254 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001255 (IS_EXTENDER(CUR))) {
1256 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001257 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001258 if (len >= HTML_MAX_NAMELEN) {
1259 fprintf(stderr,
1260 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1261 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1262 (CUR == '.') || (CUR == '-') ||
1263 (CUR == '_') || (CUR == ':') ||
1264 (IS_COMBINING(CUR)) ||
1265 (IS_EXTENDER(CUR)))
1266 NEXT;
1267 break;
1268 }
1269 }
1270 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001271}
1272
1273/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001274 * htmlParseHTMLAttribute:
1275 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001276 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001277 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001278 * parse an HTML attribute value till the stop (quote), if
1279 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001280 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001281 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001282 */
1283
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001284xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001285htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001286 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001287 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001288
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001289 GROW;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001290 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1291 if ((stop == 0) && (IS_BLANK(CUR))) break;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001292 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001293 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001294 if (len >= HTML_MAX_NAMELEN) {
1295 fprintf(stderr,
1296 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1297 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001298 (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001299 (CUR != '\'') && (CUR != '"'))
1300 NEXT;
1301 break;
1302 }
1303 }
1304 return(xmlStrndup(buf, len));
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001305}
1306
1307/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001308 * htmlParseNmtoken:
1309 * @ctxt: an HTML parser context
1310 *
1311 * parse an HTML Nmtoken.
1312 *
1313 * Returns the Nmtoken parsed or NULL
1314 */
1315
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001316xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001317htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001318 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001319 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001320
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001321 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001322 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1323 (CUR == '.') || (CUR == '-') ||
1324 (CUR == '_') || (CUR == ':') ||
1325 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001326 (IS_EXTENDER(CUR))) {
1327 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001328 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001329 if (len >= HTML_MAX_NAMELEN) {
1330 fprintf(stderr,
1331 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1332 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1333 (CUR == '.') || (CUR == '-') ||
1334 (CUR == '_') || (CUR == ':') ||
1335 (IS_COMBINING(CUR)) ||
1336 (IS_EXTENDER(CUR)))
1337 NEXT;
1338 break;
1339 }
1340 }
1341 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001342}
1343
1344/**
1345 * htmlParseEntityRef:
1346 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001347 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001348 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001349 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001350 *
1351 * [68] EntityRef ::= '&' Name ';'
1352 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001353 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1354 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001355 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001356htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001357htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1358 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001359 htmlEntityDescPtr ent = NULL;
1360 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001361
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001362 if (CUR == '&') {
1363 NEXT;
1364 name = htmlParseName(ctxt);
1365 if (name == NULL) {
1366 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1367 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1368 ctxt->wellFormed = 0;
1369 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001370 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001371 if (CUR == ';') {
1372 NEXT;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001373 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001374
1375 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001376 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001377 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001378 ent = htmlEntityLookup(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001379 } else {
1380 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1381 ctxt->sax->error(ctxt->userData,
1382 "htmlParseEntityRef: expecting ';'\n");
1383 ctxt->wellFormed = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001384 if (ctxt->sax->characters != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001385 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001386 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1387 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00001388 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001389 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001390 }
1391 }
1392 return(ent);
1393}
1394
1395/**
1396 * htmlParseAttValue:
1397 * @ctxt: an HTML parser context
1398 *
1399 * parse a value for an attribute
1400 * Note: the parser won't do substitution of entities here, this
1401 * will be handled later in xmlStringGetNodeList, unless it was
1402 * asked for ctxt->replaceEntities != 0
1403 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001404 * Returns the AttValue parsed or NULL.
1405 */
1406
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001407xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001408htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001409 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001410
1411 if (CUR == '"') {
1412 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001413 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001414 if (CUR != '"') {
1415 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1416 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1417 ctxt->wellFormed = 0;
1418 } else
1419 NEXT;
1420 } else if (CUR == '\'') {
1421 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001422 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001423 if (CUR != '\'') {
1424 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1425 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1426 ctxt->wellFormed = 0;
1427 } else
1428 NEXT;
1429 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001430 /*
1431 * That's an HTMLism, the attribute value may not be quoted
1432 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001433 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001434 if (ret == NULL) {
1435 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1436 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1437 ctxt->wellFormed = 0;
1438 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001439 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001440 return(ret);
1441}
1442
1443/**
1444 * htmlParseSystemLiteral:
1445 * @ctxt: an HTML parser context
1446 *
1447 * parse an HTML Literal
1448 *
1449 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1450 *
1451 * Returns the SystemLiteral parsed or NULL
1452 */
1453
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001454xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001455htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001456 const xmlChar *q;
1457 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001458
1459 if (CUR == '"') {
1460 NEXT;
1461 q = CUR_PTR;
1462 while ((IS_CHAR(CUR)) && (CUR != '"'))
1463 NEXT;
1464 if (!IS_CHAR(CUR)) {
1465 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1466 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1467 ctxt->wellFormed = 0;
1468 } else {
1469 ret = xmlStrndup(q, CUR_PTR - q);
1470 NEXT;
1471 }
1472 } else if (CUR == '\'') {
1473 NEXT;
1474 q = CUR_PTR;
1475 while ((IS_CHAR(CUR)) && (CUR != '\''))
1476 NEXT;
1477 if (!IS_CHAR(CUR)) {
1478 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1479 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1480 ctxt->wellFormed = 0;
1481 } else {
1482 ret = xmlStrndup(q, CUR_PTR - q);
1483 NEXT;
1484 }
1485 } else {
1486 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1487 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1488 ctxt->wellFormed = 0;
1489 }
1490
1491 return(ret);
1492}
1493
1494/**
1495 * htmlParsePubidLiteral:
1496 * @ctxt: an HTML parser context
1497 *
1498 * parse an HTML public literal
1499 *
1500 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1501 *
1502 * Returns the PubidLiteral parsed or NULL.
1503 */
1504
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001505xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001506htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001507 const xmlChar *q;
1508 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001509 /*
1510 * Name ::= (Letter | '_') (NameChar)*
1511 */
1512 if (CUR == '"') {
1513 NEXT;
1514 q = CUR_PTR;
1515 while (IS_PUBIDCHAR(CUR)) NEXT;
1516 if (CUR != '"') {
1517 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1518 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1519 ctxt->wellFormed = 0;
1520 } else {
1521 ret = xmlStrndup(q, CUR_PTR - q);
1522 NEXT;
1523 }
1524 } else if (CUR == '\'') {
1525 NEXT;
1526 q = CUR_PTR;
1527 while ((IS_LETTER(CUR)) && (CUR != '\''))
1528 NEXT;
1529 if (!IS_LETTER(CUR)) {
1530 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1531 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1532 ctxt->wellFormed = 0;
1533 } else {
1534 ret = xmlStrndup(q, CUR_PTR - q);
1535 NEXT;
1536 }
1537 } else {
1538 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1539 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1540 ctxt->wellFormed = 0;
1541 }
1542
1543 return(ret);
1544}
1545
1546/**
1547 * htmlParseCharData:
1548 * @ctxt: an HTML parser context
1549 * @cdata: int indicating whether we are within a CDATA section
1550 *
1551 * parse a CharData section.
1552 * if we are within a CDATA section ']]>' marks an end of section.
1553 *
1554 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1555 */
1556
1557void
1558htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001559 xmlChar *buf = NULL;
1560 int len = 0;
1561 int size = 100;
1562 xmlChar q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001563
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001564 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1565 if (buf == NULL) {
1566 fprintf(stderr, "malloc of %d byte failed\n", size);
1567 return;
1568 }
1569
1570 q = CUR;
1571 while ((IS_CHAR(q)) && (q != '<') &&
1572 (q != '&')) {
1573 if ((q == ']') && (NXT(1) == ']') &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001574 (NXT(2) == '>')) {
1575 if (cdata) break;
1576 else {
1577 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1578 ctxt->sax->error(ctxt->userData,
1579 "Sequence ']]>' not allowed in content\n");
1580 ctxt->wellFormed = 0;
1581 }
1582 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001583 if (len + 1 >= size) {
1584 size *= 2;
1585 buf = xmlRealloc(buf, size * sizeof(xmlChar));
1586 if (buf == NULL) {
1587 fprintf(stderr, "realloc of %d byte failed\n", size);
1588 return;
1589 }
1590 }
1591 buf[len++] = q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001592 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001593 q = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001594 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001595 if (len == 0) {
1596 xmlFree(buf);
1597 return;
1598 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001599
1600 /*
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001601 * Ok the buffer is to be consumed as chars.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001602 */
1603 if (ctxt->sax != NULL) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001604 if (areBlanks(ctxt, buf, len)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001605 if (ctxt->sax->ignorableWhitespace != NULL)
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001606 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001607 } else {
1608 if (ctxt->sax->characters != NULL)
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001609 ctxt->sax->characters(ctxt->userData, buf, len);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001610 }
1611 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001612 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001613}
1614
1615/**
1616 * htmlParseExternalID:
1617 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001618 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001619 * @strict: indicate whether we should restrict parsing to only
1620 * production [75], see NOTE below
1621 *
1622 * Parse an External ID or a Public ID
1623 *
1624 * NOTE: Productions [75] and [83] interract badly since [75] can generate
1625 * 'PUBLIC' S PubidLiteral S SystemLiteral
1626 *
1627 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1628 * | 'PUBLIC' S PubidLiteral S SystemLiteral
1629 *
1630 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1631 *
1632 * Returns the function returns SystemLiteral and in the second
1633 * case publicID receives PubidLiteral, is strict is off
1634 * it is possible to return NULL and have publicID set.
1635 */
1636
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001637xmlChar *
1638htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1639 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001640
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001641 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1642 (UPP(2) == 'S') && (UPP(3) == 'T') &&
1643 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001644 SKIP(6);
1645 if (!IS_BLANK(CUR)) {
1646 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1647 ctxt->sax->error(ctxt->userData,
1648 "Space required after 'SYSTEM'\n");
1649 ctxt->wellFormed = 0;
1650 }
1651 SKIP_BLANKS;
1652 URI = htmlParseSystemLiteral(ctxt);
1653 if (URI == NULL) {
1654 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1655 ctxt->sax->error(ctxt->userData,
1656 "htmlParseExternalID: SYSTEM, no URI\n");
1657 ctxt->wellFormed = 0;
1658 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001659 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1660 (UPP(2) == 'B') && (UPP(3) == 'L') &&
1661 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001662 SKIP(6);
1663 if (!IS_BLANK(CUR)) {
1664 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1665 ctxt->sax->error(ctxt->userData,
1666 "Space required after 'PUBLIC'\n");
1667 ctxt->wellFormed = 0;
1668 }
1669 SKIP_BLANKS;
1670 *publicID = htmlParsePubidLiteral(ctxt);
1671 if (*publicID == NULL) {
1672 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1673 ctxt->sax->error(ctxt->userData,
1674 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1675 ctxt->wellFormed = 0;
1676 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001677 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001678 if ((CUR == '"') || (CUR == '\'')) {
1679 URI = htmlParseSystemLiteral(ctxt);
1680 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001681 }
1682 return(URI);
1683}
1684
1685/**
1686 * htmlParseComment:
1687 * @ctxt: an HTML parser context
1688 * @create: should we create a node, or just skip the content
1689 *
1690 * Parse an XML (SGML) comment <!-- .... -->
1691 *
1692 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1693 */
1694void
1695htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001696 xmlChar *buf = NULL;
1697 int len = 0;
1698 int size = 100;
1699 register xmlChar s, r, q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001700
1701 /*
1702 * Check that there is a comment right here.
1703 */
1704 if ((CUR != '<') || (NXT(1) != '!') ||
1705 (NXT(2) != '-') || (NXT(3) != '-')) return;
1706
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001707 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1708 if (buf == NULL) {
1709 fprintf(stderr, "malloc of %d byte failed\n", size);
1710 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001711 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001712 q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
1713 SKIP(4);
1714 s = CUR;
1715
1716 while (IS_CHAR(s) &&
1717 ((s != '>') || (r != '-') || (q != '-'))) {
1718 if (len + 1 >= size) {
1719 size *= 2;
1720 buf = xmlRealloc(buf, size * sizeof(xmlChar));
1721 if (buf == NULL) {
1722 fprintf(stderr, "realloc of %d byte failed\n", size);
1723 return;
1724 }
1725 }
1726 buf[len++] = s;
1727 NEXT;
1728 q = r;
1729 r = s;
1730 s = CUR;
1731 }
1732 buf[len - 2] = 0;
1733 if (!IS_CHAR(s)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001734 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001735 ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001736 ctxt->wellFormed = 0;
1737 } else {
1738 NEXT;
1739 if (create) {
Daniel Veillard4c3a2031999-11-19 17:46:26 +00001740 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001741 ctxt->sax->comment(ctxt->userData, buf);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00001742 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001743 }
1744 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001745 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001746}
1747
1748/**
1749 * htmlParseCharRef:
1750 * @ctxt: an HTML parser context
1751 *
1752 * parse Reference declarations
1753 *
1754 * [66] CharRef ::= '&#' [0-9]+ ';' |
1755 * '&#x' [0-9a-fA-F]+ ';'
1756 *
1757 * Returns the value parsed (as an int)
1758 */
1759int
1760htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1761 int val = 0;
1762
1763 if ((CUR == '&') && (NXT(1) == '#') &&
1764 (NXT(2) == 'x')) {
1765 SKIP(3);
1766 while (CUR != ';') {
1767 if ((CUR >= '0') && (CUR <= '9'))
1768 val = val * 16 + (CUR - '0');
1769 else if ((CUR >= 'a') && (CUR <= 'f'))
1770 val = val * 16 + (CUR - 'a') + 10;
1771 else if ((CUR >= 'A') && (CUR <= 'F'))
1772 val = val * 16 + (CUR - 'A') + 10;
1773 else {
1774 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1775 ctxt->sax->error(ctxt->userData,
1776 "htmlParseCharRef: invalid hexadecimal value\n");
1777 ctxt->wellFormed = 0;
1778 val = 0;
1779 break;
1780 }
1781 NEXT;
1782 }
1783 if (CUR == ';')
1784 NEXT;
1785 } else if ((CUR == '&') && (NXT(1) == '#')) {
1786 SKIP(2);
1787 while (CUR != ';') {
1788 if ((CUR >= '0') && (CUR <= '9'))
1789 val = val * 10 + (CUR - '0');
1790 else {
1791 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1792 ctxt->sax->error(ctxt->userData,
1793 "htmlParseCharRef: invalid decimal value\n");
1794 ctxt->wellFormed = 0;
1795 val = 0;
1796 break;
1797 }
1798 NEXT;
1799 }
1800 if (CUR == ';')
1801 NEXT;
1802 } else {
1803 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1804 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
1805 ctxt->wellFormed = 0;
1806 }
1807 /*
1808 * Check the value IS_CHAR ...
1809 */
1810 if (IS_CHAR(val)) {
1811 return(val);
1812 } else {
1813 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001814 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001815 val);
1816 ctxt->wellFormed = 0;
1817 }
1818 return(0);
1819}
1820
1821
1822/**
1823 * htmlParseDocTypeDecl :
1824 * @ctxt: an HTML parser context
1825 *
1826 * parse a DOCTYPE declaration
1827 *
1828 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
1829 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
1830 */
1831
1832void
1833htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001834 xmlChar *name;
1835 xmlChar *ExternalID = NULL;
1836 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001837
1838 /*
1839 * We know that '<!DOCTYPE' has been detected.
1840 */
1841 SKIP(9);
1842
1843 SKIP_BLANKS;
1844
1845 /*
1846 * Parse the DOCTYPE name.
1847 */
1848 name = htmlParseName(ctxt);
1849 if (name == NULL) {
1850 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1851 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
1852 ctxt->wellFormed = 0;
1853 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001854 /*
1855 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
1856 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001857
1858 SKIP_BLANKS;
1859
1860 /*
1861 * Check for SystemID and ExternalID
1862 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001863 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001864 SKIP_BLANKS;
1865
1866 /*
1867 * We should be at the end of the DOCTYPE declaration.
1868 */
1869 if (CUR != '>') {
1870 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1871 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
1872 ctxt->wellFormed = 0;
1873 /* We shouldn't try to resynchronize ... */
1874 } else {
1875 }
1876 NEXT;
1877
1878 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001879 * Create the document accordingly to the DOCTYPE
1880 */
1881 ctxt->myDoc = htmlNewDoc(URI, ExternalID);
1882
1883 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001884 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001885 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001886 if (URI != NULL) xmlFree(URI);
1887 if (ExternalID != NULL) xmlFree(ExternalID);
1888 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001889}
1890
1891/**
1892 * htmlParseAttribute:
1893 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001894 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001895 *
1896 * parse an attribute
1897 *
1898 * [41] Attribute ::= Name Eq AttValue
1899 *
1900 * [25] Eq ::= S? '=' S?
1901 *
1902 * With namespace:
1903 *
1904 * [NS 11] Attribute ::= QName Eq AttValue
1905 *
1906 * Also the case QName == xmlns:??? is handled independently as a namespace
1907 * definition.
1908 *
1909 * Returns the attribute name, and the value in *value.
1910 */
1911
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001912xmlChar *
1913htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1914 xmlChar *name, *val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001915
1916 *value = NULL;
1917 name = htmlParseName(ctxt);
1918 if (name == NULL) {
1919 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1920 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
1921 ctxt->wellFormed = 0;
1922 return(NULL);
1923 }
1924
1925 /*
1926 * read the value
1927 */
1928 SKIP_BLANKS;
1929 if (CUR == '=') {
1930 NEXT;
1931 SKIP_BLANKS;
1932 val = htmlParseAttValue(ctxt);
1933 } else {
Daniel Veillard4a53eca1999-12-12 13:03:50 +00001934 /* TODO : some attribute must have values, some may not */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001935 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1936 ctxt->sax->error(ctxt->userData,
1937 "Specification mandate value for attribute %s\n", name);
1938 ctxt->wellFormed = 0;
1939 return(NULL);
1940 }
1941
1942 *value = val;
1943 return(name);
1944}
1945
1946/**
1947 * htmlParseStartTag:
1948 * @ctxt: an HTML parser context
1949 *
1950 * parse a start of tag either for rule element or
1951 * EmptyElement. In both case we don't parse the tag closing chars.
1952 *
1953 * [40] STag ::= '<' Name (S Attribute)* S? '>'
1954 *
1955 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
1956 *
1957 * With namespace:
1958 *
1959 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
1960 *
1961 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
1962 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001963 */
1964
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001965void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001966htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001967 xmlChar *name;
1968 xmlChar *attname;
1969 xmlChar *attvalue;
1970 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001971 int nbatts = 0;
1972 int maxatts = 0;
1973 int i;
1974
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001975 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001976 NEXT;
1977
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001978 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001979 name = htmlParseHTMLName(ctxt);
1980 if (name == NULL) {
1981 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1982 ctxt->sax->error(ctxt->userData,
1983 "htmlParseStartTag: invalid element name\n");
1984 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001985 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001986 }
1987
1988 /*
1989 * Check for auto-closure of HTML elements.
1990 */
1991 htmlAutoClose(ctxt, name);
1992
1993 /*
1994 * Now parse the attributes, it ends up with the ending
1995 *
1996 * (S Attribute)* S?
1997 */
1998 SKIP_BLANKS;
1999 while ((IS_CHAR(CUR)) &&
2000 (CUR != '>') &&
2001 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002002 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002003
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002004 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002005 attname = htmlParseAttribute(ctxt, &attvalue);
2006 if ((attname != NULL) && (attvalue != NULL)) {
2007 /*
2008 * Well formedness requires at most one declaration of an attribute
2009 */
2010 for (i = 0; i < nbatts;i += 2) {
2011 if (!xmlStrcmp(atts[i], attname)) {
2012 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002013 ctxt->sax->error(ctxt->userData,
2014 "Attribute %s redefined\n",
2015 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002016 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002017 xmlFree(attname);
2018 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002019 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002020 }
2021 }
2022
2023 /*
2024 * Add the pair to atts
2025 */
2026 if (atts == NULL) {
2027 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002028 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002029 if (atts == NULL) {
2030 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002031 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002032 if (name != NULL) xmlFree(name);
2033 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002034 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002035 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002036 maxatts *= 2;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002037 atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002038 if (atts == NULL) {
2039 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002040 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002041 if (name != NULL) xmlFree(name);
2042 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002043 }
2044 }
2045 atts[nbatts++] = attname;
2046 atts[nbatts++] = attvalue;
2047 atts[nbatts] = NULL;
2048 atts[nbatts + 1] = NULL;
2049 }
2050
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002051failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002052 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002053 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002054 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2055 ctxt->sax->error(ctxt->userData,
2056 "htmlParseStartTag: problem parsing attributes\n");
2057 ctxt->wellFormed = 0;
2058 break;
2059 }
2060 }
2061
2062 /*
2063 * SAX: Start of Element !
2064 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002065 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002066#ifdef DEBUG
2067 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2068#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002069 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2070 ctxt->sax->startElement(ctxt->userData, name, atts);
2071
2072 if (atts != NULL) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002073 for (i = 0;i < nbatts;i++) xmlFree((xmlChar *) atts[i]);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002074 xmlFree(atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002075 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002076 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002077}
2078
2079/**
2080 * htmlParseEndTag:
2081 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002082 *
2083 * parse an end of tag
2084 *
2085 * [42] ETag ::= '</' Name S? '>'
2086 *
2087 * With namespace
2088 *
2089 * [NS 9] ETag ::= '</' QName S? '>'
2090 */
2091
2092void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002093htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002094 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002095 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002096 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002097
2098 if ((CUR != '<') || (NXT(1) != '/')) {
2099 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2100 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2101 ctxt->wellFormed = 0;
2102 return;
2103 }
2104 SKIP(2);
2105
2106 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002107 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002108
2109 /*
2110 * We should definitely be at the ending "S? '>'" part
2111 */
2112 SKIP_BLANKS;
2113 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2114 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2115 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2116 ctxt->wellFormed = 0;
2117 } else
2118 NEXT;
2119
2120 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002121 * If the name read is not one of the element in the parsing stack
2122 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002123 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002124 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2125 if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002126 }
2127 if (i < 0) {
2128 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002129 ctxt->sax->error(ctxt->userData,
2130 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002131 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002132 ctxt->wellFormed = 0;
2133 return;
2134 }
2135
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002136
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002137 /*
2138 * Check for auto-closure of HTML elements.
2139 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002140
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002141 htmlAutoCloseOnClose(ctxt, name);
2142
2143 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002144 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002145 * With the exception that the autoclose may have popped stuff out
2146 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002147 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002148 if (xmlStrcmp(name, ctxt->name)) {
2149#ifdef DEBUG
2150 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2151#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002152 if ((ctxt->name != NULL) &&
2153 (xmlStrcmp(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002154 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2155 ctxt->sax->error(ctxt->userData,
2156 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002157 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002158 ctxt->wellFormed = 0;
2159 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002160 }
2161
2162 /*
2163 * SAX: End of Tag
2164 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002165 oldname = ctxt->name;
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002166 if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002167 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2168 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002169 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002170 if (oldname != NULL) {
2171#ifdef DEBUG
2172 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2173#endif
2174 xmlFree(oldname);
2175#ifdef DEBUG
2176 } else {
2177 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2178#endif
2179 }
2180 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002181
2182 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00002183 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002184
2185 return;
2186}
2187
2188
2189/**
2190 * htmlParseReference:
2191 * @ctxt: an HTML parser context
2192 *
2193 * parse and handle entity references in content,
2194 * this will end-up in a call to character() since this is either a
2195 * CharRef, or a predefined entity.
2196 */
2197void
2198htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002199 htmlEntityDescPtr ent;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002200 xmlChar out[2];
2201 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002202 int val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002203 if (CUR != '&') return;
2204
2205 if (NXT(1) == '#') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002206 val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +00002207 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002208 out[0] = val;
2209 out[1] = 0;
2210 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2211 ctxt->sax->characters(ctxt->userData, out, 1);
2212 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002213 ent = htmlParseEntityRef(ctxt, &name);
2214 if (name == NULL) return; /* Shall we output & anyway ? */
2215 if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2216 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002217 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002218 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillardb96e6431999-08-29 21:02:19 +00002219 ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002220 }
2221 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002222 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002223 out[0] = ent->value;
2224 out[1] = 0;
2225 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2226 ctxt->sax->characters(ctxt->userData, out, 1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002227 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002228 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002229 }
2230}
2231
2232/**
2233 * htmlParseContent:
2234 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002235 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002236 *
2237 * Parse a content: comment, sub-element, reference or text.
2238 *
2239 */
2240
2241void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002242htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002243 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002244 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002245
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002246 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002247 depth = ctxt->nameNr;
2248 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002249 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002250
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002251 GROW;
2252 /*
2253 * Our tag or one of it's parent or children is ending.
2254 */
2255 if ((CUR == '<') && (NXT(1) == '/')) {
2256 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002257 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002258 return;
2259 }
2260
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002261 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002262 * Has this node been popped out during parsing of
2263 * the next element
2264 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002265 if ((xmlStrcmp(currentNode, ctxt->name)) &&
2266 (depth >= ctxt->nameNr)) {
2267 if (currentNode != NULL) xmlFree(currentNode);
2268 return;
2269 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002270
2271 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002272 * First case : a comment
2273 */
2274 if ((CUR == '<') && (NXT(1) == '!') &&
2275 (NXT(2) == '-') && (NXT(3) == '-')) {
2276 htmlParseComment(ctxt, 1);
2277 }
2278
2279 /*
2280 * Second case : a sub-element.
2281 */
2282 else if (CUR == '<') {
2283 htmlParseElement(ctxt);
2284 }
2285
2286 /*
2287 * Third case : a reference. If if has not been resolved,
2288 * parsing returns it's Name, create the node
2289 */
2290 else if (CUR == '&') {
2291 htmlParseReference(ctxt);
2292 }
2293
2294 /*
2295 * Last case, text. Note that References are handled directly.
2296 */
2297 else {
2298 htmlParseCharData(ctxt, 0);
2299 }
2300
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002301 if (cons == ctxt->nbChars) {
Daniel Veillard35008381999-10-25 13:15:52 +00002302 if (ctxt->node != NULL) {
2303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2304 ctxt->sax->error(ctxt->userData,
2305 "detected an error in element content\n");
2306 ctxt->wellFormed = 0;
2307 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002308 break;
2309 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002310
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002311 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002312 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002313 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002314}
2315
2316/**
2317 * htmlParseElement:
2318 * @ctxt: an HTML parser context
2319 *
2320 * parse an HTML element, this is highly recursive
2321 *
2322 * [39] element ::= EmptyElemTag | STag content ETag
2323 *
2324 * [41] Attribute ::= Name Eq AttValue
2325 */
2326
2327void
2328htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002329 const xmlChar *openTag = CUR_PTR;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002330 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002331 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00002332 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002333 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002334 htmlParserNodeInfo node_info;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002335 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002336
2337 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002338 if (ctxt->record_info) {
2339 node_info.begin_pos = ctxt->input->consumed +
2340 (CUR_PTR - ctxt->input->base);
2341 node_info.begin_line = ctxt->input->line;
2342 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002343
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002344 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002345 htmlParseStartTag(ctxt);
2346 name = ctxt->name;
2347#ifdef DEBUG
2348 if (oldname == NULL)
2349 fprintf(stderr, "Start of element %s\n", name);
2350 else if (name == NULL)
2351 fprintf(stderr, "Start of element failed, was %s\n", oldname);
2352 else
2353 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2354#endif
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002355 if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002356 (name == NULL)) {
2357 if (CUR == '>')
2358 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002359 if (oldname != NULL)
2360 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002361 return;
2362 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002363 if (oldname != NULL)
2364 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002365
2366 /*
2367 * Lookup the info for that element.
2368 */
2369 info = htmlTagLookup(name);
2370 if (info == NULL) {
2371 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2372 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2373 name);
2374 ctxt->wellFormed = 0;
2375 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002376/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002377 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2378 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2379 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002380 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002381 }
2382
2383 /*
2384 * Check for an Empty Element labelled the XML/SGML way
2385 */
2386 if ((CUR == '/') && (NXT(1) == '>')) {
2387 SKIP(2);
2388 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2389 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002390 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002391#ifdef DEBUG
2392 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2393#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002394 if (oldname != NULL)
2395 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002396 return;
2397 }
2398
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002399 if (CUR == '>') {
2400 NEXT;
2401 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002402 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2403 ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2404 openTag);
2405 ctxt->wellFormed = 0;
2406
2407 /*
2408 * end of parsing of this node.
2409 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002410 if (!xmlStrcmp(name, ctxt->name)) {
2411 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002412 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002413#ifdef DEBUG
2414 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2415#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002416 if (oldname != NULL)
2417 xmlFree(oldname);
2418 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002419
2420 /*
2421 * Capture end position and add node
2422 */
2423 if ( currentNode != NULL && ctxt->record_info ) {
2424 node_info.end_pos = ctxt->input->consumed +
2425 (CUR_PTR - ctxt->input->base);
2426 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002427 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002428 xmlParserAddNodeInfo(ctxt, &node_info);
2429 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002430 return;
2431 }
2432
2433 /*
2434 * Check for an Empty Element from DTD definition
2435 */
2436 if ((info != NULL) && (info->empty)) {
2437 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2438 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002439 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002440#ifdef DEBUG
2441 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2442#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002443 if (oldname != NULL)
2444 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002445 return;
2446 }
2447
2448 /*
2449 * Parse the content of the element:
2450 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002451 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002452 depth = ctxt->nameNr;
2453 while (IS_CHAR(CUR)) {
2454 htmlParseContent(ctxt);
2455 if (ctxt->nameNr < depth) break;
2456 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002457
2458 if (!IS_CHAR(CUR)) {
2459 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2460 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002461 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002462 ctxt->wellFormed = 0;
2463
2464 /*
2465 * end of parsing of this node.
2466 */
2467 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002468 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002469#ifdef DEBUG
2470 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2471#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002472 if (oldname != NULL)
2473 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002474 if (currentNode != NULL)
2475 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002476 return;
2477 }
2478
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002479 /*
2480 * Capture end position and add node
2481 */
2482 if ( currentNode != NULL && ctxt->record_info ) {
2483 node_info.end_pos = ctxt->input->consumed +
2484 (CUR_PTR - ctxt->input->base);
2485 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002486 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002487 xmlParserAddNodeInfo(ctxt, &node_info);
2488 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002489 if (currentNode != NULL)
2490 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002491}
2492
2493/**
2494 * htmlParseDocument :
2495 * @ctxt: an HTML parser context
2496 *
2497 * parse an HTML document (and build a tree if using the standard SAX
2498 * interface).
2499 *
2500 * Returns 0, -1 in case of error. the parser context is augmented
2501 * as a result of the parsing.
2502 */
2503
2504int
2505htmlParseDocument(htmlParserCtxtPtr ctxt) {
2506 htmlDefaultSAXHandlerInit();
2507 ctxt->html = 1;
2508
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002509 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002510 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00002511 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002512 */
2513 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2514 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2515
2516 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002517 * Wipe out everything which is before the first '<'
2518 */
Daniel Veillard35008381999-10-25 13:15:52 +00002519 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002520 if (CUR == 0) {
2521 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2522 ctxt->sax->error(ctxt->userData, "Document is empty\n");
2523 ctxt->wellFormed = 0;
2524 }
2525
Daniel Veillard35008381999-10-25 13:15:52 +00002526 /*
2527 * Parse possible comments before any content
2528 */
2529 while ((CUR == '<') && (NXT(1) == '!') &&
2530 (NXT(2) == '-') && (NXT(3) == '-')) {
2531 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2532 htmlParseComment(ctxt, 1);
2533 SKIP_BLANKS;
2534 }
2535
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002536
2537 /*
2538 * Then possibly doc type declaration(s) and more Misc
2539 * (doctypedecl Misc*)?
2540 */
2541 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002542 (UPP(2) == 'D') && (UPP(3) == 'O') &&
2543 (UPP(4) == 'C') && (UPP(5) == 'T') &&
2544 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2545 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002546 htmlParseDocTypeDecl(ctxt);
2547 }
2548 SKIP_BLANKS;
2549
2550 /*
2551 * Create the document if not done already.
2552 */
2553 if (ctxt->myDoc == NULL) {
2554 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2555 }
2556
2557 /*
2558 * Time to start parsing the tree itself
2559 */
Daniel Veillard35008381999-10-25 13:15:52 +00002560 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002561
2562 /*
2563 * SAX: end of the document processing.
2564 */
2565 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2566 ctxt->sax->endDocument(ctxt->userData);
2567 if (! ctxt->wellFormed) return(-1);
2568 return(0);
2569}
2570
2571
2572/********************************************************************************
2573 * *
2574 * Parser contexts handling *
2575 * *
2576 ********************************************************************************/
2577
2578/**
2579 * xmlInitParserCtxt:
2580 * @ctxt: an HTML parser context
2581 *
2582 * Initialize a parser context
2583 */
2584
2585void
2586htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2587{
2588 htmlSAXHandler *sax;
2589
Daniel Veillard35008381999-10-25 13:15:52 +00002590 if (ctxt == NULL) return;
2591 memset(ctxt, 0, sizeof(htmlParserCtxt));
2592
Daniel Veillard6454aec1999-09-02 22:04:43 +00002593 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002594 if (sax == NULL) {
2595 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2596 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002597 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002598
2599 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002600 ctxt->inputTab = (htmlParserInputPtr *)
2601 xmlMalloc(5 * sizeof(htmlParserInputPtr));
2602 if (ctxt->inputTab == NULL) {
2603 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2604 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002605 ctxt->inputNr = 0;
2606 ctxt->inputMax = 5;
2607 ctxt->input = NULL;
2608 ctxt->version = NULL;
2609 ctxt->encoding = NULL;
2610 ctxt->standalone = -1;
2611
2612 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002613 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002614 ctxt->nodeNr = 0;
2615 ctxt->nodeMax = 10;
2616 ctxt->node = NULL;
2617
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002618 /* Allocate the Name stack */
2619 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
2620 ctxt->nameNr = 0;
2621 ctxt->nameMax = 10;
2622 ctxt->name = NULL;
2623
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002624 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2625 else {
2626 ctxt->sax = sax;
2627 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2628 }
2629 ctxt->userData = ctxt;
2630 ctxt->myDoc = NULL;
2631 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002632 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002633 ctxt->html = 1;
2634 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00002635 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002636 ctxt->nbChars = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002637 xmlInitNodeInfoSeq(&ctxt->node_seq);
2638}
2639
2640/**
2641 * htmlFreeParserCtxt:
2642 * @ctxt: an HTML parser context
2643 *
2644 * Free all the memory used by a parser context. However the parsed
2645 * document in ctxt->myDoc is not freed.
2646 */
2647
2648void
2649htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2650{
2651 htmlParserInputPtr input;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002652 xmlChar *oldname;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002653
2654 if (ctxt == NULL) return;
2655
2656 while ((input = inputPop(ctxt)) != NULL) {
2657 xmlFreeInputStream(input);
2658 }
2659
Daniel Veillard6454aec1999-09-02 22:04:43 +00002660 if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002661 while ((oldname = htmlnamePop(ctxt)) != NULL) {
2662 xmlFree(oldname);
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002663 }
2664 if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002665 if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
2666 if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002667 if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
Daniel Veillard6454aec1999-09-02 22:04:43 +00002668 xmlFree(ctxt->sax);
2669 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002670}
2671
2672/**
2673 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002674 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002675 * @encoding: a free form C string describing the HTML document encoding, or NULL
2676 *
2677 * Create a parser context for an HTML document.
2678 *
2679 * Returns the new parser context or NULL
2680 */
2681htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002682htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002683 htmlParserCtxtPtr ctxt;
2684 htmlParserInputPtr input;
2685 /* htmlCharEncoding enc; */
2686
Daniel Veillard6454aec1999-09-02 22:04:43 +00002687 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002688 if (ctxt == NULL) {
2689 perror("malloc");
2690 return(NULL);
2691 }
2692 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002693 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002694 if (input == NULL) {
2695 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002696 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002697 return(NULL);
2698 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002699 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002700
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002701 input->line = 1;
2702 input->col = 1;
2703 input->base = cur;
2704 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002705
2706 inputPush(ctxt, input);
2707 return(ctxt);
2708}
2709
2710/********************************************************************************
2711 * *
2712 * User entry points *
2713 * *
2714 ********************************************************************************/
2715
2716/**
2717 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002718 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002719 * @encoding: a free form C string describing the HTML document encoding, or NULL
2720 * @sax: the SAX handler block
2721 * @userData: if using SAX, this pointer will be provided on callbacks.
2722 *
2723 * parse an HTML in-memory document and build a tree.
2724 * It use the given SAX function block to handle the parsing callback.
2725 * If sax is NULL, fallback to the default DOM tree building routines.
2726 *
2727 * Returns the resulting document tree
2728 */
2729
2730htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002731htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002732 htmlDocPtr ret;
2733 htmlParserCtxtPtr ctxt;
2734
2735 if (cur == NULL) return(NULL);
2736
2737
2738 ctxt = htmlCreateDocParserCtxt(cur, encoding);
2739 if (ctxt == NULL) return(NULL);
2740 if (sax != NULL) {
2741 ctxt->sax = sax;
2742 ctxt->userData = userData;
2743 }
2744
2745 htmlParseDocument(ctxt);
2746 ret = ctxt->myDoc;
2747 if (sax != NULL) {
2748 ctxt->sax = NULL;
2749 ctxt->userData = NULL;
2750 }
2751 htmlFreeParserCtxt(ctxt);
2752
2753 return(ret);
2754}
2755
2756/**
2757 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002758 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002759 * @encoding: a free form C string describing the HTML document encoding, or NULL
2760 *
2761 * parse an HTML in-memory document and build a tree.
2762 *
2763 * Returns the resulting document tree
2764 */
2765
2766htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002767htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002768 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
2769}
2770
2771
2772/**
2773 * htmlCreateFileParserCtxt :
2774 * @filename: the filename
2775 * @encoding: a free form C string describing the HTML document encoding, or NULL
2776 *
2777 * Create a parser context for a file content.
2778 * Automatic support for ZLIB/Compress compressed document is provided
2779 * by default if found at compile-time.
2780 *
2781 * Returns the new parser context or NULL
2782 */
2783htmlParserCtxtPtr
2784htmlCreateFileParserCtxt(const char *filename, const char *encoding)
2785{
2786 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002787 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002788 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002789 /* htmlCharEncoding enc; */
2790
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002791 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
2792 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002793
Daniel Veillard6454aec1999-09-02 22:04:43 +00002794 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002795 if (ctxt == NULL) {
2796 perror("malloc");
2797 return(NULL);
2798 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002799 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002800 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002801 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002802 if (inputStream == NULL) {
2803 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002804 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002805 return(NULL);
2806 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002807 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002808
Daniel Veillard6454aec1999-09-02 22:04:43 +00002809 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002810 inputStream->line = 1;
2811 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002812 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00002813 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002814
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002815 inputStream->base = inputStream->buf->buffer->content;
2816 inputStream->cur = inputStream->buf->buffer->content;
2817 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002818
2819 inputPush(ctxt, inputStream);
2820 return(ctxt);
2821}
2822
2823/**
2824 * htmlSAXParseFile :
2825 * @filename: the filename
2826 * @encoding: a free form C string describing the HTML document encoding, or NULL
2827 * @sax: the SAX handler block
2828 * @userData: if using SAX, this pointer will be provided on callbacks.
2829 *
2830 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2831 * compressed document is provided by default if found at compile-time.
2832 * It use the given SAX function block to handle the parsing callback.
2833 * If sax is NULL, fallback to the default DOM tree building routines.
2834 *
2835 * Returns the resulting document tree
2836 */
2837
2838htmlDocPtr
2839htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
2840 void *userData) {
2841 htmlDocPtr ret;
2842 htmlParserCtxtPtr ctxt;
2843
2844 ctxt = htmlCreateFileParserCtxt(filename, encoding);
2845 if (ctxt == NULL) return(NULL);
2846 if (sax != NULL) {
2847 ctxt->sax = sax;
2848 ctxt->userData = userData;
2849 }
2850
2851 htmlParseDocument(ctxt);
2852
2853 ret = ctxt->myDoc;
2854 if (sax != NULL) {
2855 ctxt->sax = NULL;
2856 ctxt->userData = NULL;
2857 }
2858 htmlFreeParserCtxt(ctxt);
2859
2860 return(ret);
2861}
2862
2863/**
2864 * htmlParseFile :
2865 * @filename: the filename
2866 * @encoding: a free form C string describing the HTML document encoding, or NULL
2867 *
2868 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2869 * compressed document is provided by default if found at compile-time.
2870 *
2871 * Returns the resulting document tree
2872 */
2873
2874htmlDocPtr
2875htmlParseFile(const char *filename, const char *encoding) {
2876 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
2877}