blob: a559d9b7dd983ec514d5d8b0c30ce68959e00e98 [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#define HAVE_FCNTL_H
11#include <io.h>
12#else
13#include <config.h>
14#endif
15#include <stdio.h>
16#include <ctype.h>
17#include <string.h> /* for memset() only */
18#include <stdlib.h>
19#include <sys/stat.h>
20#ifdef HAVE_FCNTL_H
21#include <fcntl.h>
22#endif
23#ifdef HAVE_UNISTD_H
24#include <unistd.h>
25#endif
26#ifdef HAVE_ZLIB_H
27#include <zlib.h>
28#endif
29
Daniel Veillard6454aec1999-09-02 22:04:43 +000030#include "xmlmemory.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000031#include "tree.h"
32#include "HTMLparser.h"
33#include "entities.h"
34#include "encoding.h"
35#include "valid.h"
36#include "parserInternals.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000037#include "xmlIO.h"
38
39#define HTML_MAX_NAMELEN 1000
40#define INPUT_CHUNK 50
Daniel Veillardbe70ff71999-07-05 16:50:46 +000041
Daniel Veillard82150d81999-07-07 07:32:15 +000042/* #define DEBUG */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000043
44/************************************************************************
45 * *
46 * Parser stacks related functions and macros *
47 * *
48 ************************************************************************/
49
50/*
51 * Generic function for accessing stacks in the Parser Context
52 */
53
54#define PUSH_AND_POP(type, name) \
55int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
56 if (ctxt->name##Nr >= ctxt->name##Max) { \
57 ctxt->name##Max *= 2; \
Daniel Veillard6454aec1999-09-02 22:04:43 +000058 ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000059 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
60 if (ctxt->name##Tab == NULL) { \
61 fprintf(stderr, "realloc failed !\n"); \
62 exit(1); \
63 } \
64 } \
65 ctxt->name##Tab[ctxt->name##Nr] = value; \
66 ctxt->name = value; \
67 return(ctxt->name##Nr++); \
68} \
69type html##name##Pop(htmlParserCtxtPtr ctxt) { \
70 type ret; \
71 if (ctxt->name##Nr <= 0) return(0); \
72 ctxt->name##Nr--; \
73 if (ctxt->name##Nr > 0) \
74 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
75 else \
76 ctxt->name = NULL; \
77 ret = ctxt->name##Tab[ctxt->name##Nr]; \
78 ctxt->name##Tab[ctxt->name##Nr] = 0; \
79 return(ret); \
80} \
81
82PUSH_AND_POP(xmlNodePtr, node)
83
/*
 * Macros for accessing the content. They are meant to be used only by the
 * parser and must not be exported. All of them expect a variable named
 * ctxt (the parser context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them: they look at raw CHAR units and should only be used to compare
 * against ASCII values or to skip ASCII-only strings, otherwise they would
 * break when running with UTF-8 encoding.
 *
 *   CUR_PTR  the current pointer to the CHAR to be parsed (an lvalue).
 *   CUR      the current CHAR value, i.e. an 8 bit value if compiled in
 *            ISO-Latin or UTF-8, and a 16 bit value if compiled in
 *            UNICODE mode.
 *   UPPER    CUR passed through toupper().
 *   NXT(n)   the n'th next CHAR.
 *   UPP(n)   the n'th next CHAR passed through toupper().
 *   SKIP(n)  skip n CHARs; no line/column bookkeeping is done. The value
 *            of the expression is the advanced pointer.
 *   SHRINK   calls xmlParserInputShrink() on the current input.
 *   GROW     calls xmlParserInputGrow() on the current input, asking for
 *            INPUT_CHUNK more.
 *
 * Clean macros, not dependent on an ASCII context (defined below):
 *
 *   CURRENT  the current char value.
 *   NEXT     skip to the next character, maintaining line and column.
 *
 * The original notes also list COPY(to) (copy one char to *to, increment
 * CUR_PTR and to accordingly); no definition of it appears in this part
 * of the file.
 *
 * Every replacement list is fully parenthesized so that a macro behaves as
 * a single operand wherever it is used: without the outer parentheses
 * "*SKIP(1)" would parse as "*ctxt->input->cur += (1)" and modify the
 * pointed-to CHAR instead of advancing the cursor.
 */

#define CUR (*ctxt->input->cur)
#define UPPER (toupper(*ctxt->input->cur))
#define SKIP(val) (ctxt->input->cur += (val))
#define NXT(val) (ctxt->input->cur[(val)])
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
#define CUR_PTR (ctxt->input->cur)
#define SHRINK (xmlParserInputShrink(ctxt->input))
#define GROW (xmlParserInputGrow(ctxt->input, INPUT_CHUNK))
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000120
/*
 * SKIP_BLANKS: keep stepping with NEXT for as long as the current CHAR
 * satisfies IS_BLANK().
 */
#define SKIP_BLANKS                                                     \
    while (IS_BLANK(ctxt->input->cur[0])) NEXT

#ifndef USE_UTF_8
/* CURRENT: the CHAR at the current input position. */
#define CURRENT (ctxt->input->cur[0])

/*
 * NEXT: step over one CHAR of the current input.
 *
 * When the cursor sits on a 0 CHAR and xmlParserInputGrow() cannot supply
 * more data (result <= 0), xmlPopInput() is called and nothing is consumed.
 * Otherwise the line/column counters are updated ('\n' starts a new line
 * at column 1, anything else advances the column), the cursor moves on by
 * one, and one more grow is requested if the new position holds a 0 CHAR.
 *
 * The expansion is a brace-enclosed block, not an expression: SKIP_BLANKS
 * uses it directly as a loop body.
 */
#define NEXT {                                                          \
    if ((ctxt->input->cur[0] != 0) ||                                   \
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) > 0)) {           \
        if (ctxt->input->cur[0] != '\n') {                              \
            ctxt->input->col++;                                         \
        } else {                                                        \
            ctxt->input->line++; ctxt->input->col = 1;                  \
        }                                                               \
        ctxt->input->cur++;                                             \
        if (ctxt->input->cur[0] == 0)                                   \
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);               \
    } else {                                                            \
        xmlPopInput(ctxt);                                              \
    }}

#else
/* No CURRENT/NEXT definitions are provided here for USE_UTF_8 builds. */
#endif
155
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000156
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000157
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000158/************************************************************************
159 * *
160 * The list of HTML elements and their properties *
161 * *
162 ************************************************************************/
163
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000164/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000165 * Start Tag: 1 means the start tag can be ommited
166 * End Tag: 1 means the end tag can be ommited
167 * 2 means it's forbidden (empty elements)
168 * Depr: this element is deprecated
169 * DTD: 1 means that this element is valid only in the Loose DTD
170 * 2 means that this element is valid only in the Frameset DTD
171 *
172 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000173 */
174htmlElemDesc html40ElementTable[] = {
175{ "A", 0, 0, 0, 0, 0, "anchor " },
176{ "ABBR", 0, 0, 0, 0, 0, "abbreviated form" },
177{ "ACRONYM", 0, 0, 0, 0, 0, "" },
178{ "ADDRESS", 0, 0, 0, 0, 0, "information on author " },
179{ "APPLET", 0, 0, 0, 1, 1, "Java applet " },
180{ "AREA", 0, 2, 1, 0, 0, "client-side image map area " },
181{ "B", 0, 0, 0, 0, 0, "bold text style" },
182{ "BASE", 0, 2, 1, 0, 0, "document base URI " },
183{ "BASEFONT", 0, 2, 1, 1, 1, "base font size " },
184{ "BDO", 0, 0, 0, 0, 0, "I18N BiDi over-ride " },
185{ "BIG", 0, 0, 0, 0, 0, "large text style" },
186{ "BLOCKQUOTE", 0, 0, 0, 0, 0, "long quotation " },
187{ "BODY", 1, 1, 0, 0, 0, "document body " },
188{ "BR", 0, 2, 1, 0, 0, "forced line break " },
189{ "BUTTON", 0, 0, 0, 0, 0, "push button " },
190{ "CAPTION", 0, 0, 0, 0, 0, "table caption " },
191{ "CENTER", 0, 0, 0, 1, 1, "shorthand for DIV align=center " },
192{ "CITE", 0, 0, 0, 0, 0, "citation" },
193{ "CODE", 0, 0, 0, 0, 0, "computer code fragment" },
194{ "COL", 0, 2, 1, 0, 0, "table column " },
195{ "COLGROUP", 0, 1, 0, 0, 0, "table column group " },
196{ "DD", 0, 1, 0, 0, 0, "definition description " },
197{ "DEL", 0, 0, 0, 0, 0, "deleted text " },
198{ "DFN", 0, 0, 0, 0, 0, "instance definition" },
199{ "DIR", 0, 0, 0, 1, 1, "directory list" },
200{ "DIV", 0, 0, 0, 0, 0, "generic language/style container"},
201{ "DL", 0, 0, 0, 0, 0, "definition list " },
202{ "DT", 0, 1, 0, 0, 0, "definition term " },
203{ "EM", 0, 0, 0, 0, 0, "emphasis" },
204{ "FIELDSET", 0, 0, 0, 0, 0, "form control group " },
205{ "FONT", 0, 0, 0, 1, 1, "local change to font " },
206{ "FORM", 0, 0, 0, 0, 0, "interactive form " },
207{ "FRAME", 0, 2, 1, 0, 2, "subwindow " },
208{ "FRAMESET", 0, 0, 0, 0, 2, "window subdivision" },
209{ "H1", 0, 0, 0, 0, 0, "heading " },
210{ "H2", 0, 0, 0, 0, 0, "heading " },
211{ "H3", 0, 0, 0, 0, 0, "heading " },
212{ "H4", 0, 0, 0, 0, 0, "heading " },
213{ "H5", 0, 0, 0, 0, 0, "heading " },
214{ "H6", 0, 0, 0, 0, 0, "heading " },
215{ "HEAD", 1, 1, 0, 0, 0, "document head " },
216{ "HR", 0, 2, 1, 0, 0, "horizontal rule " },
217{ "HTML", 1, 1, 0, 0, 0, "document root element " },
218{ "I", 0, 0, 0, 0, 0, "italic text style" },
219{ "IFRAME", 0, 0, 0, 0, 1, "inline subwindow " },
220{ "IMG", 0, 2, 1, 0, 0, "Embedded image " },
221{ "INPUT", 0, 2, 1, 0, 0, "form control " },
222{ "INS", 0, 0, 0, 0, 0, "inserted text" },
223{ "ISINDEX", 0, 2, 1, 1, 1, "single line prompt " },
224{ "KBD", 0, 0, 0, 0, 0, "text to be entered by the user" },
225{ "LABEL", 0, 0, 0, 0, 0, "form field label text " },
226{ "LEGEND", 0, 0, 0, 0, 0, "fieldset legend " },
227{ "LI", 0, 1, 0, 0, 0, "list item " },
228{ "LINK", 0, 2, 1, 0, 0, "a media-independent link " },
229{ "MAP", 0, 0, 0, 0, 0, "client-side image map " },
230{ "MENU", 0, 0, 0, 1, 1, "menu list " },
231{ "META", 0, 2, 1, 0, 0, "generic metainformation " },
232{ "NOFRAMES", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
233{ "NOSCRIPT", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
234{ "OBJECT", 0, 0, 0, 0, 0, "generic embedded object " },
235{ "OL", 0, 0, 0, 0, 0, "ordered list " },
236{ "OPTGROUP", 0, 0, 0, 0, 0, "option group " },
237{ "OPTION", 0, 1, 0, 0, 0, "selectable choice " },
238{ "P", 0, 1, 0, 0, 0, "paragraph " },
239{ "PARAM", 0, 2, 1, 0, 0, "named property value " },
240{ "PRE", 0, 0, 0, 0, 0, "preformatted text " },
241{ "Q", 0, 0, 0, 0, 0, "short inline quotation " },
242{ "S", 0, 0, 0, 1, 1, "strike-through text style" },
243{ "SAMP", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
244{ "SCRIPT", 0, 0, 0, 0, 0, "script statements " },
245{ "SELECT", 0, 0, 0, 0, 0, "option selector " },
246{ "SMALL", 0, 0, 0, 0, 0, "small text style" },
247{ "SPAN", 0, 0, 0, 0, 0, "generic language/style container " },
248{ "STRIKE", 0, 0, 0, 1, 1, "strike-through text" },
249{ "STRONG", 0, 0, 0, 0, 0, "strong emphasis" },
250{ "STYLE", 0, 0, 0, 0, 0, "style info " },
251{ "SUB", 0, 0, 0, 0, 0, "subscript" },
252{ "SUP", 0, 0, 0, 0, 0, "superscript " },
253{ "TABLE", 0, 0, 0, 0, 0, "&#160;" },
254{ "TBODY", 1, 1, 0, 0, 0, "table body " },
255{ "TD", 0, 1, 0, 0, 0, "table data cell" },
256{ "TEXTAREA", 0, 0, 0, 0, 0, "multi-line text field " },
257{ "TFOOT", 0, 1, 0, 0, 0, "table footer " },
258{ "TH", 0, 1, 0, 0, 0, "table header cell" },
259{ "THEAD", 0, 1, 0, 0, 0, "table header " },
260{ "TITLE", 0, 0, 0, 0, 0, "document title " },
261{ "TR", 0, 1, 0, 0, 0, "table row " },
262{ "TT", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
263{ "U", 0, 0, 0, 1, 1, "underlined text style" },
264{ "UL", 0, 0, 0, 0, 0, "unordered list " },
265{ "VAR", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
266};
267
/*
 * Start tags that imply the end of a current element.
 *
 * The table is a sequence of groups, each closed by a NULL entry, with one
 * more NULL marking the end of the whole table. Any tag of a group implies
 * the end of the current element if the type of that element is in the
 * same group.
 *
 * According to the HTML DTD, HR should be added to the headings group, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
char *htmlEquEnd[] = {
    /* list-like items */
    "DT", "DD", "LI", "OPTION",
    NULL,
    /* headings */
    "H1", "H2", "H3", "H4", "H5", "H6",
    NULL,
    /* block containers */
    "OL", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP",
    NULL,
    /* end of table */
    NULL
};
284
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated rows, closed by one more
 * NULL. In each row the first entry is the name of a start tag and the
 * following entries are the names of the open elements that this start
 * tag closes (this is how htmlCheckAutoClose() reads the rows).
 */
char *htmlStartClose[] = {
"FORM",		"FORM", "P", "HR", "H1", "H2", "H3", "H4", "H5", "H6",
		"DL", "UL", "OL", "MENU", "DIR", "ADDRESS", "PRE",
		"LISTING", "XMP", "HEAD", NULL,
"HEAD",		"P", NULL,
"TITLE",	"P", NULL,
"BODY",		"HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
"LI",		"P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
		"PRE", "LISTING", "XMP", "HEAD", NULL,
"HR",		"P", "HEAD", NULL,
"H1",		"P", "HEAD", NULL,
"H2",		"P", "HEAD", NULL,
"H3",		"P", "HEAD", NULL,
"H4",		"P", "HEAD", NULL,
"H5",		"P", "HEAD", NULL,
"H6",		"P", "HEAD", NULL,
"DIR",		"P", "HEAD", NULL,
"ADDRESS",	"P", "HEAD", "UL", NULL,
"PRE",		"P", "HEAD", "UL", NULL,
"LISTING",	"P", "HEAD", NULL,
"XMP",		"P", "HEAD", NULL,
"BLOCKQUOTE",	"P", "HEAD", NULL,
"DL",		"P", "DT", "MENU", "DIR", "ADDRESS", "PRE", "LISTING",
		"XMP", "HEAD", NULL,
"DT",		"P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL,
"DD",		"P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL,
"UL",		"P", "HEAD", "OL", "MENU", "DIR", "ADDRESS", "PRE",
		"LISTING", "XMP", NULL,
"OL",		"P", "HEAD", "UL", NULL,
"MENU",		"P", "HEAD", "UL", NULL,
"P",		"P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", NULL,
"DIV",		"P", "HEAD", NULL,
"NOSCRIPT",	"P", "HEAD", NULL,
"CENTER",	"FONT", "B", "I", "P", "HEAD", NULL,
"A",		"A", NULL,
"CAPTION",	"P", NULL,
"COLGROUP",	"CAPTION", "COLGROUP", "COL", "P", NULL,
"COL",		"CAPTION", "COL", "P", NULL,
"TABLE",	"P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", "PRE",
		"LISTING", "XMP", "A", NULL,
"TH",		"TH", "TD", NULL,
"TD",		"TH", "TD", NULL,
"TR",		"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", NULL,
"THEAD",	"CAPTION", "COL", "COLGROUP", NULL,
"TFOOT",	"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
		"TBODY", NULL,
"TBODY",	"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
		"TFOOT", "TBODY", NULL,
"OPTGROUP",	"OPTION", NULL,
"FIELDSET",	"LEGEND", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6",
		"PRE", "LISTING", "XMP", "A", NULL,
NULL
};

/*
 * Index over htmlStartClose: entry k points at the first string of the
 * k-th row. Unused entries are NULL, and the last entry is always left
 * NULL so that readers can stop on it.
 */
static char** htmlStartCloseIndex[100];
/* Non-zero once htmlStartCloseIndex has been filled in. */
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *									*
 * 		functions to handle HTML specific data			*
 *									*
 ************************************************************************/

/**
 * htmlInitAutoClose:
 *
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 *
 * The index is built only once: the function records that it has run in
 * htmlStartCloseIndexinitialized and returns immediately on later calls.
 * (The flag used to be tested but never set, so every caller rebuilt the
 * whole index.)
 */
void
htmlInitAutoClose(void) {
    const int slots = (int) (sizeof(htmlStartCloseIndex) /
                             sizeof(htmlStartCloseIndex[0]));
    int slot, i = 0;

    if (htmlStartCloseIndexinitialized) return;

    for (slot = 0; slot < slots; slot++) htmlStartCloseIndex[slot] = NULL;

    /* Keep the last slot free so the index always ends with a NULL. */
    slot = 0;
    while ((htmlStartClose[i] != NULL) && (slot < slots - 1)) {
        htmlStartCloseIndex[slot++] = &htmlStartClose[i];
        /* Skip the rest of this row, then step over its terminator. */
        while (htmlStartClose[i] != NULL) i++;
        i++;
    }

    htmlStartCloseIndexinitialized = 1;
}
371
372/**
373 * htmlTagLookup:
374 * @tag: The tag name
375 *
376 * Lookup the HTML tag in the ElementTable
377 *
378 * Returns the related htmlElemDescPtr or NULL if not found.
379 */
380htmlElemDescPtr
381htmlTagLookup(const CHAR *tag) {
382 int i = 0;
383
384 for (i = 0; i < (sizeof(html40ElementTable) /
385 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000386 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000387 return(&html40ElementTable[i]);
388 }
389 return(NULL);
390}
391
392/**
393 * htmlCheckAutoClose:
394 * @new: The new tag name
395 * @old: The old tag name
396 *
397 * Checks wether the new tag is one of the registered valid tags for closing old.
398 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
399 *
400 * Returns 0 if no, 1 if yes.
401 */
402int
403htmlCheckAutoClose(const CHAR *new, const CHAR *old) {
404 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000405 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000406
407 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
408
409 /* inefficient, but not a big deal */
410 for (index = 0; index < 100;index++) {
411 close = htmlStartCloseIndex[index];
412 if (close == NULL) return(0);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000413 if (!xmlStrcmp(BAD_CAST *close, new)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000414 }
415
416 i = close - htmlStartClose;
417 i++;
418 while (htmlStartClose[i] != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000419 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000420 return(1);
421 }
422 i++;
423 }
424 return(0);
425}
426
427/**
428 * htmlAutoClose:
429 * @ctxt: an HTML parser context
430 * @new: The new tag name
431 *
432 * The HTmL DtD allows a tag to implicitely close other tags.
433 * The list is kept in htmlStartClose array. This function is
434 * called when a new tag has been detected and generates the
435 * appropriates closes if possible/needed.
436 */
437void
438htmlAutoClose(htmlParserCtxtPtr ctxt, const CHAR *new) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000439
440 while ((ctxt->node != NULL) &&
441 (htmlCheckAutoClose(new, ctxt->node->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000442#ifdef DEBUG
443 printf("htmlAutoClose: %s closes %s\n", new, ctxt->node->name);
444#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000445 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
446 ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
447 }
448}
449
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000450/**
451 * htmlAutoCloseOnClose:
452 * @ctxt: an HTML parser context
453 * @new: The new tag name
454 *
455 * The HTmL DtD allows an ending tag to implicitely close other tags.
456 */
457void
458htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const CHAR *new) {
459 htmlElemDescPtr info;
460
461 while ((ctxt->node != NULL) &&
462 (xmlStrcmp(new, ctxt->node->name))) {
463 info = htmlTagLookup(ctxt->node->name);
464 if ((info == NULL) || (info->endTag == 1)) {
465#ifdef DEBUG
466 printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->node->name);
467#endif
468 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
469 ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
470 } else
471 break;
472 }
473}
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000474
475/************************************************************************
476 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000477 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000478 * *
479 ************************************************************************/
480
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000481
482htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000483/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000484 * the 4 absolute ones,
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000485 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000486{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
487{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard1566d3a1999-07-15 14:24:29 +0000488{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000489{ 60, "lt", "less-than sign, U+003C ISOnum" },
490{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000491
492/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000493 * A bunch still in the 128-255 range
494 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000495 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000496{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
497{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
498{ 162, "cent", "cent sign, U+00A2 ISOnum" },
499{ 163, "pound","pound sign, U+00A3 ISOnum" },
500{ 164, "curren","currency sign, U+00A4 ISOnum" },
501{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
502{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
503{ 167, "sect", "section sign, U+00A7 ISOnum" },
504{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
505{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
506{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
507{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
508{ 172, "not", "not sign, U+00AC ISOnum" },
509{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
510{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
511{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
512{ 176, "deg", "degree sign, U+00B0 ISOnum" },
513{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
514{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
515{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
516{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
517{ 181, "micro","micro sign, U+00B5 ISOnum" },
518{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000519{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000520{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
521{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
522{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000523{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000524{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
525{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
526{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
527{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
528{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
529{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
530{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
531{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
532{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
533{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
534{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
535{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
536{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
537{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
538{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
539{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
540{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
541{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
542{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
543{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
544{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
545{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
546{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
547{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
548{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
549{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
550{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
551{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000552{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000553{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
554{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
555{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
556{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
557{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
558{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
559{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
560{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
561{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
562{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
563{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
564{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
565{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
566{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
567{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
568{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
569{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
570{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
571{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
572{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
573{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
574{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
575{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
576{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
577{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
578{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
579{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
580{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
581{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
582{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
583{ 247, "divide","division sign, U+00F7 ISOnum" },
584{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
585{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
586{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
587{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
588{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
589{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
590{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
591{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000592
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000593/*
594 * Anything below should really be kept as entities references
595 */
596{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000597
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000598{ 913, "Alpha","greek capital letter alpha, U+0391" },
599{ 914, "Beta", "greek capital letter beta, U+0392" },
600{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
601{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
602{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
603{ 918, "Zeta", "greek capital letter zeta, U+0396" },
604{ 919, "Eta", "greek capital letter eta, U+0397" },
605{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
606{ 921, "Iota", "greek capital letter iota, U+0399" },
607{ 922, "Kappa","greek capital letter kappa, U+039A" },
608{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
609{ 924, "Mu", "greek capital letter mu, U+039C" },
610{ 925, "Nu", "greek capital letter nu, U+039D" },
611{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
612{ 927, "Omicron","greek capital letter omicron, U+039F" },
613{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
614{ 929, "Rho", "greek capital letter rho, U+03A1" },
615{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
616{ 932, "Tau", "greek capital letter tau, U+03A4" },
617{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
618{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
619{ 935, "Chi", "greek capital letter chi, U+03A7" },
620{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
621{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000622
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000623{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
624{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
625{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
626{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
627{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
628{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
629{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
630{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
631{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
632{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
633{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
634{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
635{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
636{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
637{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
638{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
639{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
640{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
641{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
642{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
643{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
644{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
645{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
646{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
647{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
648{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
649{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
650{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000651
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000652{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
653{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
654{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
655{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
656{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
657{ 8260, "frasl","fraction slash, U+2044 NEW" },
658
Daniel Veillardb05deb71999-08-10 19:04:08 +0000659{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000660{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
661{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
662{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
663{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
664{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
665{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
666{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
667{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
668{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
669{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
670{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
671{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
672{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
673{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
674{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
675
676
677{ 8704, "forall","for all, U+2200 ISOtech" },
678{ 8706, "part", "partial differential, U+2202 ISOtech" },
679{ 8707, "exist","there exists, U+2203 ISOtech" },
680{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
681{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
682{ 8712, "isin", "element of, U+2208 ISOtech" },
683{ 8713, "notin","not an element of, U+2209 ISOtech" },
684{ 8715, "ni", "contains as member, U+220B ISOtech" },
685{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
686{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
687{ 8722, "minus","minus sign, U+2212 ISOtech" },
688{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
689{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
690{ 8733, "prop", "proportional to, U+221D ISOtech" },
691{ 8734, "infin","infinity, U+221E ISOtech" },
692{ 8736, "ang", "angle, U+2220 ISOamso" },
693{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
694{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
695{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
696{ 8746, "cup", "union = cup, U+222A ISOtech" },
697{ 8747, "int", "integral, U+222B ISOtech" },
698{ 8756, "there4","therefore, U+2234 ISOtech" },
699{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
700{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
701{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
702{ 8800, "ne", "not equal to, U+2260 ISOtech" },
703{ 8801, "equiv","identical to, U+2261 ISOtech" },
704{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
705{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
706{ 8834, "sub", "subset of, U+2282 ISOtech" },
707{ 8835, "sup", "superset of, U+2283 ISOtech" },
708{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
709{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
710{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
711{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
712{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
713{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
714{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
715{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
716{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
717{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
718{ 8971, "rfloor","right floor, U+230B ISOamsc" },
719{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
720{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
721{ 9674, "loz", "lozenge, U+25CA ISOpub" },
722
723{ 9824, "spades","black spade suit, U+2660 ISOpub" },
724{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
725{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
726{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
727
728{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
729{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
730{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
731{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
732{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
733{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
734{ 732, "tilde","small tilde, U+02DC ISOdia" },
735
736{ 8194, "ensp", "en space, U+2002 ISOpub" },
737{ 8195, "emsp", "em space, U+2003 ISOpub" },
738{ 8201, "thinsp","thin space, U+2009 ISOpub" },
739{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
740{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
741{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
742{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
743{ 8211, "ndash","en dash, U+2013 ISOpub" },
744{ 8212, "mdash","em dash, U+2014 ISOpub" },
745{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
746{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
747{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
748{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
749{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
750{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
751{ 8224, "dagger","dagger, U+2020 ISOpub" },
752{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
753{ 8240, "permil","per mille sign, U+2030 ISOtech" },
754{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000755{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000756{ 8364, "euro", "euro sign, U+20AC NEW" }
757};
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000758
759/************************************************************************
760 * *
761 * Commodity functions to handle entities *
762 * *
763 ************************************************************************/
764
/*
 * growBuffer:
 * @buffer:  name of a CHAR * variable holding the buffer; an integer
 *           variable named <buffer>_size holding its capacity (counted
 *           in CHARs) must also be in scope at the point of use
 *
 * Macro used to grow the current buffer: the recorded capacity is doubled
 * and the buffer is resized with xmlRealloc().  If the reallocation fails
 * the error is printed with perror() and the process exits with status 1.
 *
 * Callers that keep a write pointer into the buffer must recompute it from
 * a saved offset after invoking this macro.
 *
 * The body is wrapped in do { } while (0) so that "growBuffer(buf);" is a
 * single statement and can be used safely inside an unbraced if/else.
 */
#define growBuffer(buffer) do {                                           \
    buffer##_size *= 2;                                                   \
    buffer = (CHAR *) xmlRealloc(buffer, buffer##_size * sizeof(CHAR));   \
    if (buffer == NULL) {                                                 \
        perror("realloc failed");                                         \
        exit(1);                                                          \
    }                                                                     \
} while (0)
776
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000777/**
778 * htmlEntityLookup:
779 * @name: the entity name
780 *
781 * Lookup the given entity in EntitiesTable
782 *
783 * TODO: the linear scan is really ugly, an hash table is really needed.
784 *
785 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
786 */
787htmlEntityDescPtr
788htmlEntityLookup(const CHAR *name) {
789 int i;
790
791 for (i = 0;i < (sizeof(html40EntitiesTable)/
792 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000793 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000794#ifdef DEBUG
795 printf("Found entity %s\n", name);
796#endif
797 return(&html40EntitiesTable[i]);
798 }
799 }
800 return(NULL);
801}
802
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000803
804/**
805 * htmlDecodeEntities:
806 * @ctxt: the parser context
807 * @len: the len to decode (in bytes !), -1 for no size limit
808 * @end: an end marker CHAR, 0 if none
809 * @end2: an end marker CHAR, 0 if none
810 * @end3: an end marker CHAR, 0 if none
811 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000812 * Subtitute the HTML entities by their value
813 *
814 * TODO: once the internal representation will be UTF-8, all entities
815 * will be substituable, in the meantime we only apply the substitution
816 * to the one with values in the 0-255 UNICODE range
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000817 *
818 * Returns A newly allocated string with the substitution done. The caller
819 * must deallocate it !
820 */
821CHAR *
822htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
823 CHAR end, CHAR end2, CHAR end3) {
824 CHAR *buffer = NULL;
825 int buffer_size = 0;
826 CHAR *out = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000827 CHAR *name = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000828
829 CHAR *cur = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000830 htmlEntityDescPtr ent;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000831 int nbchars = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000832 unsigned int max = (unsigned int) len;
833
834 /*
835 * allocate a translation buffer.
836 */
837 buffer_size = 1000;
Daniel Veillard6454aec1999-09-02 22:04:43 +0000838 buffer = (CHAR *) xmlMalloc(buffer_size * sizeof(CHAR));
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000839 if (buffer == NULL) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000840 perror("htmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000841 return(NULL);
842 }
843 out = buffer;
844
845 /*
846 * Ok loop until we reach one of the ending char or a size limit.
847 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000848 while ((nbchars < max) && (CUR != end) &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000849 (CUR != end2) && (CUR != end3)) {
850
851 if (CUR == '&') {
852 if (NXT(1) == '#') {
853 int val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000854 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000855 *out++ = val;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000856 nbchars += 3; /* !!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000857 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000858 ent = htmlParseEntityRef(ctxt, &name);
859 if (name != NULL) {
860 if ((ent == NULL) || (ent->value <= 0) ||
861 (ent->value >= 255)) {
862 *out++ = '&';
863 cur = name;
864 while (*cur != 0) {
865 if (out - buffer > buffer_size - 100) {
866 int index = out - buffer;
867
868 growBuffer(buffer);
869 out = &buffer[index];
870 }
871 *out++ = *cur++;
872 }
873 *out++ = ';';
874 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000875 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000876 *out++ = (CHAR)ent->value;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000877 if (out - buffer > buffer_size - 100) {
878 int index = out - buffer;
879
880 growBuffer(buffer);
881 out = &buffer[index];
882 }
883 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000884 nbchars += 2 + xmlStrlen(name);
Daniel Veillard6454aec1999-09-02 22:04:43 +0000885 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000886 }
887 }
888 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000889 /* invalid for UTF-8 , use COPY(out); !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000890 *out++ = CUR;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000891 nbchars++;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000892 if (out - buffer > buffer_size - 100) {
893 int index = out - buffer;
894
895 growBuffer(buffer);
896 out = &buffer[index];
897 }
898 NEXT;
899 }
900 }
901 *out++ = 0;
902 return(buffer);
903}
904
905
906/************************************************************************
907 * *
908 * Commodity functions to handle encodings *
909 * *
910 ************************************************************************/
911
912/**
913 * htmlSwitchEncoding:
914 * @ctxt: the parser context
915 * @len: the len of @cur
916 *
917 * change the input functions when discovering the character encoding
918 * of a given entity.
919 *
920 */
921void
922htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
923{
924 switch (enc) {
925 case XML_CHAR_ENCODING_ERROR:
926 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
927 ctxt->sax->error(ctxt->userData, "encoding unknown\n");
928 ctxt->wellFormed = 0;
929 break;
930 case XML_CHAR_ENCODING_NONE:
931 /* let's assume it's UTF-8 without the XML decl */
932 return;
933 case XML_CHAR_ENCODING_UTF8:
934 /* default encoding, no conversion should be needed */
935 return;
936 case XML_CHAR_ENCODING_UTF16LE:
937 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
938 ctxt->sax->error(ctxt->userData,
939 "char encoding UTF16 little endian not supported\n");
940 break;
941 case XML_CHAR_ENCODING_UTF16BE:
942 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
943 ctxt->sax->error(ctxt->userData,
944 "char encoding UTF16 big endian not supported\n");
945 break;
946 case XML_CHAR_ENCODING_UCS4LE:
947 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
948 ctxt->sax->error(ctxt->userData,
949 "char encoding USC4 little endian not supported\n");
950 break;
951 case XML_CHAR_ENCODING_UCS4BE:
952 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
953 ctxt->sax->error(ctxt->userData,
954 "char encoding USC4 big endian not supported\n");
955 break;
956 case XML_CHAR_ENCODING_EBCDIC:
957 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
958 ctxt->sax->error(ctxt->userData,
959 "char encoding EBCDIC not supported\n");
960 break;
961 case XML_CHAR_ENCODING_UCS4_2143:
962 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
963 ctxt->sax->error(ctxt->userData,
964 "char encoding UCS4 2143 not supported\n");
965 break;
966 case XML_CHAR_ENCODING_UCS4_3412:
967 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
968 ctxt->sax->error(ctxt->userData,
969 "char encoding UCS4 3412 not supported\n");
970 break;
971 case XML_CHAR_ENCODING_UCS2:
972 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
973 ctxt->sax->error(ctxt->userData,
974 "char encoding UCS2 not supported\n");
975 break;
976 case XML_CHAR_ENCODING_8859_1:
977 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
978 ctxt->sax->error(ctxt->userData,
979 "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
980 break;
981 case XML_CHAR_ENCODING_8859_2:
982 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
983 ctxt->sax->error(ctxt->userData,
984 "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
985 break;
986 case XML_CHAR_ENCODING_8859_3:
987 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
988 ctxt->sax->error(ctxt->userData,
989 "char encoding ISO_8859_3 not supported\n");
990 break;
991 case XML_CHAR_ENCODING_8859_4:
992 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
993 ctxt->sax->error(ctxt->userData,
994 "char encoding ISO_8859_4 not supported\n");
995 break;
996 case XML_CHAR_ENCODING_8859_5:
997 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
998 ctxt->sax->error(ctxt->userData,
999 "char encoding ISO_8859_5 not supported\n");
1000 break;
1001 case XML_CHAR_ENCODING_8859_6:
1002 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1003 ctxt->sax->error(ctxt->userData,
1004 "char encoding ISO_8859_6 not supported\n");
1005 break;
1006 case XML_CHAR_ENCODING_8859_7:
1007 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1008 ctxt->sax->error(ctxt->userData,
1009 "char encoding ISO_8859_7 not supported\n");
1010 break;
1011 case XML_CHAR_ENCODING_8859_8:
1012 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1013 ctxt->sax->error(ctxt->userData,
1014 "char encoding ISO_8859_8 not supported\n");
1015 break;
1016 case XML_CHAR_ENCODING_8859_9:
1017 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1018 ctxt->sax->error(ctxt->userData,
1019 "char encoding ISO_8859_9 not supported\n");
1020 break;
1021 case XML_CHAR_ENCODING_2022_JP:
1022 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1023 ctxt->sax->error(ctxt->userData,
1024 "char encoding ISO-2022-JPnot supported\n");
1025 break;
1026 case XML_CHAR_ENCODING_SHIFT_JIS:
1027 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1028 ctxt->sax->error(ctxt->userData,
1029 "char encoding Shift_JISnot supported\n");
1030 break;
1031 case XML_CHAR_ENCODING_EUC_JP:
1032 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1033 ctxt->sax->error(ctxt->userData,
1034 "char encoding EUC-JPnot supported\n");
1035 break;
1036 }
1037}
1038
1039
1040/************************************************************************
1041 * *
1042 * Commodity functions, cleanup needed ? *
1043 * *
1044 ************************************************************************/
1045
1046/**
1047 * areBlanks:
1048 * @ctxt: an HTML parser context
1049 * @str: a CHAR *
1050 * @len: the size of @str
1051 *
1052 * Is this a sequence of blank chars that one can ignore ?
1053 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001054 * Returns 1 if ignorable 0 otherwise.
1055 */
1056
1057static int areBlanks(htmlParserCtxtPtr ctxt, const CHAR *str, int len) {
1058 int i;
1059 xmlNodePtr lastChild;
1060
1061 for (i = 0;i < len;i++)
1062 if (!(IS_BLANK(str[i]))) return(0);
1063
1064 if (CUR != '<') return(0);
1065 if (ctxt->node == NULL) return(0);
1066 lastChild = xmlGetLastChild(ctxt->node);
1067 if (lastChild == NULL) {
1068 if (ctxt->node->content != NULL) return(0);
1069 } else if (xmlNodeIsText(lastChild))
1070 return(0);
1071 return(1);
1072}
1073
1074/**
1075 * htmlHandleEntity:
1076 * @ctxt: an HTML parser context
1077 * @entity: an XML entity pointer.
1078 *
1079 * Default handling of an HTML entity, call the parser with the
1080 * substitution string
1081 */
1082
1083void
1084htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1085 int len;
1086
1087 if (entity->content == NULL) {
1088 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1089 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1090 entity->name);
1091 ctxt->wellFormed = 0;
1092 return;
1093 }
1094 len = xmlStrlen(entity->content);
1095
1096 /*
1097 * Just handle the content as a set of chars.
1098 */
1099 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1100 ctxt->sax->characters(ctxt->userData, entity->content, len);
1101
1102}
1103
1104/**
1105 * htmlNewDoc:
1106 * @URI: URI for the dtd, or NULL
1107 * @ExternalID: the external ID of the DTD, or NULL
1108 *
1109 * Returns a new document
1110 */
1111htmlDocPtr
1112htmlNewDoc(const CHAR *URI, const CHAR *ExternalID) {
1113 xmlDocPtr cur;
1114
1115 /*
1116 * Allocate a new document and fill the fields.
1117 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001118 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001119 if (cur == NULL) {
1120 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1121 return(NULL);
1122 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001123 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001124
1125 cur->type = XML_DOCUMENT_NODE;
1126 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001127 cur->intSubset = NULL;
Daniel Veillardb96e6431999-08-29 21:02:19 +00001128 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001129 cur->name = NULL;
1130 cur->root = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001131 cur->extSubset = NULL;
1132 cur->oldNs = NULL;
1133 cur->encoding = NULL;
1134 cur->standalone = 1;
1135 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001136 cur->ids = NULL;
1137 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001138#ifndef XML_WITHOUT_CORBA
1139 cur->_private = NULL;
1140 cur->vepv = NULL;
1141#endif
1142 return(cur);
1143}
1144
1145
1146/************************************************************************
1147 * *
1148 * The parser itself *
1149 * Relates to http://www.w3.org/TR/html40 *
1150 * *
1151 ************************************************************************/
1152
1153/************************************************************************
1154 * *
1155 * The parser itself *
1156 * *
1157 ************************************************************************/
1158
1159/**
1160 * htmlParseHTMLName:
1161 * @ctxt: an HTML parser context
1162 *
1163 * parse an HTML tag or attribute name, note that we convert it to uppercase
1164 * since HTML names are not case-sensitive.
1165 *
1166 * Returns the Tag Name parsed or NULL
1167 */
1168
1169CHAR *
1170htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1171 CHAR *ret = NULL;
1172 int i = 0;
1173 CHAR loc[100];
1174
1175 if (!IS_LETTER(CUR) && (CUR != '_') &&
1176 (CUR != ':')) return(NULL);
1177
1178 while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
1179 if ((CUR >= 0x61) && (CUR <= 0x7a)) loc[i] = CUR - 0x20;
1180 else loc[i] = CUR;
1181 i++;
1182
1183 NEXT;
1184 }
1185
1186 ret = xmlStrndup(loc, i);
1187
1188 return(ret);
1189}
1190
1191/**
1192 * htmlParseName:
1193 * @ctxt: an HTML parser context
1194 *
1195 * parse an HTML name, this routine is case sensistive.
1196 *
1197 * Returns the Name parsed or NULL
1198 */
1199
1200CHAR *
1201htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001202 CHAR buf[HTML_MAX_NAMELEN];
1203 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001204
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001205 GROW;
1206 if (!IS_LETTER(CUR) && (CUR != '_')) {
1207 return(NULL);
1208 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001209
1210 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1211 (CUR == '.') || (CUR == '-') ||
1212 (CUR == '_') || (CUR == ':') ||
1213 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001214 (IS_EXTENDER(CUR))) {
1215 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001216 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001217 if (len >= HTML_MAX_NAMELEN) {
1218 fprintf(stderr,
1219 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1220 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1221 (CUR == '.') || (CUR == '-') ||
1222 (CUR == '_') || (CUR == ':') ||
1223 (IS_COMBINING(CUR)) ||
1224 (IS_EXTENDER(CUR)))
1225 NEXT;
1226 break;
1227 }
1228 }
1229 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001230}
1231
1232/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001233 * htmlParseHTMLAttribute:
1234 * @ctxt: an HTML parser context
1235 *
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001236 * parse an HTML attribute value (without quotes).
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001237 *
1238 * Returns the Nmtoken parsed or NULL
1239 */
1240
1241CHAR *
1242htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt) {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001243 CHAR buf[HTML_MAX_NAMELEN];
1244 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001245
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001246 GROW;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001247 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1248 (CUR != '&') && (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001249 (CUR != '\'') && (CUR != '"')) {
1250 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001251 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001252 if (len >= HTML_MAX_NAMELEN) {
1253 fprintf(stderr,
1254 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1255 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1256 (CUR != '&') && (CUR != '>') &&
1257 (CUR != '\'') && (CUR != '"'))
1258 NEXT;
1259 break;
1260 }
1261 }
1262 return(xmlStrndup(buf, len));
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001263}
1264
1265/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001266 * htmlParseNmtoken:
1267 * @ctxt: an HTML parser context
1268 *
1269 * parse an HTML Nmtoken.
1270 *
1271 * Returns the Nmtoken parsed or NULL
1272 */
1273
1274CHAR *
1275htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001276 CHAR buf[HTML_MAX_NAMELEN];
1277 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001278
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001279 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001280 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1281 (CUR == '.') || (CUR == '-') ||
1282 (CUR == '_') || (CUR == ':') ||
1283 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001284 (IS_EXTENDER(CUR))) {
1285 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001286 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001287 if (len >= HTML_MAX_NAMELEN) {
1288 fprintf(stderr,
1289 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1290 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1291 (CUR == '.') || (CUR == '-') ||
1292 (CUR == '_') || (CUR == ':') ||
1293 (IS_COMBINING(CUR)) ||
1294 (IS_EXTENDER(CUR)))
1295 NEXT;
1296 break;
1297 }
1298 }
1299 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001300}
1301
1302/**
1303 * htmlParseEntityRef:
1304 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001305 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001306 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001307 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001308 *
1309 * [68] EntityRef ::= '&' Name ';'
1310 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001311 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1312 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001313 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001314htmlEntityDescPtr
1315htmlParseEntityRef(htmlParserCtxtPtr ctxt, CHAR **str) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001316 CHAR *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001317 htmlEntityDescPtr ent = NULL;
1318 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001319
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001320 if (CUR == '&') {
1321 NEXT;
1322 name = htmlParseName(ctxt);
1323 if (name == NULL) {
1324 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1325 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1326 ctxt->wellFormed = 0;
1327 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001328 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001329 if (CUR == ';') {
1330 NEXT;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001331 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001332
1333 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001334 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001335 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001336 ent = htmlEntityLookup(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001337 } else {
1338 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1339 ctxt->sax->error(ctxt->userData,
1340 "htmlParseEntityRef: expecting ';'\n");
1341 ctxt->wellFormed = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001342 if (ctxt->sax->characters != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001343 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001344 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1345 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00001346 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001347 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001348 }
1349 }
1350 return(ent);
1351}
1352
1353/**
1354 * htmlParseAttValue:
1355 * @ctxt: an HTML parser context
1356 *
1357 * parse a value for an attribute
1358 * Note: the parser won't do substitution of entities here, this
1359 * will be handled later in xmlStringGetNodeList, unless it was
1360 * asked for ctxt->replaceEntities != 0
1361 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001362 * Returns the AttValue parsed or NULL.
1363 */
1364
1365CHAR *
1366htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1367 CHAR *ret = NULL;
1368
1369 if (CUR == '"') {
1370 NEXT;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001371 ret = htmlDecodeEntities(ctxt, -1, '"', '<', 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001372 if (CUR == '<') {
1373 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1374 ctxt->sax->error(ctxt->userData,
1375 "Unescaped '<' not allowed in attributes values\n");
1376 ctxt->wellFormed = 0;
1377 }
1378 if (CUR != '"') {
1379 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1380 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1381 ctxt->wellFormed = 0;
1382 } else
1383 NEXT;
1384 } else if (CUR == '\'') {
1385 NEXT;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001386 ret = htmlDecodeEntities(ctxt, -1, '\'', '<', 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001387 if (CUR == '<') {
1388 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1389 ctxt->sax->error(ctxt->userData,
1390 "Unescaped '<' not allowed in attributes values\n");
1391 ctxt->wellFormed = 0;
1392 }
1393 if (CUR != '\'') {
1394 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1395 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1396 ctxt->wellFormed = 0;
1397 } else
1398 NEXT;
1399 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001400 /*
1401 * That's an HTMLism, the attribute value may not be quoted
1402 */
1403 ret = htmlParseHTMLAttribute(ctxt);
1404 if (ret == NULL) {
1405 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1406 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1407 ctxt->wellFormed = 0;
1408 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001409 }
1410
1411 return(ret);
1412}
1413
1414/**
1415 * htmlParseSystemLiteral:
1416 * @ctxt: an HTML parser context
1417 *
1418 * parse an HTML Literal
1419 *
1420 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1421 *
1422 * Returns the SystemLiteral parsed or NULL
1423 */
1424
1425CHAR *
1426htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1427 const CHAR *q;
1428 CHAR *ret = NULL;
1429
1430 if (CUR == '"') {
1431 NEXT;
1432 q = CUR_PTR;
1433 while ((IS_CHAR(CUR)) && (CUR != '"'))
1434 NEXT;
1435 if (!IS_CHAR(CUR)) {
1436 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1437 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1438 ctxt->wellFormed = 0;
1439 } else {
1440 ret = xmlStrndup(q, CUR_PTR - q);
1441 NEXT;
1442 }
1443 } else if (CUR == '\'') {
1444 NEXT;
1445 q = CUR_PTR;
1446 while ((IS_CHAR(CUR)) && (CUR != '\''))
1447 NEXT;
1448 if (!IS_CHAR(CUR)) {
1449 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1450 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1451 ctxt->wellFormed = 0;
1452 } else {
1453 ret = xmlStrndup(q, CUR_PTR - q);
1454 NEXT;
1455 }
1456 } else {
1457 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1458 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1459 ctxt->wellFormed = 0;
1460 }
1461
1462 return(ret);
1463}
1464
1465/**
1466 * htmlParsePubidLiteral:
1467 * @ctxt: an HTML parser context
1468 *
1469 * parse an HTML public literal
1470 *
1471 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1472 *
1473 * Returns the PubidLiteral parsed or NULL.
1474 */
1475
1476CHAR *
1477htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1478 const CHAR *q;
1479 CHAR *ret = NULL;
1480 /*
1481 * Name ::= (Letter | '_') (NameChar)*
1482 */
1483 if (CUR == '"') {
1484 NEXT;
1485 q = CUR_PTR;
1486 while (IS_PUBIDCHAR(CUR)) NEXT;
1487 if (CUR != '"') {
1488 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1489 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1490 ctxt->wellFormed = 0;
1491 } else {
1492 ret = xmlStrndup(q, CUR_PTR - q);
1493 NEXT;
1494 }
1495 } else if (CUR == '\'') {
1496 NEXT;
1497 q = CUR_PTR;
1498 while ((IS_LETTER(CUR)) && (CUR != '\''))
1499 NEXT;
1500 if (!IS_LETTER(CUR)) {
1501 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1502 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1503 ctxt->wellFormed = 0;
1504 } else {
1505 ret = xmlStrndup(q, CUR_PTR - q);
1506 NEXT;
1507 }
1508 } else {
1509 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1510 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1511 ctxt->wellFormed = 0;
1512 }
1513
1514 return(ret);
1515}
1516
1517/**
1518 * htmlParseCharData:
1519 * @ctxt: an HTML parser context
1520 * @cdata: int indicating whether we are within a CDATA section
1521 *
1522 * parse a CharData section.
1523 * if we are within a CDATA section ']]>' marks an end of section.
1524 *
1525 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1526 */
1527
1528void
1529htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1530 const CHAR *q;
1531
1532 q = CUR_PTR;
1533 while ((IS_CHAR(CUR)) && (CUR != '<') &&
1534 (CUR != '&')) {
1535 if ((CUR == ']') && (NXT(1) == ']') &&
1536 (NXT(2) == '>')) {
1537 if (cdata) break;
1538 else {
1539 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1540 ctxt->sax->error(ctxt->userData,
1541 "Sequence ']]>' not allowed in content\n");
1542 ctxt->wellFormed = 0;
1543 }
1544 }
1545 NEXT;
1546 }
1547 if (q == CUR_PTR) return;
1548
1549 /*
1550 * Ok the segment [q CUR_PTR] is to be consumed as chars.
1551 */
1552 if (ctxt->sax != NULL) {
1553 if (areBlanks(ctxt, q, CUR_PTR - q)) {
1554 if (ctxt->sax->ignorableWhitespace != NULL)
1555 ctxt->sax->ignorableWhitespace(ctxt->userData, q, CUR_PTR - q);
1556 } else {
1557 if (ctxt->sax->characters != NULL)
1558 ctxt->sax->characters(ctxt->userData, q, CUR_PTR - q);
1559 }
1560 }
1561}
1562
1563/**
1564 * htmlParseExternalID:
1565 * @ctxt: an HTML parser context
1566 * @publicID: a CHAR** receiving PubidLiteral
1567 * @strict: indicate whether we should restrict parsing to only
1568 * production [75], see NOTE below
1569 *
1570 * Parse an External ID or a Public ID
1571 *
1572 * NOTE: Productions [75] and [83] interract badly since [75] can generate
1573 * 'PUBLIC' S PubidLiteral S SystemLiteral
1574 *
1575 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1576 * | 'PUBLIC' S PubidLiteral S SystemLiteral
1577 *
1578 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1579 *
1580 * Returns the function returns SystemLiteral and in the second
1581 * case publicID receives PubidLiteral, is strict is off
1582 * it is possible to return NULL and have publicID set.
1583 */
1584
1585CHAR *
1586htmlParseExternalID(htmlParserCtxtPtr ctxt, CHAR **publicID, int strict) {
1587 CHAR *URI = NULL;
1588
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001589 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1590 (UPP(2) == 'S') && (UPP(3) == 'T') &&
1591 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001592 SKIP(6);
1593 if (!IS_BLANK(CUR)) {
1594 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1595 ctxt->sax->error(ctxt->userData,
1596 "Space required after 'SYSTEM'\n");
1597 ctxt->wellFormed = 0;
1598 }
1599 SKIP_BLANKS;
1600 URI = htmlParseSystemLiteral(ctxt);
1601 if (URI == NULL) {
1602 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1603 ctxt->sax->error(ctxt->userData,
1604 "htmlParseExternalID: SYSTEM, no URI\n");
1605 ctxt->wellFormed = 0;
1606 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001607 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1608 (UPP(2) == 'B') && (UPP(3) == 'L') &&
1609 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001610 SKIP(6);
1611 if (!IS_BLANK(CUR)) {
1612 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1613 ctxt->sax->error(ctxt->userData,
1614 "Space required after 'PUBLIC'\n");
1615 ctxt->wellFormed = 0;
1616 }
1617 SKIP_BLANKS;
1618 *publicID = htmlParsePubidLiteral(ctxt);
1619 if (*publicID == NULL) {
1620 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1621 ctxt->sax->error(ctxt->userData,
1622 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1623 ctxt->wellFormed = 0;
1624 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001625 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001626 if ((CUR == '"') || (CUR == '\'')) {
1627 URI = htmlParseSystemLiteral(ctxt);
1628 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001629 }
1630 return(URI);
1631}
1632
1633/**
1634 * htmlParseComment:
1635 * @ctxt: an HTML parser context
1636 * @create: should we create a node, or just skip the content
1637 *
1638 * Parse an XML (SGML) comment <!-- .... -->
1639 *
1640 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1641 */
1642void
1643htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
1644 const CHAR *q, *start;
1645 const CHAR *r;
1646 CHAR *val;
1647
1648 /*
1649 * Check that there is a comment right here.
1650 */
1651 if ((CUR != '<') || (NXT(1) != '!') ||
1652 (NXT(2) != '-') || (NXT(3) != '-')) return;
1653
1654 SKIP(4);
1655 start = q = CUR_PTR;
1656 NEXT;
1657 r = CUR_PTR;
1658 NEXT;
1659 while (IS_CHAR(CUR) &&
1660 ((CUR == ':') || (CUR != '>') ||
1661 (*r != '-') || (*q != '-'))) {
1662 if ((*r == '-') && (*q == '-')) {
1663 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1664 ctxt->sax->error(ctxt->userData,
1665 "Comment must not contain '--' (double-hyphen)`\n");
1666 ctxt->wellFormed = 0;
1667 }
1668 NEXT;r++;q++;
1669 }
1670 if (!IS_CHAR(CUR)) {
1671 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1672 ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", start);
1673 ctxt->wellFormed = 0;
1674 } else {
1675 NEXT;
1676 if (create) {
1677 val = xmlStrndup(start, q - start);
1678 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL))
1679 ctxt->sax->comment(ctxt->userData, val);
Daniel Veillard6454aec1999-09-02 22:04:43 +00001680 xmlFree(val);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001681 }
1682 }
1683}
1684
1685/**
1686 * htmlParseCharRef:
1687 * @ctxt: an HTML parser context
1688 *
1689 * parse Reference declarations
1690 *
1691 * [66] CharRef ::= '&#' [0-9]+ ';' |
1692 * '&#x' [0-9a-fA-F]+ ';'
1693 *
1694 * Returns the value parsed (as an int)
1695 */
1696int
1697htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1698 int val = 0;
1699
1700 if ((CUR == '&') && (NXT(1) == '#') &&
1701 (NXT(2) == 'x')) {
1702 SKIP(3);
1703 while (CUR != ';') {
1704 if ((CUR >= '0') && (CUR <= '9'))
1705 val = val * 16 + (CUR - '0');
1706 else if ((CUR >= 'a') && (CUR <= 'f'))
1707 val = val * 16 + (CUR - 'a') + 10;
1708 else if ((CUR >= 'A') && (CUR <= 'F'))
1709 val = val * 16 + (CUR - 'A') + 10;
1710 else {
1711 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1712 ctxt->sax->error(ctxt->userData,
1713 "htmlParseCharRef: invalid hexadecimal value\n");
1714 ctxt->wellFormed = 0;
1715 val = 0;
1716 break;
1717 }
1718 NEXT;
1719 }
1720 if (CUR == ';')
1721 NEXT;
1722 } else if ((CUR == '&') && (NXT(1) == '#')) {
1723 SKIP(2);
1724 while (CUR != ';') {
1725 if ((CUR >= '0') && (CUR <= '9'))
1726 val = val * 10 + (CUR - '0');
1727 else {
1728 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1729 ctxt->sax->error(ctxt->userData,
1730 "htmlParseCharRef: invalid decimal value\n");
1731 ctxt->wellFormed = 0;
1732 val = 0;
1733 break;
1734 }
1735 NEXT;
1736 }
1737 if (CUR == ';')
1738 NEXT;
1739 } else {
1740 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1741 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
1742 ctxt->wellFormed = 0;
1743 }
1744 /*
1745 * Check the value IS_CHAR ...
1746 */
1747 if (IS_CHAR(val)) {
1748 return(val);
1749 } else {
1750 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1751 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid CHAR value %d\n",
1752 val);
1753 ctxt->wellFormed = 0;
1754 }
1755 return(0);
1756}
1757
1758
1759/**
1760 * htmlParseDocTypeDecl :
1761 * @ctxt: an HTML parser context
1762 *
1763 * parse a DOCTYPE declaration
1764 *
1765 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
1766 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
1767 */
1768
1769void
1770htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1771 CHAR *name;
1772 CHAR *ExternalID = NULL;
1773 CHAR *URI = NULL;
1774
1775 /*
1776 * We know that '<!DOCTYPE' has been detected.
1777 */
1778 SKIP(9);
1779
1780 SKIP_BLANKS;
1781
1782 /*
1783 * Parse the DOCTYPE name.
1784 */
1785 name = htmlParseName(ctxt);
1786 if (name == NULL) {
1787 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1788 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
1789 ctxt->wellFormed = 0;
1790 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001791 /*
1792 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
1793 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001794
1795 SKIP_BLANKS;
1796
1797 /*
1798 * Check for SystemID and ExternalID
1799 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001800 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001801 SKIP_BLANKS;
1802
1803 /*
1804 * We should be at the end of the DOCTYPE declaration.
1805 */
1806 if (CUR != '>') {
1807 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1808 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
1809 ctxt->wellFormed = 0;
1810 /* We shouldn't try to resynchronize ... */
1811 } else {
1812 }
1813 NEXT;
1814
1815 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001816 * Create the document accordingly to the DOCTYPE
1817 */
1818 ctxt->myDoc = htmlNewDoc(URI, ExternalID);
1819
1820 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001821 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001822 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001823 if (URI != NULL) xmlFree(URI);
1824 if (ExternalID != NULL) xmlFree(ExternalID);
1825 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001826}
1827
1828/**
1829 * htmlParseAttribute:
1830 * @ctxt: an HTML parser context
1831 * @value: a CHAR ** used to store the value of the attribute
1832 *
1833 * parse an attribute
1834 *
1835 * [41] Attribute ::= Name Eq AttValue
1836 *
1837 * [25] Eq ::= S? '=' S?
1838 *
1839 * With namespace:
1840 *
1841 * [NS 11] Attribute ::= QName Eq AttValue
1842 *
1843 * Also the case QName == xmlns:??? is handled independently as a namespace
1844 * definition.
1845 *
1846 * Returns the attribute name, and the value in *value.
1847 */
1848
1849CHAR *
1850htmlParseAttribute(htmlParserCtxtPtr ctxt, CHAR **value) {
1851 CHAR *name, *val;
1852
1853 *value = NULL;
1854 name = htmlParseName(ctxt);
1855 if (name == NULL) {
1856 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1857 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
1858 ctxt->wellFormed = 0;
1859 return(NULL);
1860 }
1861
1862 /*
1863 * read the value
1864 */
1865 SKIP_BLANKS;
1866 if (CUR == '=') {
1867 NEXT;
1868 SKIP_BLANKS;
1869 val = htmlParseAttValue(ctxt);
1870 } else {
1871 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1872 ctxt->sax->error(ctxt->userData,
1873 "Specification mandate value for attribute %s\n", name);
1874 ctxt->wellFormed = 0;
1875 return(NULL);
1876 }
1877
1878 *value = val;
1879 return(name);
1880}
1881
1882/**
1883 * htmlParseStartTag:
1884 * @ctxt: an HTML parser context
1885 *
1886 * parse a start of tag either for rule element or
1887 * EmptyElement. In both case we don't parse the tag closing chars.
1888 *
1889 * [40] STag ::= '<' Name (S Attribute)* S? '>'
1890 *
1891 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
1892 *
1893 * With namespace:
1894 *
1895 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
1896 *
1897 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
1898 *
1899 * Returns the element name parsed
1900 */
1901
1902CHAR *
1903htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1904 CHAR *name;
1905 CHAR *attname;
1906 CHAR *attvalue;
1907 const CHAR **atts = NULL;
1908 int nbatts = 0;
1909 int maxatts = 0;
1910 int i;
1911
1912 if (CUR != '<') return(NULL);
1913 NEXT;
1914
1915 name = htmlParseHTMLName(ctxt);
1916 if (name == NULL) {
1917 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1918 ctxt->sax->error(ctxt->userData,
1919 "htmlParseStartTag: invalid element name\n");
1920 ctxt->wellFormed = 0;
1921 return(NULL);
1922 }
1923
1924 /*
1925 * Check for auto-closure of HTML elements.
1926 */
1927 htmlAutoClose(ctxt, name);
1928
1929 /*
1930 * Now parse the attributes, it ends up with the ending
1931 *
1932 * (S Attribute)* S?
1933 */
1934 SKIP_BLANKS;
1935 while ((IS_CHAR(CUR)) &&
1936 (CUR != '>') &&
1937 ((CUR != '/') || (NXT(1) != '>'))) {
1938 const CHAR *q = CUR_PTR;
1939
1940 attname = htmlParseAttribute(ctxt, &attvalue);
1941 if ((attname != NULL) && (attvalue != NULL)) {
1942 /*
1943 * Well formedness requires at most one declaration of an attribute
1944 */
1945 for (i = 0; i < nbatts;i += 2) {
1946 if (!xmlStrcmp(atts[i], attname)) {
1947 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1948 ctxt->sax->error(ctxt->userData, "Attribute %s redefined\n",
1949 name);
1950 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00001951 xmlFree(attname);
1952 xmlFree(attvalue);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001953 break;
1954 }
1955 }
1956
1957 /*
1958 * Add the pair to atts
1959 */
1960 if (atts == NULL) {
1961 maxatts = 10;
Daniel Veillard6454aec1999-09-02 22:04:43 +00001962 atts = (const CHAR **) xmlMalloc(maxatts * sizeof(CHAR *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001963 if (atts == NULL) {
1964 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillard82150d81999-07-07 07:32:15 +00001965 maxatts * (long)sizeof(CHAR *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001966 return(NULL);
1967 }
1968 } else if (nbatts + 2 < maxatts) {
1969 maxatts *= 2;
Daniel Veillard6454aec1999-09-02 22:04:43 +00001970 atts = (const CHAR **) xmlRealloc(atts, maxatts * sizeof(CHAR *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001971 if (atts == NULL) {
1972 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillard82150d81999-07-07 07:32:15 +00001973 maxatts * (long)sizeof(CHAR *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001974 return(NULL);
1975 }
1976 }
1977 atts[nbatts++] = attname;
1978 atts[nbatts++] = attvalue;
1979 atts[nbatts] = NULL;
1980 atts[nbatts + 1] = NULL;
1981 }
1982
1983 SKIP_BLANKS;
1984 if (q == CUR_PTR) {
1985 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1986 ctxt->sax->error(ctxt->userData,
1987 "htmlParseStartTag: problem parsing attributes\n");
1988 ctxt->wellFormed = 0;
1989 break;
1990 }
1991 }
1992
1993 /*
1994 * SAX: Start of Element !
1995 */
1996 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1997 ctxt->sax->startElement(ctxt->userData, name, atts);
1998
1999 if (atts != NULL) {
Daniel Veillard6454aec1999-09-02 22:04:43 +00002000 for (i = 0;i < nbatts;i++) xmlFree((CHAR *) atts[i]);
2001 xmlFree(atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002002 }
2003 return(name);
2004}
2005
2006/**
2007 * htmlParseEndTag:
2008 * @ctxt: an HTML parser context
2009 * @tagname: the tag name as parsed in the opening tag.
2010 *
2011 * parse an end of tag
2012 *
2013 * [42] ETag ::= '</' Name S? '>'
2014 *
2015 * With namespace
2016 *
2017 * [NS 9] ETag ::= '</' QName S? '>'
2018 */
2019
2020void
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002021htmlParseEndTag(htmlParserCtxtPtr ctxt, const CHAR *tagname) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002022 CHAR *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002023 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002024
2025 if ((CUR != '<') || (NXT(1) != '/')) {
2026 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2027 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2028 ctxt->wellFormed = 0;
2029 return;
2030 }
2031 SKIP(2);
2032
2033 name = htmlParseHTMLName(ctxt);
2034
2035 /*
2036 * We should definitely be at the ending "S? '>'" part
2037 */
2038 SKIP_BLANKS;
2039 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2040 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2041 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2042 ctxt->wellFormed = 0;
2043 } else
2044 NEXT;
2045
2046 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002047 * Check that we are not closing an already closed tag,
2048 * <p><b>...</p></b> is a really common error !
2049 */
2050 for (i = ctxt->nodeNr - 1;i >= 0;i--) {
2051 if ((ctxt->nodeTab[i] != NULL) &&
2052 (!xmlStrcmp(tagname, ctxt->nodeTab[i]->name)))
2053 break;
2054 }
2055 if (i < 0) {
2056 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2057 ctxt->sax->error(ctxt->userData,
2058 "htmlParseEndTag: unexpected close for tag %s\n",
2059 tagname);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002060 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002061 ctxt->wellFormed = 0;
2062 return;
2063 }
2064
2065 /*
2066 * Check for auto-closure of HTML elements.
2067 */
2068 htmlAutoCloseOnClose(ctxt, name);
2069
2070 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002071 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002072 * With the exception that the autoclose may have popped stuff out
2073 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002074 */
2075 if (xmlStrcmp(name, tagname)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002076 if ((ctxt->node != NULL) &&
2077 (xmlStrcmp(ctxt->node->name, name))) {
2078 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2079 ctxt->sax->error(ctxt->userData,
2080 "Opening and ending tag mismatch: %s and %s\n",
2081 name, ctxt->node->name);
2082 ctxt->wellFormed = 0;
2083 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002084 }
2085
2086 /*
2087 * SAX: End of Tag
2088 */
2089 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2090 ctxt->sax->endElement(ctxt->userData, name);
2091
2092 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00002093 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002094
2095 return;
2096}
2097
2098
2099/**
2100 * htmlParseReference:
2101 * @ctxt: an HTML parser context
2102 *
2103 * parse and handle entity references in content,
2104 * this will end-up in a call to character() since this is either a
2105 * CharRef, or a predefined entity.
2106 */
2107void
2108htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002109 htmlEntityDescPtr ent;
2110 CHAR out[2];
2111 CHAR *name;
2112 int val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002113 if (CUR != '&') return;
2114
2115 if (NXT(1) == '#') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002116 val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +00002117 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002118 out[0] = val;
2119 out[1] = 0;
2120 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2121 ctxt->sax->characters(ctxt->userData, out, 1);
2122 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002123 ent = htmlParseEntityRef(ctxt, &name);
2124 if (name == NULL) return; /* Shall we output & anyway ? */
2125 if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2126 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002127 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002128 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillardb96e6431999-08-29 21:02:19 +00002129 ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002130 }
2131 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002132 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002133 out[0] = ent->value;
2134 out[1] = 0;
2135 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2136 ctxt->sax->characters(ctxt->userData, out, 1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002137 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002138 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002139 }
2140}
2141
2142/**
2143 * htmlParseContent:
2144 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002145 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002146 *
2147 * Parse a content: comment, sub-element, reference or text.
2148 *
2149 */
2150
2151void
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002152htmlParseContent(htmlParserCtxtPtr ctxt, const CHAR *name) {
2153 htmlNodePtr currentNode;
2154
2155 currentNode = ctxt->node;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002156 while ((CUR != '<') || (NXT(1) != '/')) {
2157 const CHAR *test = CUR_PTR;
2158
2159 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002160 * Has this node been popped out during parsing of
2161 * the next element
2162 */
2163 if (currentNode != ctxt->node) return;
2164
2165 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002166 * First case : a comment
2167 */
2168 if ((CUR == '<') && (NXT(1) == '!') &&
2169 (NXT(2) == '-') && (NXT(3) == '-')) {
2170 htmlParseComment(ctxt, 1);
2171 }
2172
2173 /*
2174 * Second case : a sub-element.
2175 */
2176 else if (CUR == '<') {
2177 htmlParseElement(ctxt);
2178 }
2179
2180 /*
2181 * Third case : a reference. If if has not been resolved,
2182 * parsing returns it's Name, create the node
2183 */
2184 else if (CUR == '&') {
2185 htmlParseReference(ctxt);
2186 }
2187
2188 /*
2189 * Last case, text. Note that References are handled directly.
2190 */
2191 else {
2192 htmlParseCharData(ctxt, 0);
2193 }
2194
2195 if (test == CUR_PTR) {
2196 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2197 ctxt->sax->error(ctxt->userData,
2198 "detected an error in element content\n");
2199 ctxt->wellFormed = 0;
2200 break;
2201 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002202 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002203 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002204
2205 /*
2206 * parse the end of tag: '</' should be here.
2207 */
2208 htmlParseEndTag(ctxt, name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002209}
2210
2211/**
2212 * htmlParseElement:
2213 * @ctxt: an HTML parser context
2214 *
2215 * parse an HTML element, this is highly recursive
2216 *
2217 * [39] element ::= EmptyElemTag | STag content ETag
2218 *
2219 * [41] Attribute ::= Name Eq AttValue
2220 */
2221
2222void
2223htmlParseElement(htmlParserCtxtPtr ctxt) {
2224 const CHAR *openTag = CUR_PTR;
2225 CHAR *name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002226 htmlNodePtr currentNode;
2227 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002228 htmlParserNodeInfo node_info;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002229
2230 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002231 if (ctxt->record_info) {
2232 node_info.begin_pos = ctxt->input->consumed +
2233 (CUR_PTR - ctxt->input->base);
2234 node_info.begin_line = ctxt->input->line;
2235 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002236
2237 name = htmlParseStartTag(ctxt);
2238 if (name == NULL) {
2239 return;
2240 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002241 currentNode = ctxt->node;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002242
2243 /*
2244 * Lookup the info for that element.
2245 */
2246 info = htmlTagLookup(name);
2247 if (info == NULL) {
2248 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2249 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2250 name);
2251 ctxt->wellFormed = 0;
2252 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002253/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002254 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2255 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2256 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002257 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002258 }
2259
2260 /*
2261 * Check for an Empty Element labelled the XML/SGML way
2262 */
2263 if ((CUR == '/') && (NXT(1) == '>')) {
2264 SKIP(2);
2265 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2266 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002267 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002268 return;
2269 }
2270
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002271 if (CUR == '>') {
2272 NEXT;
2273 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002274 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2275 ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2276 openTag);
2277 ctxt->wellFormed = 0;
2278
2279 /*
2280 * end of parsing of this node.
2281 */
2282 nodePop(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002283 xmlFree(name);
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002284
2285 /*
2286 * Capture end position and add node
2287 */
2288 if ( currentNode != NULL && ctxt->record_info ) {
2289 node_info.end_pos = ctxt->input->consumed +
2290 (CUR_PTR - ctxt->input->base);
2291 node_info.end_line = ctxt->input->line;
2292 node_info.node = currentNode;
2293 xmlParserAddNodeInfo(ctxt, &node_info);
2294 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002295 return;
2296 }
2297
2298 /*
2299 * Check for an Empty Element from DTD definition
2300 */
2301 if ((info != NULL) && (info->empty)) {
2302 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2303 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002304 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002305 return;
2306 }
2307
2308 /*
2309 * Parse the content of the element:
2310 */
2311 currentNode = ctxt->node;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002312 htmlParseContent(ctxt, name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002313
2314 /*
2315 * check whether the element get popped due to auto closure
2316 * on start tag
2317 */
2318 if (currentNode != ctxt->node) {
Daniel Veillard6454aec1999-09-02 22:04:43 +00002319 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002320 return;
2321 }
2322
2323 if (!IS_CHAR(CUR)) {
2324 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2325 ctxt->sax->error(ctxt->userData,
2326 "Premature end of data in tag %.30s\n", openTag);
2327 ctxt->wellFormed = 0;
2328
2329 /*
2330 * end of parsing of this node.
2331 */
2332 nodePop(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002333 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002334 return;
2335 }
2336
Daniel Veillard6454aec1999-09-02 22:04:43 +00002337 xmlFree(name);
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002338
2339 /*
2340 * Capture end position and add node
2341 */
2342 if ( currentNode != NULL && ctxt->record_info ) {
2343 node_info.end_pos = ctxt->input->consumed +
2344 (CUR_PTR - ctxt->input->base);
2345 node_info.end_line = ctxt->input->line;
2346 node_info.node = currentNode;
2347 xmlParserAddNodeInfo(ctxt, &node_info);
2348 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002349}
2350
2351/**
2352 * htmlParseDocument :
2353 * @ctxt: an HTML parser context
2354 *
2355 * parse an HTML document (and build a tree if using the standard SAX
2356 * interface).
2357 *
2358 * Returns 0, -1 in case of error. the parser context is augmented
2359 * as a result of the parsing.
2360 */
2361
2362int
2363htmlParseDocument(htmlParserCtxtPtr ctxt) {
2364 htmlDefaultSAXHandlerInit();
2365 ctxt->html = 1;
2366
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002367 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002368 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00002369 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002370 */
2371 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2372 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2373
2374 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002375 * Wipe out everything which is before the first '<'
2376 */
2377 if (IS_BLANK(CUR)) {
2378 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2379 ctxt->sax->error(ctxt->userData,
2380 "Extra spaces at the beginning of the document are not allowed\n");
2381 ctxt->wellFormed = 0;
2382 SKIP_BLANKS;
2383 }
2384
2385 if (CUR == 0) {
2386 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2387 ctxt->sax->error(ctxt->userData, "Document is empty\n");
2388 ctxt->wellFormed = 0;
2389 }
2390
2391
2392 /*
2393 * Then possibly doc type declaration(s) and more Misc
2394 * (doctypedecl Misc*)?
2395 */
2396 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002397 (UPP(2) == 'D') && (UPP(3) == 'O') &&
2398 (UPP(4) == 'C') && (UPP(5) == 'T') &&
2399 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2400 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002401 htmlParseDocTypeDecl(ctxt);
2402 }
2403 SKIP_BLANKS;
2404
2405 /*
2406 * Create the document if not done already.
2407 */
2408 if (ctxt->myDoc == NULL) {
2409 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2410 }
2411
2412 /*
2413 * Time to start parsing the tree itself
2414 */
2415 htmlParseElement(ctxt);
2416
2417 /*
2418 * SAX: end of the document processing.
2419 */
2420 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2421 ctxt->sax->endDocument(ctxt->userData);
2422 if (! ctxt->wellFormed) return(-1);
2423 return(0);
2424}
2425
2426
2427/********************************************************************************
2428 * *
2429 * Parser contexts handling *
2430 * *
2431 ********************************************************************************/
2432
2433/**
2434 * xmlInitParserCtxt:
2435 * @ctxt: an HTML parser context
2436 *
2437 * Initialize a parser context
2438 */
2439
2440void
2441htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2442{
2443 htmlSAXHandler *sax;
2444
Daniel Veillard6454aec1999-09-02 22:04:43 +00002445 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002446 if (sax == NULL) {
2447 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2448 }
2449
2450 /* Allocate the Input stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002451 ctxt->inputTab = (htmlParserInputPtr *) xmlMalloc(5 * sizeof(htmlParserInputPtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002452 ctxt->inputNr = 0;
2453 ctxt->inputMax = 5;
2454 ctxt->input = NULL;
2455 ctxt->version = NULL;
2456 ctxt->encoding = NULL;
2457 ctxt->standalone = -1;
2458
2459 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002460 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002461 ctxt->nodeNr = 0;
2462 ctxt->nodeMax = 10;
2463 ctxt->node = NULL;
2464
2465 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2466 else {
2467 ctxt->sax = sax;
2468 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2469 }
2470 ctxt->userData = ctxt;
2471 ctxt->myDoc = NULL;
2472 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002473 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002474 ctxt->html = 1;
2475 ctxt->record_info = 0;
2476 xmlInitNodeInfoSeq(&ctxt->node_seq);
2477}
2478
2479/**
2480 * htmlFreeParserCtxt:
2481 * @ctxt: an HTML parser context
2482 *
2483 * Free all the memory used by a parser context. However the parsed
2484 * document in ctxt->myDoc is not freed.
2485 */
2486
2487void
2488htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2489{
2490 htmlParserInputPtr input;
2491
2492 if (ctxt == NULL) return;
2493
2494 while ((input = inputPop(ctxt)) != NULL) {
2495 xmlFreeInputStream(input);
2496 }
2497
Daniel Veillard6454aec1999-09-02 22:04:43 +00002498 if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
2499 if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
2500 if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002501 if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
Daniel Veillard6454aec1999-09-02 22:04:43 +00002502 xmlFree(ctxt->sax);
2503 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002504}
2505
2506/**
2507 * htmlCreateDocParserCtxt :
2508 * @cur: a pointer to an array of CHAR
2509 * @encoding: a free form C string describing the HTML document encoding, or NULL
2510 *
2511 * Create a parser context for an HTML document.
2512 *
2513 * Returns the new parser context or NULL
2514 */
2515htmlParserCtxtPtr
2516htmlCreateDocParserCtxt(CHAR *cur, const char *encoding) {
2517 htmlParserCtxtPtr ctxt;
2518 htmlParserInputPtr input;
2519 /* htmlCharEncoding enc; */
2520
Daniel Veillard6454aec1999-09-02 22:04:43 +00002521 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002522 if (ctxt == NULL) {
2523 perror("malloc");
2524 return(NULL);
2525 }
2526 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002527 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002528 if (input == NULL) {
2529 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002530 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002531 return(NULL);
2532 }
2533
2534 /*
2535 * plug some encoding conversion routines here. !!!
2536 if (encoding != NULL) {
2537 enc = htmlDetectCharEncoding(cur);
2538 htmlSwitchEncoding(ctxt, enc);
2539 }
2540 */
2541
2542 input->filename = NULL;
2543 input->line = 1;
2544 input->col = 1;
2545 input->base = cur;
2546 input->cur = cur;
2547 input->free = NULL;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002548 input->buf = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002549
2550 inputPush(ctxt, input);
2551 return(ctxt);
2552}
2553
2554/********************************************************************************
2555 * *
2556 * User entry points *
2557 * *
2558 ********************************************************************************/
2559
2560/**
2561 * htmlSAXParseDoc :
2562 * @cur: a pointer to an array of CHAR
2563 * @encoding: a free form C string describing the HTML document encoding, or NULL
2564 * @sax: the SAX handler block
2565 * @userData: if using SAX, this pointer will be provided on callbacks.
2566 *
2567 * parse an HTML in-memory document and build a tree.
2568 * It use the given SAX function block to handle the parsing callback.
2569 * If sax is NULL, fallback to the default DOM tree building routines.
2570 *
2571 * Returns the resulting document tree
2572 */
2573
2574htmlDocPtr
2575htmlSAXParseDoc(CHAR *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
2576 htmlDocPtr ret;
2577 htmlParserCtxtPtr ctxt;
2578
2579 if (cur == NULL) return(NULL);
2580
2581
2582 ctxt = htmlCreateDocParserCtxt(cur, encoding);
2583 if (ctxt == NULL) return(NULL);
2584 if (sax != NULL) {
2585 ctxt->sax = sax;
2586 ctxt->userData = userData;
2587 }
2588
2589 htmlParseDocument(ctxt);
2590 ret = ctxt->myDoc;
2591 if (sax != NULL) {
2592 ctxt->sax = NULL;
2593 ctxt->userData = NULL;
2594 }
2595 htmlFreeParserCtxt(ctxt);
2596
2597 return(ret);
2598}
2599
2600/**
2601 * htmlParseDoc :
2602 * @cur: a pointer to an array of CHAR
2603 * @encoding: a free form C string describing the HTML document encoding, or NULL
2604 *
2605 * parse an HTML in-memory document and build a tree.
2606 *
2607 * Returns the resulting document tree
2608 */
2609
2610htmlDocPtr
2611htmlParseDoc(CHAR *cur, const char *encoding) {
2612 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
2613}
2614
2615
2616/**
2617 * htmlCreateFileParserCtxt :
2618 * @filename: the filename
2619 * @encoding: a free form C string describing the HTML document encoding, or NULL
2620 *
2621 * Create a parser context for a file content.
2622 * Automatic support for ZLIB/Compress compressed document is provided
2623 * by default if found at compile-time.
2624 *
2625 * Returns the new parser context or NULL
2626 */
2627htmlParserCtxtPtr
2628htmlCreateFileParserCtxt(const char *filename, const char *encoding)
2629{
2630 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002631 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002632 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002633 /* htmlCharEncoding enc; */
2634
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002635 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
2636 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002637
Daniel Veillard6454aec1999-09-02 22:04:43 +00002638 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002639 if (ctxt == NULL) {
2640 perror("malloc");
2641 return(NULL);
2642 }
2643 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002644 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002645 if (inputStream == NULL) {
2646 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002647 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002648 return(NULL);
2649 }
2650
Daniel Veillard6454aec1999-09-02 22:04:43 +00002651 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002652 inputStream->line = 1;
2653 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002654 inputStream->buf = buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002655
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002656 inputStream->base = inputStream->buf->buffer->content;
2657 inputStream->cur = inputStream->buf->buffer->content;
2658 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002659
2660 inputPush(ctxt, inputStream);
2661 return(ctxt);
2662}
2663
2664/**
2665 * htmlSAXParseFile :
2666 * @filename: the filename
2667 * @encoding: a free form C string describing the HTML document encoding, or NULL
2668 * @sax: the SAX handler block
2669 * @userData: if using SAX, this pointer will be provided on callbacks.
2670 *
2671 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2672 * compressed document is provided by default if found at compile-time.
2673 * It use the given SAX function block to handle the parsing callback.
2674 * If sax is NULL, fallback to the default DOM tree building routines.
2675 *
2676 * Returns the resulting document tree
2677 */
2678
2679htmlDocPtr
2680htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
2681 void *userData) {
2682 htmlDocPtr ret;
2683 htmlParserCtxtPtr ctxt;
2684
2685 ctxt = htmlCreateFileParserCtxt(filename, encoding);
2686 if (ctxt == NULL) return(NULL);
2687 if (sax != NULL) {
2688 ctxt->sax = sax;
2689 ctxt->userData = userData;
2690 }
2691
2692 htmlParseDocument(ctxt);
2693
2694 ret = ctxt->myDoc;
2695 if (sax != NULL) {
2696 ctxt->sax = NULL;
2697 ctxt->userData = NULL;
2698 }
2699 htmlFreeParserCtxt(ctxt);
2700
2701 return(ret);
2702}
2703
2704/**
2705 * htmlParseFile :
2706 * @filename: the filename
2707 * @encoding: a free form C string describing the HTML document encoding, or NULL
2708 *
2709 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2710 * compressed document is provided by default if found at compile-time.
2711 *
2712 * Returns the resulting document tree
2713 */
2714
2715htmlDocPtr
2716htmlParseFile(const char *filename, const char *encoding) {
2717 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
2718}