blob: ee7b0f70f8edde6e297db0afe73c1ba10a1561c1 [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#define HAVE_FCNTL_H
11#include <io.h>
12#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000013#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000014#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000015
Daniel Veillardbe70ff71999-07-05 16:50:46 +000016#include <stdio.h>
Daniel Veillardbe70ff71999-07-05 16:50:46 +000017#include <string.h> /* for memset() only */
Daniel Veillard7f7d1111999-09-22 09:46:25 +000018#ifdef HAVE_CTYPE_H
19#include <ctype.h>
20#endif
21#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000022#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000023#endif
24#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000025#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000026#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000027#ifdef HAVE_FCNTL_H
28#include <fcntl.h>
29#endif
30#ifdef HAVE_UNISTD_H
31#include <unistd.h>
32#endif
33#ifdef HAVE_ZLIB_H
34#include <zlib.h>
35#endif
36
Daniel Veillard6454aec1999-09-02 22:04:43 +000037#include "xmlmemory.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000038#include "tree.h"
39#include "HTMLparser.h"
40#include "entities.h"
41#include "encoding.h"
42#include "valid.h"
43#include "parserInternals.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000044#include "xmlIO.h"
45
46#define HTML_MAX_NAMELEN 1000
47#define INPUT_CHUNK 50
Daniel Veillardbe70ff71999-07-05 16:50:46 +000048
Daniel Veillard82150d81999-07-07 07:32:15 +000049/* #define DEBUG */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000050
51/************************************************************************
52 * *
53 * Parser stacks related functions and macros *
54 * *
55 ************************************************************************/
56
57/*
58 * Generic function for accessing stacks in the Parser Context
59 */
60
61#define PUSH_AND_POP(type, name) \
62int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
63 if (ctxt->name##Nr >= ctxt->name##Max) { \
64 ctxt->name##Max *= 2; \
Daniel Veillard6454aec1999-09-02 22:04:43 +000065 ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000066 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
67 if (ctxt->name##Tab == NULL) { \
68 fprintf(stderr, "realloc failed !\n"); \
69 exit(1); \
70 } \
71 } \
72 ctxt->name##Tab[ctxt->name##Nr] = value; \
73 ctxt->name = value; \
74 return(ctxt->name##Nr++); \
75} \
76type html##name##Pop(htmlParserCtxtPtr ctxt) { \
77 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000078 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000079 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000080 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000081 if (ctxt->name##Nr > 0) \
82 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
83 else \
84 ctxt->name = NULL; \
85 ret = ctxt->name##Tab[ctxt->name##Nr]; \
86 ctxt->name##Tab[ctxt->name##Nr] = 0; \
87 return(ret); \
88} \
89
90PUSH_AND_POP(xmlNodePtr, node)
Daniel Veillard2673d3c1999-10-08 14:37:09 +000091PUSH_AND_POP(xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000092
93/*
94 * Macros for accessing the content. Those should be used only by the parser,
95 * and not exported.
96 *
97 * Dirty macros, i.e. one need to make assumption on the context to use them
98 *
Daniel Veillarddd6b3671999-09-23 22:19:22 +000099 * CUR_PTR return the current pointer to the xmlChar to be parsed.
100 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000101 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
102 * in UNICODE mode. This should be used internally by the parser
103 * only to compare to ASCII values otherwise it would break when
104 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000107 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000108 * it should be used only to compare on ASCII based substring.
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000109 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000110 * strings within the parser.
111 *
112 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
113 *
114 * CURRENT Returns the current char value, with the full decoding of
115 * UTF-8 if we are using this mode. It returns an int.
116 * NEXT Skip to the next character, this does the proper decoding
117 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000118 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
119 */
120
/*
 * "Dirty" accessors: they all expect a variable named ctxt to be in scope
 * and work directly on the current input buffer.
 */
#define CUR_PTR ctxt->input->cur
#define CUR (*ctxt->input->cur)
#define NXT(val) ctxt->input->cur[(val)]
#define UPPER (toupper(*ctxt->input->cur))
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
#define SKIP(val) ctxt->input->cur += (val)

/* Input buffer management. */
#define SHRINK xmlParserInputShrink(ctxt->input)
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Advance over blanks; relies on NEXT expanding to a complete statement. */
#define SKIP_BLANKS 							\
    while (IS_BLANK(*(ctxt->input->cur))) NEXT

#ifndef USE_UTF_8
#define CURRENT (*ctxt->input->cur)

/*
 * NEXT: move to the next character.
 *
 * If the current character is 0 and the input cannot be grown, the current
 * input is popped. Otherwise the line/column counters are updated, the
 * cursor is advanced by one and, when the new current character is 0, the
 * input is asked to grow.
 *
 * NEXT expands to a brace-enclosed block, not to an expression.
 */
#define NEXT {								\
    if ((*ctxt->input->cur == 0) &&					\
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {		\
        xmlPopInput(ctxt);						\
    } else {								\
        if (*(ctxt->input->cur) == '\n') {				\
            ctxt->input->line++; ctxt->input->col = 1;			\
        } else ctxt->input->col++;					\
        ctxt->input->cur++;						\
        if (*ctxt->input->cur == 0)					\
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		\
    }}

#else
/* No CURRENT/NEXT definitions are provided when USE_UTF_8 is defined. */
#endif
164
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000165
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000166
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000167/************************************************************************
168 * *
169 * The list of HTML elements and their properties *
170 * *
171 ************************************************************************/
172
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000173/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000174 * Start Tag: 1 means the start tag can be ommited
175 * End Tag: 1 means the end tag can be ommited
176 * 2 means it's forbidden (empty elements)
177 * Depr: this element is deprecated
178 * DTD: 1 means that this element is valid only in the Loose DTD
179 * 2 means that this element is valid only in the Frameset DTD
180 *
181 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000182 */
183htmlElemDesc html40ElementTable[] = {
184{ "A", 0, 0, 0, 0, 0, "anchor " },
185{ "ABBR", 0, 0, 0, 0, 0, "abbreviated form" },
186{ "ACRONYM", 0, 0, 0, 0, 0, "" },
187{ "ADDRESS", 0, 0, 0, 0, 0, "information on author " },
188{ "APPLET", 0, 0, 0, 1, 1, "Java applet " },
189{ "AREA", 0, 2, 1, 0, 0, "client-side image map area " },
190{ "B", 0, 0, 0, 0, 0, "bold text style" },
191{ "BASE", 0, 2, 1, 0, 0, "document base URI " },
192{ "BASEFONT", 0, 2, 1, 1, 1, "base font size " },
193{ "BDO", 0, 0, 0, 0, 0, "I18N BiDi over-ride " },
194{ "BIG", 0, 0, 0, 0, 0, "large text style" },
195{ "BLOCKQUOTE", 0, 0, 0, 0, 0, "long quotation " },
196{ "BODY", 1, 1, 0, 0, 0, "document body " },
197{ "BR", 0, 2, 1, 0, 0, "forced line break " },
198{ "BUTTON", 0, 0, 0, 0, 0, "push button " },
199{ "CAPTION", 0, 0, 0, 0, 0, "table caption " },
200{ "CENTER", 0, 0, 0, 1, 1, "shorthand for DIV align=center " },
201{ "CITE", 0, 0, 0, 0, 0, "citation" },
202{ "CODE", 0, 0, 0, 0, 0, "computer code fragment" },
203{ "COL", 0, 2, 1, 0, 0, "table column " },
204{ "COLGROUP", 0, 1, 0, 0, 0, "table column group " },
205{ "DD", 0, 1, 0, 0, 0, "definition description " },
206{ "DEL", 0, 0, 0, 0, 0, "deleted text " },
207{ "DFN", 0, 0, 0, 0, 0, "instance definition" },
208{ "DIR", 0, 0, 0, 1, 1, "directory list" },
209{ "DIV", 0, 0, 0, 0, 0, "generic language/style container"},
210{ "DL", 0, 0, 0, 0, 0, "definition list " },
211{ "DT", 0, 1, 0, 0, 0, "definition term " },
212{ "EM", 0, 0, 0, 0, 0, "emphasis" },
213{ "FIELDSET", 0, 0, 0, 0, 0, "form control group " },
214{ "FONT", 0, 0, 0, 1, 1, "local change to font " },
215{ "FORM", 0, 0, 0, 0, 0, "interactive form " },
216{ "FRAME", 0, 2, 1, 0, 2, "subwindow " },
217{ "FRAMESET", 0, 0, 0, 0, 2, "window subdivision" },
218{ "H1", 0, 0, 0, 0, 0, "heading " },
219{ "H2", 0, 0, 0, 0, 0, "heading " },
220{ "H3", 0, 0, 0, 0, 0, "heading " },
221{ "H4", 0, 0, 0, 0, 0, "heading " },
222{ "H5", 0, 0, 0, 0, 0, "heading " },
223{ "H6", 0, 0, 0, 0, 0, "heading " },
224{ "HEAD", 1, 1, 0, 0, 0, "document head " },
225{ "HR", 0, 2, 1, 0, 0, "horizontal rule " },
226{ "HTML", 1, 1, 0, 0, 0, "document root element " },
227{ "I", 0, 0, 0, 0, 0, "italic text style" },
228{ "IFRAME", 0, 0, 0, 0, 1, "inline subwindow " },
229{ "IMG", 0, 2, 1, 0, 0, "Embedded image " },
230{ "INPUT", 0, 2, 1, 0, 0, "form control " },
231{ "INS", 0, 0, 0, 0, 0, "inserted text" },
232{ "ISINDEX", 0, 2, 1, 1, 1, "single line prompt " },
233{ "KBD", 0, 0, 0, 0, 0, "text to be entered by the user" },
234{ "LABEL", 0, 0, 0, 0, 0, "form field label text " },
235{ "LEGEND", 0, 0, 0, 0, 0, "fieldset legend " },
236{ "LI", 0, 1, 0, 0, 0, "list item " },
237{ "LINK", 0, 2, 1, 0, 0, "a media-independent link " },
238{ "MAP", 0, 0, 0, 0, 0, "client-side image map " },
239{ "MENU", 0, 0, 0, 1, 1, "menu list " },
240{ "META", 0, 2, 1, 0, 0, "generic metainformation " },
241{ "NOFRAMES", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
242{ "NOSCRIPT", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
243{ "OBJECT", 0, 0, 0, 0, 0, "generic embedded object " },
244{ "OL", 0, 0, 0, 0, 0, "ordered list " },
245{ "OPTGROUP", 0, 0, 0, 0, 0, "option group " },
246{ "OPTION", 0, 1, 0, 0, 0, "selectable choice " },
247{ "P", 0, 1, 0, 0, 0, "paragraph " },
248{ "PARAM", 0, 2, 1, 0, 0, "named property value " },
249{ "PRE", 0, 0, 0, 0, 0, "preformatted text " },
250{ "Q", 0, 0, 0, 0, 0, "short inline quotation " },
251{ "S", 0, 0, 0, 1, 1, "strike-through text style" },
252{ "SAMP", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
253{ "SCRIPT", 0, 0, 0, 0, 0, "script statements " },
254{ "SELECT", 0, 0, 0, 0, 0, "option selector " },
255{ "SMALL", 0, 0, 0, 0, 0, "small text style" },
256{ "SPAN", 0, 0, 0, 0, 0, "generic language/style container " },
257{ "STRIKE", 0, 0, 0, 1, 1, "strike-through text" },
258{ "STRONG", 0, 0, 0, 0, 0, "strong emphasis" },
259{ "STYLE", 0, 0, 0, 0, 0, "style info " },
260{ "SUB", 0, 0, 0, 0, 0, "subscript" },
261{ "SUP", 0, 0, 0, 0, 0, "superscript " },
262{ "TABLE", 0, 0, 0, 0, 0, "&#160;" },
263{ "TBODY", 1, 1, 0, 0, 0, "table body " },
264{ "TD", 0, 1, 0, 0, 0, "table data cell" },
265{ "TEXTAREA", 0, 0, 0, 0, 0, "multi-line text field " },
266{ "TFOOT", 0, 1, 0, 0, 0, "table footer " },
267{ "TH", 0, 1, 0, 0, 0, "table header cell" },
268{ "THEAD", 0, 1, 0, 0, 0, "table header " },
269{ "TITLE", 0, 0, 0, 0, 0, "document title " },
270{ "TR", 0, 1, 0, 0, 0, "table row " },
271{ "TT", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
272{ "U", 0, 0, 0, 1, 1, "underlined text style" },
273{ "UL", 0, 0, 0, 0, 0, "unordered list " },
274{ "VAR", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
275};
276
/*
 * Start tags that imply the end of a current element.
 *
 * The array is a sequence of NULL-terminated groups, itself terminated by
 * an extra NULL. Any tag of a group implies the end of the current element
 * if the type of that element is in the same group.
 */
char *htmlEquEnd[] = {
    /* list like items */
    "DT", "DD", "LI", "OPTION",
    NULL,
    /* headings */
    "H1", "H2", "H3", "H4", "H5", "H6",
    NULL,
    /* block level containers */
    "OL", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP",
    NULL,
    /* end of the table */
    NULL
};
/*
 * According to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
293
/*
 * Start tags that imply the end of current element.
 *
 * The array is a sequence of NULL-terminated groups, itself terminated by
 * an extra NULL. In each group the first string is the name of a start
 * tag and the following strings are the names of the elements which are
 * closed when that start tag is found.
 */
char *htmlStartClose[] = {
"FORM",		"FORM", "P", "HR", "H1", "H2", "H3", "H4", "H5", "H6",
		"DL", "UL", "OL", "MENU", "DIR", "ADDRESS", "PRE",
		"LISTING", "XMP", "HEAD", NULL,
"HEAD",		"P", NULL,
"TITLE",	"P", NULL,
"BODY",		"HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
"LI",		"P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
		"PRE", "LISTING", "XMP", "HEAD", "LI", NULL,
"HR",		"P", "HEAD", NULL,
"H1",		"P", "HEAD", NULL,
"H2",		"P", "HEAD", NULL,
"H3",		"P", "HEAD", NULL,
"H4",		"P", "HEAD", NULL,
"H5",		"P", "HEAD", NULL,
"H6",		"P", "HEAD", NULL,
"DIR",		"P", "HEAD", NULL,
"ADDRESS",	"P", "HEAD", "UL", NULL,
"PRE",		"P", "HEAD", "UL", NULL,
"LISTING",	"P", "HEAD", NULL,
"XMP",		"P", "HEAD", NULL,
"BLOCKQUOTE",	"P", "HEAD", NULL,
"DL",		"P", "DT", "MENU", "DIR", "ADDRESS", "PRE", "LISTING",
		"XMP", "HEAD", NULL,
"DT",		"P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP",
		"HEAD", "DD", NULL,
"DD",		"P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP",
		"HEAD", "DT", NULL,
"UL",		"P", "HEAD", "OL", "MENU", "DIR", "ADDRESS", "PRE",
		"LISTING", "XMP", NULL,
"OL",		"P", "HEAD", "UL", NULL,
"MENU",		"P", "HEAD", "UL", NULL,
"P",		"P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", NULL,
"DIV",		"P", "HEAD", NULL,
"NOSCRIPT",	"P", "HEAD", NULL,
"CENTER",	"FONT", "B", "I", "P", "HEAD", NULL,
"A",		"A", NULL,
"CAPTION",	"P", NULL,
"COLGROUP",	"CAPTION", "COLGROUP", "COL", "P", NULL,
"COL",		"CAPTION", "COL", "P", NULL,
"TABLE",	"P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", "PRE",
		"LISTING", "XMP", "A", NULL,
"TH",		"TH", "TD", NULL,
"TD",		"TH", "TD", "P", NULL,
"TR",		"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "P", NULL,
"THEAD",	"CAPTION", "COL", "COLGROUP", NULL,
"TFOOT",	"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
		"TBODY", "P", NULL,
"TBODY",	"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
		"TFOOT", "TBODY", "P", NULL,
"OPTGROUP",	"OPTION", NULL,
"FIELDSET",	"LEGEND", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6",
		"PRE", "LISTING", "XMP", "A", NULL,
NULL
};

/*
 * htmlStartCloseIndex[n] points to the first string of the n'th group of
 * htmlStartClose; the used entries are followed by a NULL entry.
 */
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *									*
 * 		functions to handle HTML specific data			*
 *									*
 ************************************************************************/

/**
 * htmlInitAutoClose:
 *
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 * Only the first call does the work: once the index is built the
 * initialized flag is raised and later calls return immediately.
 */
void
htmlInitAutoClose(void) {
    const int nslots = (int) (sizeof(htmlStartCloseIndex) /
                              sizeof(htmlStartCloseIndex[0]));
    int slot, i = 0;

    if (htmlStartCloseIndexinitialized) return;

    for (slot = 0;slot < nslots;slot++) htmlStartCloseIndex[slot] = NULL;

    /*
     * Record the start of each group; the last slot is never used so
     * that the index always ends with a NULL entry.
     */
    slot = 0;
    while ((htmlStartClose[i] != NULL) && (slot < nslots - 1)) {
        htmlStartCloseIndex[slot++] = &htmlStartClose[i];
        /* skip the rest of the group and its terminating NULL */
        while (htmlStartClose[i] != NULL) i++;
        i++;
    }

    /* without this the index would be rebuilt on every call */
    htmlStartCloseIndexinitialized = 1;
}
382
383/**
384 * htmlTagLookup:
385 * @tag: The tag name
386 *
387 * Lookup the HTML tag in the ElementTable
388 *
389 * Returns the related htmlElemDescPtr or NULL if not found.
390 */
391htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000392htmlTagLookup(const xmlChar *tag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000393 int i = 0;
394
395 for (i = 0; i < (sizeof(html40ElementTable) /
396 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000397 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000398 return(&html40ElementTable[i]);
399 }
400 return(NULL);
401}
402
403/**
404 * htmlCheckAutoClose:
405 * @new: The new tag name
406 * @old: The old tag name
407 *
408 * Checks wether the new tag is one of the registered valid tags for closing old.
409 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
410 *
411 * Returns 0 if no, 1 if yes.
412 */
413int
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000414htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000415 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000416 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000417
418 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
419
420 /* inefficient, but not a big deal */
421 for (index = 0; index < 100;index++) {
422 close = htmlStartCloseIndex[index];
423 if (close == NULL) return(0);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000424 if (!xmlStrcmp(BAD_CAST *close, new)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000425 }
426
427 i = close - htmlStartClose;
428 i++;
429 while (htmlStartClose[i] != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000430 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000431 return(1);
432 }
433 i++;
434 }
435 return(0);
436}
437
438/**
439 * htmlAutoClose:
440 * @ctxt: an HTML parser context
441 * @new: The new tag name
442 *
443 * The HTmL DtD allows a tag to implicitely close other tags.
444 * The list is kept in htmlStartClose array. This function is
445 * called when a new tag has been detected and generates the
446 * appropriates closes if possible/needed.
447 */
448void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000449htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000450 xmlChar *oldname;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000451 while ((ctxt->name != NULL) &&
452 (htmlCheckAutoClose(new, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000453#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000454 fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000455#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000456 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000457 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000458 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000459 if (oldname != NULL) {
460#ifdef DEBUG
461 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
462#endif
463 xmlFree(oldname);
464 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000465 }
466}
467
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000468/**
469 * htmlAutoCloseOnClose:
470 * @ctxt: an HTML parser context
471 * @new: The new tag name
472 *
473 * The HTmL DtD allows an ending tag to implicitely close other tags.
474 */
475void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000476htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000477 htmlElemDescPtr info;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000478 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000479 int i;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000480
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000481#ifdef DEBUG
482 fprintf(stderr,"Close of %s stack: %d elements\n", new, ctxt->nameNr);
483 for (i = 0;i < ctxt->nameNr;i++)
484 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
485#endif
486
487 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
488 if (!xmlStrcmp(new, ctxt->nameTab[i])) break;
489 }
490 if (i < 0) return;
491
492 while (xmlStrcmp(new, ctxt->name)) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000493 info = htmlTagLookup(ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000494 if ((info == NULL) || (info->endTag == 1)) {
495#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000496 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000497#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000498 } else {
499 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
500 ctxt->sax->error(ctxt->userData,
501 "Opening and ending tag mismatch: %s and %s\n",
502 new, ctxt->name);
503 ctxt->wellFormed = 0;
504 }
505 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
506 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000507 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000508 if (oldname != NULL) {
509#ifdef DEBUG
510 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
511#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000512 xmlFree(oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000513 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000514 }
515}
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000516
517/************************************************************************
518 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000519 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000520 * *
521 ************************************************************************/
522
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000523
524htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000525/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000526 * the 4 absolute ones,
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000527 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000528{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
529{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard1566d3a1999-07-15 14:24:29 +0000530{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000531{ 60, "lt", "less-than sign, U+003C ISOnum" },
532{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000533
534/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000535 * A bunch still in the 128-255 range
536 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000537 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000538{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
539{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
540{ 162, "cent", "cent sign, U+00A2 ISOnum" },
541{ 163, "pound","pound sign, U+00A3 ISOnum" },
542{ 164, "curren","currency sign, U+00A4 ISOnum" },
543{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
544{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
545{ 167, "sect", "section sign, U+00A7 ISOnum" },
546{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
547{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
548{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
549{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
550{ 172, "not", "not sign, U+00AC ISOnum" },
551{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
552{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
553{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
554{ 176, "deg", "degree sign, U+00B0 ISOnum" },
555{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
556{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
557{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
558{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
559{ 181, "micro","micro sign, U+00B5 ISOnum" },
560{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000561{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000562{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
563{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
564{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000565{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000566{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
567{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
568{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
569{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
570{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
571{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
572{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
573{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
574{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
575{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
576{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
577{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
578{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
579{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
580{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
581{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
582{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
583{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
584{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
585{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
586{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
587{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
588{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
589{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
590{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
591{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
592{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
593{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000594{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000595{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
596{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
597{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
598{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
599{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
600{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
601{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
602{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
603{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
604{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
605{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
606{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
607{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
608{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
609{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
610{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
611{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
612{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
613{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
614{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
615{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
616{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
617{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
618{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
619{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
620{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
621{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
622{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
623{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
624{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
625{ 247, "divide","division sign, U+00F7 ISOnum" },
626{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
627{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
628{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
629{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
630{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
631{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
632{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
633{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000634
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000635/*
636 * Anything below should really be kept as entities references
637 */
638{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000639
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000640{ 913, "Alpha","greek capital letter alpha, U+0391" },
641{ 914, "Beta", "greek capital letter beta, U+0392" },
642{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
643{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
644{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
645{ 918, "Zeta", "greek capital letter zeta, U+0396" },
646{ 919, "Eta", "greek capital letter eta, U+0397" },
647{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
648{ 921, "Iota", "greek capital letter iota, U+0399" },
649{ 922, "Kappa","greek capital letter kappa, U+039A" },
650{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
651{ 924, "Mu", "greek capital letter mu, U+039C" },
652{ 925, "Nu", "greek capital letter nu, U+039D" },
653{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
654{ 927, "Omicron","greek capital letter omicron, U+039F" },
655{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
656{ 929, "Rho", "greek capital letter rho, U+03A1" },
657{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
658{ 932, "Tau", "greek capital letter tau, U+03A4" },
659{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
660{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
661{ 935, "Chi", "greek capital letter chi, U+03A7" },
662{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
663{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000664
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000665{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
666{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
667{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
668{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
669{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
670{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
671{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
672{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
673{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
674{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
675{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
676{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
677{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
678{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
679{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
680{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
681{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
682{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
683{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
684{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
685{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
686{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
687{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
688{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
689{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
690{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
691{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
692{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000693
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000694{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
695{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
696{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
697{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
698{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
699{ 8260, "frasl","fraction slash, U+2044 NEW" },
700
Daniel Veillardb05deb71999-08-10 19:04:08 +0000701{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000702{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
703{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
704{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
705{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
706{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
707{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
708{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
709{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
710{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
711{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
712{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
713{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
714{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
715{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
716{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
717
718
719{ 8704, "forall","for all, U+2200 ISOtech" },
720{ 8706, "part", "partial differential, U+2202 ISOtech" },
721{ 8707, "exist","there exists, U+2203 ISOtech" },
722{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
723{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
724{ 8712, "isin", "element of, U+2208 ISOtech" },
725{ 8713, "notin","not an element of, U+2209 ISOtech" },
726{ 8715, "ni", "contains as member, U+220B ISOtech" },
727{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
728{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
729{ 8722, "minus","minus sign, U+2212 ISOtech" },
730{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
731{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
732{ 8733, "prop", "proportional to, U+221D ISOtech" },
733{ 8734, "infin","infinity, U+221E ISOtech" },
734{ 8736, "ang", "angle, U+2220 ISOamso" },
735{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
736{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
737{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
738{ 8746, "cup", "union = cup, U+222A ISOtech" },
739{ 8747, "int", "integral, U+222B ISOtech" },
740{ 8756, "there4","therefore, U+2234 ISOtech" },
741{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
742{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
743{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
744{ 8800, "ne", "not equal to, U+2260 ISOtech" },
745{ 8801, "equiv","identical to, U+2261 ISOtech" },
746{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
747{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
748{ 8834, "sub", "subset of, U+2282 ISOtech" },
749{ 8835, "sup", "superset of, U+2283 ISOtech" },
750{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
751{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
752{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
753{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
754{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
755{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
756{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
757{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
758{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
759{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
760{ 8971, "rfloor","right floor, U+230B ISOamsc" },
761{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
762{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
763{ 9674, "loz", "lozenge, U+25CA ISOpub" },
764
765{ 9824, "spades","black spade suit, U+2660 ISOpub" },
766{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
767{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
768{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
769
770{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
771{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
772{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
773{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
774{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
775{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
776{ 732, "tilde","small tilde, U+02DC ISOdia" },
777
778{ 8194, "ensp", "en space, U+2002 ISOpub" },
779{ 8195, "emsp", "em space, U+2003 ISOpub" },
780{ 8201, "thinsp","thin space, U+2009 ISOpub" },
781{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
782{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
783{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
784{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
785{ 8211, "ndash","en dash, U+2013 ISOpub" },
786{ 8212, "mdash","em dash, U+2014 ISOpub" },
787{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
788{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
789{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
790{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
791{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
792{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
793{ 8224, "dagger","dagger, U+2020 ISOpub" },
794{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
795{ 8240, "permil","per mille sign, U+2030 ISOtech" },
796{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000797{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000798{ 8364, "euro", "euro sign, U+20AC NEW" }
799};
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000800
801/************************************************************************
802 * *
803 * Commodity functions to handle entities *
804 * *
805 ************************************************************************/
806
/*
 * growBuffer:
 * @buffer:  name of an xmlChar * variable; an integer variable called
 *           <buffer>_size must also be in scope (it is reached through
 *           token pasting)
 *
 * Macro used to grow the current buffer: doubles <buffer>_size and
 * reallocates @buffer to that size with xmlRealloc().
 * If the reallocation fails the macro reports it with perror() and
 * terminates the process with exit(1).
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can safely be used in an unbraced if/else arm.
 * Any pointer into the old buffer must be recomputed by the caller after
 * the macro, since the buffer variable is reassigned.
 */
#define growBuffer(buffer) do {                                         \
    buffer##_size *= 2;                                                 \
    buffer = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (buffer == NULL) {                                               \
        perror("realloc failed");                                       \
        exit(1);                                                        \
    }                                                                   \
} while (0)
818
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000819/**
820 * htmlEntityLookup:
821 * @name: the entity name
822 *
823 * Lookup the given entity in EntitiesTable
824 *
825 * TODO: the linear scan is really ugly, an hash table is really needed.
826 *
827 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
828 */
829htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000830htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000831 int i;
832
833 for (i = 0;i < (sizeof(html40EntitiesTable)/
834 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000835 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000836#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000837 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000838#endif
839 return(&html40EntitiesTable[i]);
840 }
841 }
842 return(NULL);
843}
844
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000845
846/**
847 * htmlDecodeEntities:
848 * @ctxt: the parser context
849 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000850 * @end: an end marker xmlChar, 0 if none
851 * @end2: an end marker xmlChar, 0 if none
852 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000853 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000854 * Subtitute the HTML entities by their value
855 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000856 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000857 *
858 * Returns A newly allocated string with the substitution done. The caller
859 * must deallocate it !
860 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000861xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000862htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000863 xmlChar end, xmlChar end2, xmlChar end3) {
864 xmlChar *buffer = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000865 int buffer_size = 0;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000866 xmlChar *out = NULL;
867 xmlChar *name = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000868
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000869 xmlChar *cur = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000870 htmlEntityDescPtr ent;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000871 int nbchars = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000872 unsigned int max = (unsigned int) len;
873
874 /*
875 * allocate a translation buffer.
876 */
877 buffer_size = 1000;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000878 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000879 if (buffer == NULL) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000880 perror("htmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000881 return(NULL);
882 }
883 out = buffer;
884
885 /*
886 * Ok loop until we reach one of the ending char or a size limit.
887 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000888 while ((nbchars < max) && (CUR != end) &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000889 (CUR != end2) && (CUR != end3)) {
890
891 if (CUR == '&') {
892 if (NXT(1) == '#') {
893 int val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000894 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000895 *out++ = val;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000896 nbchars += 3; /* !!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000897 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000898 ent = htmlParseEntityRef(ctxt, &name);
899 if (name != NULL) {
900 if ((ent == NULL) || (ent->value <= 0) ||
901 (ent->value >= 255)) {
902 *out++ = '&';
903 cur = name;
904 while (*cur != 0) {
905 if (out - buffer > buffer_size - 100) {
906 int index = out - buffer;
907
908 growBuffer(buffer);
909 out = &buffer[index];
910 }
911 *out++ = *cur++;
912 }
913 *out++ = ';';
914 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000915 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000916 *out++ = (xmlChar)ent->value;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000917 if (out - buffer > buffer_size - 100) {
918 int index = out - buffer;
919
920 growBuffer(buffer);
921 out = &buffer[index];
922 }
923 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000924 nbchars += 2 + xmlStrlen(name);
Daniel Veillard6454aec1999-09-02 22:04:43 +0000925 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000926 }
927 }
928 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000929 /* invalid for UTF-8 , use COPY(out); !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000930 *out++ = CUR;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000931 nbchars++;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000932 if (out - buffer > buffer_size - 100) {
933 int index = out - buffer;
934
935 growBuffer(buffer);
936 out = &buffer[index];
937 }
938 NEXT;
939 }
940 }
941 *out++ = 0;
942 return(buffer);
943}
944
945
946/************************************************************************
947 * *
948 * Commodity functions to handle encodings *
949 * *
950 ************************************************************************/
951
952/**
953 * htmlSwitchEncoding:
954 * @ctxt: the parser context
955 * @len: the len of @cur
956 *
957 * change the input functions when discovering the character encoding
958 * of a given entity.
959 *
960 */
961void
962htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
963{
964 switch (enc) {
965 case XML_CHAR_ENCODING_ERROR:
966 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
967 ctxt->sax->error(ctxt->userData, "encoding unknown\n");
968 ctxt->wellFormed = 0;
969 break;
970 case XML_CHAR_ENCODING_NONE:
971 /* let's assume it's UTF-8 without the XML decl */
972 return;
973 case XML_CHAR_ENCODING_UTF8:
974 /* default encoding, no conversion should be needed */
975 return;
976 case XML_CHAR_ENCODING_UTF16LE:
977 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
978 ctxt->sax->error(ctxt->userData,
979 "char encoding UTF16 little endian not supported\n");
980 break;
981 case XML_CHAR_ENCODING_UTF16BE:
982 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
983 ctxt->sax->error(ctxt->userData,
984 "char encoding UTF16 big endian not supported\n");
985 break;
986 case XML_CHAR_ENCODING_UCS4LE:
987 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
988 ctxt->sax->error(ctxt->userData,
989 "char encoding USC4 little endian not supported\n");
990 break;
991 case XML_CHAR_ENCODING_UCS4BE:
992 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
993 ctxt->sax->error(ctxt->userData,
994 "char encoding USC4 big endian not supported\n");
995 break;
996 case XML_CHAR_ENCODING_EBCDIC:
997 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
998 ctxt->sax->error(ctxt->userData,
999 "char encoding EBCDIC not supported\n");
1000 break;
1001 case XML_CHAR_ENCODING_UCS4_2143:
1002 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1003 ctxt->sax->error(ctxt->userData,
1004 "char encoding UCS4 2143 not supported\n");
1005 break;
1006 case XML_CHAR_ENCODING_UCS4_3412:
1007 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1008 ctxt->sax->error(ctxt->userData,
1009 "char encoding UCS4 3412 not supported\n");
1010 break;
1011 case XML_CHAR_ENCODING_UCS2:
1012 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1013 ctxt->sax->error(ctxt->userData,
1014 "char encoding UCS2 not supported\n");
1015 break;
1016 case XML_CHAR_ENCODING_8859_1:
1017 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1018 ctxt->sax->error(ctxt->userData,
1019 "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
1020 break;
1021 case XML_CHAR_ENCODING_8859_2:
1022 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1023 ctxt->sax->error(ctxt->userData,
1024 "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
1025 break;
1026 case XML_CHAR_ENCODING_8859_3:
1027 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1028 ctxt->sax->error(ctxt->userData,
1029 "char encoding ISO_8859_3 not supported\n");
1030 break;
1031 case XML_CHAR_ENCODING_8859_4:
1032 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1033 ctxt->sax->error(ctxt->userData,
1034 "char encoding ISO_8859_4 not supported\n");
1035 break;
1036 case XML_CHAR_ENCODING_8859_5:
1037 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1038 ctxt->sax->error(ctxt->userData,
1039 "char encoding ISO_8859_5 not supported\n");
1040 break;
1041 case XML_CHAR_ENCODING_8859_6:
1042 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1043 ctxt->sax->error(ctxt->userData,
1044 "char encoding ISO_8859_6 not supported\n");
1045 break;
1046 case XML_CHAR_ENCODING_8859_7:
1047 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1048 ctxt->sax->error(ctxt->userData,
1049 "char encoding ISO_8859_7 not supported\n");
1050 break;
1051 case XML_CHAR_ENCODING_8859_8:
1052 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1053 ctxt->sax->error(ctxt->userData,
1054 "char encoding ISO_8859_8 not supported\n");
1055 break;
1056 case XML_CHAR_ENCODING_8859_9:
1057 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1058 ctxt->sax->error(ctxt->userData,
1059 "char encoding ISO_8859_9 not supported\n");
1060 break;
1061 case XML_CHAR_ENCODING_2022_JP:
1062 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1063 ctxt->sax->error(ctxt->userData,
1064 "char encoding ISO-2022-JPnot supported\n");
1065 break;
1066 case XML_CHAR_ENCODING_SHIFT_JIS:
1067 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1068 ctxt->sax->error(ctxt->userData,
1069 "char encoding Shift_JISnot supported\n");
1070 break;
1071 case XML_CHAR_ENCODING_EUC_JP:
1072 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1073 ctxt->sax->error(ctxt->userData,
1074 "char encoding EUC-JPnot supported\n");
1075 break;
1076 }
1077}
1078
1079
1080/************************************************************************
1081 * *
1082 * Commodity functions, cleanup needed ? *
1083 * *
1084 ************************************************************************/
1085
1086/**
1087 * areBlanks:
1088 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001089 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001090 * @len: the size of @str
1091 *
1092 * Is this a sequence of blank chars that one can ignore ?
1093 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001094 * Returns 1 if ignorable 0 otherwise.
1095 */
1096
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001097static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001098 int i;
1099 xmlNodePtr lastChild;
1100
1101 for (i = 0;i < len;i++)
1102 if (!(IS_BLANK(str[i]))) return(0);
1103
1104 if (CUR != '<') return(0);
1105 if (ctxt->node == NULL) return(0);
1106 lastChild = xmlGetLastChild(ctxt->node);
1107 if (lastChild == NULL) {
1108 if (ctxt->node->content != NULL) return(0);
1109 } else if (xmlNodeIsText(lastChild))
1110 return(0);
1111 return(1);
1112}
1113
1114/**
1115 * htmlHandleEntity:
1116 * @ctxt: an HTML parser context
1117 * @entity: an XML entity pointer.
1118 *
1119 * Default handling of an HTML entity, call the parser with the
1120 * substitution string
1121 */
1122
1123void
1124htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1125 int len;
1126
1127 if (entity->content == NULL) {
1128 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1129 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1130 entity->name);
1131 ctxt->wellFormed = 0;
1132 return;
1133 }
1134 len = xmlStrlen(entity->content);
1135
1136 /*
1137 * Just handle the content as a set of chars.
1138 */
1139 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1140 ctxt->sax->characters(ctxt->userData, entity->content, len);
1141
1142}
1143
1144/**
1145 * htmlNewDoc:
1146 * @URI: URI for the dtd, or NULL
1147 * @ExternalID: the external ID of the DTD, or NULL
1148 *
1149 * Returns a new document
1150 */
1151htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001152htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001153 xmlDocPtr cur;
1154
1155 /*
1156 * Allocate a new document and fill the fields.
1157 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001158 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001159 if (cur == NULL) {
1160 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1161 return(NULL);
1162 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001163 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001164
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001165 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001166 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001167 cur->intSubset = NULL;
Daniel Veillardb96e6431999-08-29 21:02:19 +00001168 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001169 cur->name = NULL;
1170 cur->root = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001171 cur->extSubset = NULL;
1172 cur->oldNs = NULL;
1173 cur->encoding = NULL;
1174 cur->standalone = 1;
1175 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001176 cur->ids = NULL;
1177 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001178#ifndef XML_WITHOUT_CORBA
1179 cur->_private = NULL;
1180 cur->vepv = NULL;
1181#endif
1182 return(cur);
1183}
1184
1185
1186/************************************************************************
1187 * *
1188 * The parser itself *
1189 * Relates to http://www.w3.org/TR/html40 *
1190 * *
1191 ************************************************************************/
1192
1193/************************************************************************
1194 * *
1195 * The parser itself *
1196 * *
1197 ************************************************************************/
1198
1199/**
1200 * htmlParseHTMLName:
1201 * @ctxt: an HTML parser context
1202 *
1203 * parse an HTML tag or attribute name, note that we convert it to uppercase
1204 * since HTML names are not case-sensitive.
1205 *
1206 * Returns the Tag Name parsed or NULL
1207 */
1208
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001209xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001210htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001211 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001212 int i = 0;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001213 xmlChar loc[100];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001214
1215 if (!IS_LETTER(CUR) && (CUR != '_') &&
1216 (CUR != ':')) return(NULL);
1217
1218 while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
1219 if ((CUR >= 0x61) && (CUR <= 0x7a)) loc[i] = CUR - 0x20;
1220 else loc[i] = CUR;
1221 i++;
1222
1223 NEXT;
1224 }
1225
1226 ret = xmlStrndup(loc, i);
1227
1228 return(ret);
1229}
1230
1231/**
1232 * htmlParseName:
1233 * @ctxt: an HTML parser context
1234 *
1235 * parse an HTML name, this routine is case sensistive.
1236 *
1237 * Returns the Name parsed or NULL
1238 */
1239
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001240xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001241htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001242 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001243 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001244
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001245 GROW;
1246 if (!IS_LETTER(CUR) && (CUR != '_')) {
1247 return(NULL);
1248 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001249
1250 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1251 (CUR == '.') || (CUR == '-') ||
1252 (CUR == '_') || (CUR == ':') ||
1253 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001254 (IS_EXTENDER(CUR))) {
1255 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001256 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001257 if (len >= HTML_MAX_NAMELEN) {
1258 fprintf(stderr,
1259 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1260 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1261 (CUR == '.') || (CUR == '-') ||
1262 (CUR == '_') || (CUR == ':') ||
1263 (IS_COMBINING(CUR)) ||
1264 (IS_EXTENDER(CUR)))
1265 NEXT;
1266 break;
1267 }
1268 }
1269 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001270}
1271
1272/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001273 * htmlParseHTMLAttribute:
1274 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001275 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001276 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001277 * parse an HTML attribute value till the stop (quote), if
1278 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001279 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001280 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001281 */
1282
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001283xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001284htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001285 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001286 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001287
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001288 GROW;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001289 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1290 if ((stop == 0) && (IS_BLANK(CUR))) break;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001291 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001292 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001293 if (len >= HTML_MAX_NAMELEN) {
1294 fprintf(stderr,
1295 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1296 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001297 (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001298 (CUR != '\'') && (CUR != '"'))
1299 NEXT;
1300 break;
1301 }
1302 }
1303 return(xmlStrndup(buf, len));
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001304}
1305
1306/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001307 * htmlParseNmtoken:
1308 * @ctxt: an HTML parser context
1309 *
1310 * parse an HTML Nmtoken.
1311 *
1312 * Returns the Nmtoken parsed or NULL
1313 */
1314
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001315xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001316htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001317 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001318 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001319
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001320 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001321 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1322 (CUR == '.') || (CUR == '-') ||
1323 (CUR == '_') || (CUR == ':') ||
1324 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001325 (IS_EXTENDER(CUR))) {
1326 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001327 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001328 if (len >= HTML_MAX_NAMELEN) {
1329 fprintf(stderr,
1330 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1331 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1332 (CUR == '.') || (CUR == '-') ||
1333 (CUR == '_') || (CUR == ':') ||
1334 (IS_COMBINING(CUR)) ||
1335 (IS_EXTENDER(CUR)))
1336 NEXT;
1337 break;
1338 }
1339 }
1340 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001341}
1342
1343/**
1344 * htmlParseEntityRef:
1345 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001346 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001347 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001348 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001349 *
1350 * [68] EntityRef ::= '&' Name ';'
1351 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001352 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1353 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001354 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001355htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001356htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1357 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001358 htmlEntityDescPtr ent = NULL;
1359 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001360
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001361 if (CUR == '&') {
1362 NEXT;
1363 name = htmlParseName(ctxt);
1364 if (name == NULL) {
1365 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1366 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1367 ctxt->wellFormed = 0;
1368 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001369 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001370 if (CUR == ';') {
1371 NEXT;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001372 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001373
1374 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001375 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001376 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001377 ent = htmlEntityLookup(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001378 } else {
1379 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1380 ctxt->sax->error(ctxt->userData,
1381 "htmlParseEntityRef: expecting ';'\n");
1382 ctxt->wellFormed = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001383 if (ctxt->sax->characters != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001384 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001385 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1386 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00001387 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001388 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001389 }
1390 }
1391 return(ent);
1392}
1393
1394/**
1395 * htmlParseAttValue:
1396 * @ctxt: an HTML parser context
1397 *
1398 * parse a value for an attribute
1399 * Note: the parser won't do substitution of entities here, this
1400 * will be handled later in xmlStringGetNodeList, unless it was
1401 * asked for ctxt->replaceEntities != 0
1402 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001403 * Returns the AttValue parsed or NULL.
1404 */
1405
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001406xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001407htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001408 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001409
1410 if (CUR == '"') {
1411 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001412 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001413 if (CUR != '"') {
1414 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1415 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1416 ctxt->wellFormed = 0;
1417 } else
1418 NEXT;
1419 } else if (CUR == '\'') {
1420 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001421 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001422 if (CUR != '\'') {
1423 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1424 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1425 ctxt->wellFormed = 0;
1426 } else
1427 NEXT;
1428 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001429 /*
1430 * That's an HTMLism, the attribute value may not be quoted
1431 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001432 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001433 if (ret == NULL) {
1434 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1435 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1436 ctxt->wellFormed = 0;
1437 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001438 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001439 return(ret);
1440}
1441
1442/**
1443 * htmlParseSystemLiteral:
1444 * @ctxt: an HTML parser context
1445 *
1446 * parse an HTML Literal
1447 *
1448 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1449 *
1450 * Returns the SystemLiteral parsed or NULL
1451 */
1452
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001453xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001454htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001455 const xmlChar *q;
1456 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001457
1458 if (CUR == '"') {
1459 NEXT;
1460 q = CUR_PTR;
1461 while ((IS_CHAR(CUR)) && (CUR != '"'))
1462 NEXT;
1463 if (!IS_CHAR(CUR)) {
1464 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1465 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1466 ctxt->wellFormed = 0;
1467 } else {
1468 ret = xmlStrndup(q, CUR_PTR - q);
1469 NEXT;
1470 }
1471 } else if (CUR == '\'') {
1472 NEXT;
1473 q = CUR_PTR;
1474 while ((IS_CHAR(CUR)) && (CUR != '\''))
1475 NEXT;
1476 if (!IS_CHAR(CUR)) {
1477 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1478 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1479 ctxt->wellFormed = 0;
1480 } else {
1481 ret = xmlStrndup(q, CUR_PTR - q);
1482 NEXT;
1483 }
1484 } else {
1485 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1486 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1487 ctxt->wellFormed = 0;
1488 }
1489
1490 return(ret);
1491}
1492
1493/**
1494 * htmlParsePubidLiteral:
1495 * @ctxt: an HTML parser context
1496 *
1497 * parse an HTML public literal
1498 *
1499 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1500 *
1501 * Returns the PubidLiteral parsed or NULL.
1502 */
1503
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001504xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001505htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001506 const xmlChar *q;
1507 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001508 /*
1509 * Name ::= (Letter | '_') (NameChar)*
1510 */
1511 if (CUR == '"') {
1512 NEXT;
1513 q = CUR_PTR;
1514 while (IS_PUBIDCHAR(CUR)) NEXT;
1515 if (CUR != '"') {
1516 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1517 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1518 ctxt->wellFormed = 0;
1519 } else {
1520 ret = xmlStrndup(q, CUR_PTR - q);
1521 NEXT;
1522 }
1523 } else if (CUR == '\'') {
1524 NEXT;
1525 q = CUR_PTR;
1526 while ((IS_LETTER(CUR)) && (CUR != '\''))
1527 NEXT;
1528 if (!IS_LETTER(CUR)) {
1529 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1530 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1531 ctxt->wellFormed = 0;
1532 } else {
1533 ret = xmlStrndup(q, CUR_PTR - q);
1534 NEXT;
1535 }
1536 } else {
1537 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1538 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1539 ctxt->wellFormed = 0;
1540 }
1541
1542 return(ret);
1543}
1544
1545/**
1546 * htmlParseCharData:
1547 * @ctxt: an HTML parser context
1548 * @cdata: int indicating whether we are within a CDATA section
1549 *
1550 * parse a CharData section.
1551 * if we are within a CDATA section ']]>' marks an end of section.
1552 *
1553 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1554 */
1555
1556void
1557htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001558 const xmlChar *q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001559
1560 q = CUR_PTR;
1561 while ((IS_CHAR(CUR)) && (CUR != '<') &&
1562 (CUR != '&')) {
1563 if ((CUR == ']') && (NXT(1) == ']') &&
1564 (NXT(2) == '>')) {
1565 if (cdata) break;
1566 else {
1567 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1568 ctxt->sax->error(ctxt->userData,
1569 "Sequence ']]>' not allowed in content\n");
1570 ctxt->wellFormed = 0;
1571 }
1572 }
1573 NEXT;
1574 }
1575 if (q == CUR_PTR) return;
1576
1577 /*
1578 * Ok the segment [q CUR_PTR] is to be consumed as chars.
1579 */
1580 if (ctxt->sax != NULL) {
1581 if (areBlanks(ctxt, q, CUR_PTR - q)) {
1582 if (ctxt->sax->ignorableWhitespace != NULL)
1583 ctxt->sax->ignorableWhitespace(ctxt->userData, q, CUR_PTR - q);
1584 } else {
1585 if (ctxt->sax->characters != NULL)
1586 ctxt->sax->characters(ctxt->userData, q, CUR_PTR - q);
1587 }
1588 }
1589}
1590
1591/**
1592 * htmlParseExternalID:
1593 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001594 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001595 * @strict: indicate whether we should restrict parsing to only
1596 * production [75], see NOTE below
1597 *
1598 * Parse an External ID or a Public ID
1599 *
1600 * NOTE: Productions [75] and [83] interract badly since [75] can generate
1601 * 'PUBLIC' S PubidLiteral S SystemLiteral
1602 *
1603 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1604 * | 'PUBLIC' S PubidLiteral S SystemLiteral
1605 *
1606 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1607 *
1608 * Returns the function returns SystemLiteral and in the second
1609 * case publicID receives PubidLiteral, is strict is off
1610 * it is possible to return NULL and have publicID set.
1611 */
1612
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001613xmlChar *
1614htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1615 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001616
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001617 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1618 (UPP(2) == 'S') && (UPP(3) == 'T') &&
1619 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001620 SKIP(6);
1621 if (!IS_BLANK(CUR)) {
1622 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1623 ctxt->sax->error(ctxt->userData,
1624 "Space required after 'SYSTEM'\n");
1625 ctxt->wellFormed = 0;
1626 }
1627 SKIP_BLANKS;
1628 URI = htmlParseSystemLiteral(ctxt);
1629 if (URI == NULL) {
1630 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1631 ctxt->sax->error(ctxt->userData,
1632 "htmlParseExternalID: SYSTEM, no URI\n");
1633 ctxt->wellFormed = 0;
1634 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001635 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1636 (UPP(2) == 'B') && (UPP(3) == 'L') &&
1637 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001638 SKIP(6);
1639 if (!IS_BLANK(CUR)) {
1640 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1641 ctxt->sax->error(ctxt->userData,
1642 "Space required after 'PUBLIC'\n");
1643 ctxt->wellFormed = 0;
1644 }
1645 SKIP_BLANKS;
1646 *publicID = htmlParsePubidLiteral(ctxt);
1647 if (*publicID == NULL) {
1648 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1649 ctxt->sax->error(ctxt->userData,
1650 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1651 ctxt->wellFormed = 0;
1652 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001653 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001654 if ((CUR == '"') || (CUR == '\'')) {
1655 URI = htmlParseSystemLiteral(ctxt);
1656 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001657 }
1658 return(URI);
1659}
1660
1661/**
1662 * htmlParseComment:
1663 * @ctxt: an HTML parser context
1664 * @create: should we create a node, or just skip the content
1665 *
1666 * Parse an XML (SGML) comment <!-- .... -->
1667 *
1668 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1669 */
1670void
1671htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001672 const xmlChar *q, *start;
1673 const xmlChar *r;
1674 xmlChar *val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001675
1676 /*
1677 * Check that there is a comment right here.
1678 */
1679 if ((CUR != '<') || (NXT(1) != '!') ||
1680 (NXT(2) != '-') || (NXT(3) != '-')) return;
1681
1682 SKIP(4);
1683 start = q = CUR_PTR;
1684 NEXT;
1685 r = CUR_PTR;
1686 NEXT;
1687 while (IS_CHAR(CUR) &&
1688 ((CUR == ':') || (CUR != '>') ||
1689 (*r != '-') || (*q != '-'))) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001690 NEXT;r++;q++;
1691 }
1692 if (!IS_CHAR(CUR)) {
1693 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1694 ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", start);
1695 ctxt->wellFormed = 0;
1696 } else {
1697 NEXT;
1698 if (create) {
Daniel Veillard4c3a2031999-11-19 17:46:26 +00001699 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
1700 val = xmlStrndup(start, q - start);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001701 ctxt->sax->comment(ctxt->userData, val);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00001702 xmlFree(val);
1703 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001704 }
1705 }
1706}
1707
1708/**
1709 * htmlParseCharRef:
1710 * @ctxt: an HTML parser context
1711 *
1712 * parse Reference declarations
1713 *
1714 * [66] CharRef ::= '&#' [0-9]+ ';' |
1715 * '&#x' [0-9a-fA-F]+ ';'
1716 *
1717 * Returns the value parsed (as an int)
1718 */
1719int
1720htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1721 int val = 0;
1722
1723 if ((CUR == '&') && (NXT(1) == '#') &&
1724 (NXT(2) == 'x')) {
1725 SKIP(3);
1726 while (CUR != ';') {
1727 if ((CUR >= '0') && (CUR <= '9'))
1728 val = val * 16 + (CUR - '0');
1729 else if ((CUR >= 'a') && (CUR <= 'f'))
1730 val = val * 16 + (CUR - 'a') + 10;
1731 else if ((CUR >= 'A') && (CUR <= 'F'))
1732 val = val * 16 + (CUR - 'A') + 10;
1733 else {
1734 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1735 ctxt->sax->error(ctxt->userData,
1736 "htmlParseCharRef: invalid hexadecimal value\n");
1737 ctxt->wellFormed = 0;
1738 val = 0;
1739 break;
1740 }
1741 NEXT;
1742 }
1743 if (CUR == ';')
1744 NEXT;
1745 } else if ((CUR == '&') && (NXT(1) == '#')) {
1746 SKIP(2);
1747 while (CUR != ';') {
1748 if ((CUR >= '0') && (CUR <= '9'))
1749 val = val * 10 + (CUR - '0');
1750 else {
1751 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1752 ctxt->sax->error(ctxt->userData,
1753 "htmlParseCharRef: invalid decimal value\n");
1754 ctxt->wellFormed = 0;
1755 val = 0;
1756 break;
1757 }
1758 NEXT;
1759 }
1760 if (CUR == ';')
1761 NEXT;
1762 } else {
1763 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1764 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
1765 ctxt->wellFormed = 0;
1766 }
1767 /*
1768 * Check the value IS_CHAR ...
1769 */
1770 if (IS_CHAR(val)) {
1771 return(val);
1772 } else {
1773 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001774 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001775 val);
1776 ctxt->wellFormed = 0;
1777 }
1778 return(0);
1779}
1780
1781
1782/**
1783 * htmlParseDocTypeDecl :
1784 * @ctxt: an HTML parser context
1785 *
1786 * parse a DOCTYPE declaration
1787 *
1788 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
1789 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
1790 */
1791
1792void
1793htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001794 xmlChar *name;
1795 xmlChar *ExternalID = NULL;
1796 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001797
1798 /*
1799 * We know that '<!DOCTYPE' has been detected.
1800 */
1801 SKIP(9);
1802
1803 SKIP_BLANKS;
1804
1805 /*
1806 * Parse the DOCTYPE name.
1807 */
1808 name = htmlParseName(ctxt);
1809 if (name == NULL) {
1810 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1811 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
1812 ctxt->wellFormed = 0;
1813 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001814 /*
1815 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
1816 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001817
1818 SKIP_BLANKS;
1819
1820 /*
1821 * Check for SystemID and ExternalID
1822 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001823 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001824 SKIP_BLANKS;
1825
1826 /*
1827 * We should be at the end of the DOCTYPE declaration.
1828 */
1829 if (CUR != '>') {
1830 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1831 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
1832 ctxt->wellFormed = 0;
1833 /* We shouldn't try to resynchronize ... */
1834 } else {
1835 }
1836 NEXT;
1837
1838 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001839 * Create the document accordingly to the DOCTYPE
1840 */
1841 ctxt->myDoc = htmlNewDoc(URI, ExternalID);
1842
1843 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001844 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001845 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001846 if (URI != NULL) xmlFree(URI);
1847 if (ExternalID != NULL) xmlFree(ExternalID);
1848 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001849}
1850
1851/**
1852 * htmlParseAttribute:
1853 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001854 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001855 *
1856 * parse an attribute
1857 *
1858 * [41] Attribute ::= Name Eq AttValue
1859 *
1860 * [25] Eq ::= S? '=' S?
1861 *
1862 * With namespace:
1863 *
1864 * [NS 11] Attribute ::= QName Eq AttValue
1865 *
1866 * Also the case QName == xmlns:??? is handled independently as a namespace
1867 * definition.
1868 *
1869 * Returns the attribute name, and the value in *value.
1870 */
1871
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001872xmlChar *
1873htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1874 xmlChar *name, *val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001875
1876 *value = NULL;
1877 name = htmlParseName(ctxt);
1878 if (name == NULL) {
1879 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1880 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
1881 ctxt->wellFormed = 0;
1882 return(NULL);
1883 }
1884
1885 /*
1886 * read the value
1887 */
1888 SKIP_BLANKS;
1889 if (CUR == '=') {
1890 NEXT;
1891 SKIP_BLANKS;
1892 val = htmlParseAttValue(ctxt);
1893 } else {
1894 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1895 ctxt->sax->error(ctxt->userData,
1896 "Specification mandate value for attribute %s\n", name);
1897 ctxt->wellFormed = 0;
1898 return(NULL);
1899 }
1900
1901 *value = val;
1902 return(name);
1903}
1904
1905/**
1906 * htmlParseStartTag:
1907 * @ctxt: an HTML parser context
1908 *
1909 * parse a start of tag either for rule element or
1910 * EmptyElement. In both case we don't parse the tag closing chars.
1911 *
1912 * [40] STag ::= '<' Name (S Attribute)* S? '>'
1913 *
1914 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
1915 *
1916 * With namespace:
1917 *
1918 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
1919 *
1920 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
1921 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001922 */
1923
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001924void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001925htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001926 xmlChar *name;
1927 xmlChar *attname;
1928 xmlChar *attvalue;
1929 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001930 int nbatts = 0;
1931 int maxatts = 0;
1932 int i;
1933
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001934 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001935 NEXT;
1936
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001937 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001938 name = htmlParseHTMLName(ctxt);
1939 if (name == NULL) {
1940 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1941 ctxt->sax->error(ctxt->userData,
1942 "htmlParseStartTag: invalid element name\n");
1943 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001944 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001945 }
1946
1947 /*
1948 * Check for auto-closure of HTML elements.
1949 */
1950 htmlAutoClose(ctxt, name);
1951
1952 /*
1953 * Now parse the attributes, it ends up with the ending
1954 *
1955 * (S Attribute)* S?
1956 */
1957 SKIP_BLANKS;
1958 while ((IS_CHAR(CUR)) &&
1959 (CUR != '>') &&
1960 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001961 const xmlChar *q = CUR_PTR;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001962 int cons = ctxt->input->consumed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001963
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001964 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001965 attname = htmlParseAttribute(ctxt, &attvalue);
1966 if ((attname != NULL) && (attvalue != NULL)) {
1967 /*
1968 * Well formedness requires at most one declaration of an attribute
1969 */
1970 for (i = 0; i < nbatts;i += 2) {
1971 if (!xmlStrcmp(atts[i], attname)) {
1972 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001973 ctxt->sax->error(ctxt->userData,
1974 "Attribute %s redefined\n",
1975 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001976 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00001977 xmlFree(attname);
1978 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001979 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001980 }
1981 }
1982
1983 /*
1984 * Add the pair to atts
1985 */
1986 if (atts == NULL) {
1987 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001988 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001989 if (atts == NULL) {
1990 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001991 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001992 if (name != NULL) xmlFree(name);
1993 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001994 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00001995 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001996 maxatts *= 2;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001997 atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001998 if (atts == NULL) {
1999 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002000 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002001 if (name != NULL) xmlFree(name);
2002 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002003 }
2004 }
2005 atts[nbatts++] = attname;
2006 atts[nbatts++] = attvalue;
2007 atts[nbatts] = NULL;
2008 atts[nbatts + 1] = NULL;
2009 }
2010
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002011failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002012 SKIP_BLANKS;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002013 if ((cons == ctxt->input->consumed) && (q == CUR_PTR)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002014 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2015 ctxt->sax->error(ctxt->userData,
2016 "htmlParseStartTag: problem parsing attributes\n");
2017 ctxt->wellFormed = 0;
2018 break;
2019 }
2020 }
2021
2022 /*
2023 * SAX: Start of Element !
2024 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002025 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002026#ifdef DEBUG
2027 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2028#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002029 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2030 ctxt->sax->startElement(ctxt->userData, name, atts);
2031
2032 if (atts != NULL) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002033 for (i = 0;i < nbatts;i++) xmlFree((xmlChar *) atts[i]);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002034 xmlFree(atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002035 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002036 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002037}
2038
2039/**
2040 * htmlParseEndTag:
2041 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002042 *
2043 * parse an end of tag
2044 *
2045 * [42] ETag ::= '</' Name S? '>'
2046 *
2047 * With namespace
2048 *
2049 * [NS 9] ETag ::= '</' QName S? '>'
2050 */
2051
2052void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002053htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002054 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002055 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002056 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002057
2058 if ((CUR != '<') || (NXT(1) != '/')) {
2059 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2060 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2061 ctxt->wellFormed = 0;
2062 return;
2063 }
2064 SKIP(2);
2065
2066 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002067 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002068
2069 /*
2070 * We should definitely be at the ending "S? '>'" part
2071 */
2072 SKIP_BLANKS;
2073 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2074 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2075 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2076 ctxt->wellFormed = 0;
2077 } else
2078 NEXT;
2079
2080 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002081 * If the name read is not one of the element in the parsing stack
2082 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002083 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002084 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2085 if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002086 }
2087 if (i < 0) {
2088 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002089 ctxt->sax->error(ctxt->userData,
2090 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002091 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002092 ctxt->wellFormed = 0;
2093 return;
2094 }
2095
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002096
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002097 /*
2098 * Check for auto-closure of HTML elements.
2099 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002100
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002101 htmlAutoCloseOnClose(ctxt, name);
2102
2103 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002104 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002105 * With the exception that the autoclose may have popped stuff out
2106 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002107 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002108 if (xmlStrcmp(name, ctxt->name)) {
2109#ifdef DEBUG
2110 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2111#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002112 if ((ctxt->name != NULL) &&
2113 (xmlStrcmp(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002114 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2115 ctxt->sax->error(ctxt->userData,
2116 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002117 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002118 ctxt->wellFormed = 0;
2119 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002120 }
2121
2122 /*
2123 * SAX: End of Tag
2124 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002125 oldname = ctxt->name;
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002126 if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002127 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2128 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002129 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002130 if (oldname != NULL) {
2131#ifdef DEBUG
2132 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2133#endif
2134 xmlFree(oldname);
2135#ifdef DEBUG
2136 } else {
2137 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2138#endif
2139 }
2140 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002141
2142 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00002143 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002144
2145 return;
2146}
2147
2148
2149/**
2150 * htmlParseReference:
2151 * @ctxt: an HTML parser context
2152 *
2153 * parse and handle entity references in content,
2154 * this will end-up in a call to character() since this is either a
2155 * CharRef, or a predefined entity.
2156 */
2157void
2158htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002159 htmlEntityDescPtr ent;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002160 xmlChar out[2];
2161 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002162 int val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002163 if (CUR != '&') return;
2164
2165 if (NXT(1) == '#') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002166 val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +00002167 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002168 out[0] = val;
2169 out[1] = 0;
2170 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2171 ctxt->sax->characters(ctxt->userData, out, 1);
2172 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002173 ent = htmlParseEntityRef(ctxt, &name);
2174 if (name == NULL) return; /* Shall we output & anyway ? */
2175 if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2176 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002177 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002178 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillardb96e6431999-08-29 21:02:19 +00002179 ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002180 }
2181 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002182 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002183 out[0] = ent->value;
2184 out[1] = 0;
2185 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2186 ctxt->sax->characters(ctxt->userData, out, 1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002187 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002188 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002189 }
2190}
2191
2192/**
2193 * htmlParseContent:
2194 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002195 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002196 *
2197 * Parse a content: comment, sub-element, reference or text.
2198 *
2199 */
2200
2201void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002202htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002203 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002204 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002205
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002206 currentNode = ctxt->name;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002207 depth = ctxt->nameNr;
2208 while (1) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002209 const xmlChar *test = CUR_PTR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002210
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002211 GROW;
2212 /*
2213 * Our tag or one of it's parent or children is ending.
2214 */
2215 if ((CUR == '<') && (NXT(1) == '/')) {
2216 htmlParseEndTag(ctxt);
2217 return;
2218 }
2219
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002220 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002221 * Has this node been popped out during parsing of
2222 * the next element
2223 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002224 if ((currentNode != ctxt->name) &&
2225 (depth >= ctxt->nameNr)) return;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002226
2227 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002228 * First case : a comment
2229 */
2230 if ((CUR == '<') && (NXT(1) == '!') &&
2231 (NXT(2) == '-') && (NXT(3) == '-')) {
2232 htmlParseComment(ctxt, 1);
2233 }
2234
2235 /*
2236 * Second case : a sub-element.
2237 */
2238 else if (CUR == '<') {
2239 htmlParseElement(ctxt);
2240 }
2241
2242 /*
2243 * Third case : a reference. If if has not been resolved,
2244 * parsing returns it's Name, create the node
2245 */
2246 else if (CUR == '&') {
2247 htmlParseReference(ctxt);
2248 }
2249
2250 /*
2251 * Last case, text. Note that References are handled directly.
2252 */
2253 else {
2254 htmlParseCharData(ctxt, 0);
2255 }
2256
2257 if (test == CUR_PTR) {
Daniel Veillard35008381999-10-25 13:15:52 +00002258 if (ctxt->node != NULL) {
2259 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2260 ctxt->sax->error(ctxt->userData,
2261 "detected an error in element content\n");
2262 ctxt->wellFormed = 0;
2263 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002264 break;
2265 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002266
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002267 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002268 }
2269}
2270
2271/**
2272 * htmlParseElement:
2273 * @ctxt: an HTML parser context
2274 *
2275 * parse an HTML element, this is highly recursive
2276 *
2277 * [39] element ::= EmptyElemTag | STag content ETag
2278 *
2279 * [41] Attribute ::= Name Eq AttValue
2280 */
2281
2282void
2283htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002284 const xmlChar *openTag = CUR_PTR;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002285 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002286 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00002287 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002288 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002289 htmlParserNodeInfo node_info;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002290 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002291
2292 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002293 if (ctxt->record_info) {
2294 node_info.begin_pos = ctxt->input->consumed +
2295 (CUR_PTR - ctxt->input->base);
2296 node_info.begin_line = ctxt->input->line;
2297 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002298
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002299 oldname = ctxt->name;
2300 htmlParseStartTag(ctxt);
2301 name = ctxt->name;
2302#ifdef DEBUG
2303 if (oldname == NULL)
2304 fprintf(stderr, "Start of element %s\n", name);
2305 else if (name == NULL)
2306 fprintf(stderr, "Start of element failed, was %s\n", oldname);
2307 else
2308 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2309#endif
2310 if (((depth == ctxt->nameNr) && (oldname == ctxt->name)) ||
2311 (name == NULL)) {
2312 if (CUR == '>')
2313 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002314 return;
2315 }
2316
2317 /*
2318 * Lookup the info for that element.
2319 */
2320 info = htmlTagLookup(name);
2321 if (info == NULL) {
2322 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2323 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2324 name);
2325 ctxt->wellFormed = 0;
2326 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002327/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002328 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2329 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2330 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002331 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002332 }
2333
2334 /*
2335 * Check for an Empty Element labelled the XML/SGML way
2336 */
2337 if ((CUR == '/') && (NXT(1) == '>')) {
2338 SKIP(2);
2339 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2340 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002341 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002342#ifdef DEBUG
2343 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2344#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002345 if (oldname != NULL)
2346 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002347 return;
2348 }
2349
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002350 if (CUR == '>') {
2351 NEXT;
2352 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002353 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2354 ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2355 openTag);
2356 ctxt->wellFormed = 0;
2357
2358 /*
2359 * end of parsing of this node.
2360 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002361 if (!xmlStrcmp(name, ctxt->name)) {
2362 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002363 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002364#ifdef DEBUG
2365 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2366#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002367 if (oldname != NULL)
2368 xmlFree(oldname);
2369 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002370
2371 /*
2372 * Capture end position and add node
2373 */
2374 if ( currentNode != NULL && ctxt->record_info ) {
2375 node_info.end_pos = ctxt->input->consumed +
2376 (CUR_PTR - ctxt->input->base);
2377 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002378 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002379 xmlParserAddNodeInfo(ctxt, &node_info);
2380 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002381 return;
2382 }
2383
2384 /*
2385 * Check for an Empty Element from DTD definition
2386 */
2387 if ((info != NULL) && (info->empty)) {
2388 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2389 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002390 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002391#ifdef DEBUG
2392 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2393#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002394 if (oldname != NULL)
2395 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002396 return;
2397 }
2398
2399 /*
2400 * Parse the content of the element:
2401 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002402 currentNode = ctxt->name;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002403 depth = ctxt->nameNr;
2404 while (IS_CHAR(CUR)) {
2405 htmlParseContent(ctxt);
2406 if (ctxt->nameNr < depth) break;
2407 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002408
2409 if (!IS_CHAR(CUR)) {
2410 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2411 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002412 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002413 ctxt->wellFormed = 0;
2414
2415 /*
2416 * end of parsing of this node.
2417 */
2418 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002419 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002420#ifdef DEBUG
2421 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2422#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002423 if (oldname != NULL)
2424 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002425 return;
2426 }
2427
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002428 /*
2429 * Capture end position and add node
2430 */
2431 if ( currentNode != NULL && ctxt->record_info ) {
2432 node_info.end_pos = ctxt->input->consumed +
2433 (CUR_PTR - ctxt->input->base);
2434 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002435 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002436 xmlParserAddNodeInfo(ctxt, &node_info);
2437 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002438}
2439
2440/**
2441 * htmlParseDocument :
2442 * @ctxt: an HTML parser context
2443 *
2444 * parse an HTML document (and build a tree if using the standard SAX
2445 * interface).
2446 *
2447 * Returns 0, -1 in case of error. the parser context is augmented
2448 * as a result of the parsing.
2449 */
2450
2451int
2452htmlParseDocument(htmlParserCtxtPtr ctxt) {
2453 htmlDefaultSAXHandlerInit();
2454 ctxt->html = 1;
2455
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002456 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002457 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00002458 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002459 */
2460 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2461 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2462
2463 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002464 * Wipe out everything which is before the first '<'
2465 */
Daniel Veillard35008381999-10-25 13:15:52 +00002466 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002467 if (CUR == 0) {
2468 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2469 ctxt->sax->error(ctxt->userData, "Document is empty\n");
2470 ctxt->wellFormed = 0;
2471 }
2472
Daniel Veillard35008381999-10-25 13:15:52 +00002473 /*
2474 * Parse possible comments before any content
2475 */
2476 while ((CUR == '<') && (NXT(1) == '!') &&
2477 (NXT(2) == '-') && (NXT(3) == '-')) {
2478 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2479 htmlParseComment(ctxt, 1);
2480 SKIP_BLANKS;
2481 }
2482
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002483
2484 /*
2485 * Then possibly doc type declaration(s) and more Misc
2486 * (doctypedecl Misc*)?
2487 */
2488 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002489 (UPP(2) == 'D') && (UPP(3) == 'O') &&
2490 (UPP(4) == 'C') && (UPP(5) == 'T') &&
2491 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2492 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002493 htmlParseDocTypeDecl(ctxt);
2494 }
2495 SKIP_BLANKS;
2496
2497 /*
2498 * Create the document if not done already.
2499 */
2500 if (ctxt->myDoc == NULL) {
2501 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2502 }
2503
2504 /*
2505 * Time to start parsing the tree itself
2506 */
Daniel Veillard35008381999-10-25 13:15:52 +00002507 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002508
2509 /*
2510 * SAX: end of the document processing.
2511 */
2512 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2513 ctxt->sax->endDocument(ctxt->userData);
2514 if (! ctxt->wellFormed) return(-1);
2515 return(0);
2516}
2517
2518
2519/********************************************************************************
2520 * *
2521 * Parser contexts handling *
2522 * *
2523 ********************************************************************************/
2524
2525/**
2526 * xmlInitParserCtxt:
2527 * @ctxt: an HTML parser context
2528 *
2529 * Initialize a parser context
2530 */
2531
2532void
2533htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2534{
2535 htmlSAXHandler *sax;
2536
Daniel Veillard35008381999-10-25 13:15:52 +00002537 if (ctxt == NULL) return;
2538 memset(ctxt, 0, sizeof(htmlParserCtxt));
2539
Daniel Veillard6454aec1999-09-02 22:04:43 +00002540 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002541 if (sax == NULL) {
2542 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2543 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002544 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002545
2546 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002547 ctxt->inputTab = (htmlParserInputPtr *)
2548 xmlMalloc(5 * sizeof(htmlParserInputPtr));
2549 if (ctxt->inputTab == NULL) {
2550 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2551 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002552 ctxt->inputNr = 0;
2553 ctxt->inputMax = 5;
2554 ctxt->input = NULL;
2555 ctxt->version = NULL;
2556 ctxt->encoding = NULL;
2557 ctxt->standalone = -1;
2558
2559 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002560 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002561 ctxt->nodeNr = 0;
2562 ctxt->nodeMax = 10;
2563 ctxt->node = NULL;
2564
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002565 /* Allocate the Name stack */
2566 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
2567 ctxt->nameNr = 0;
2568 ctxt->nameMax = 10;
2569 ctxt->name = NULL;
2570
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002571 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2572 else {
2573 ctxt->sax = sax;
2574 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2575 }
2576 ctxt->userData = ctxt;
2577 ctxt->myDoc = NULL;
2578 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002579 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002580 ctxt->html = 1;
2581 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00002582 ctxt->validate = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002583 xmlInitNodeInfoSeq(&ctxt->node_seq);
2584}
2585
2586/**
2587 * htmlFreeParserCtxt:
2588 * @ctxt: an HTML parser context
2589 *
2590 * Free all the memory used by a parser context. However the parsed
2591 * document in ctxt->myDoc is not freed.
2592 */
2593
2594void
2595htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2596{
2597 htmlParserInputPtr input;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002598 xmlChar *oldname;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002599
2600 if (ctxt == NULL) return;
2601
2602 while ((input = inputPop(ctxt)) != NULL) {
2603 xmlFreeInputStream(input);
2604 }
2605
Daniel Veillard6454aec1999-09-02 22:04:43 +00002606 if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002607 while ((oldname = htmlnamePop(ctxt)) != NULL) {
2608 xmlFree(oldname);
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002609 }
2610 if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002611 if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
2612 if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002613 if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
Daniel Veillard6454aec1999-09-02 22:04:43 +00002614 xmlFree(ctxt->sax);
2615 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002616}
2617
2618/**
2619 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002620 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002621 * @encoding: a free form C string describing the HTML document encoding, or NULL
2622 *
2623 * Create a parser context for an HTML document.
2624 *
2625 * Returns the new parser context or NULL
2626 */
2627htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002628htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002629 htmlParserCtxtPtr ctxt;
2630 htmlParserInputPtr input;
2631 /* htmlCharEncoding enc; */
2632
Daniel Veillard6454aec1999-09-02 22:04:43 +00002633 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002634 if (ctxt == NULL) {
2635 perror("malloc");
2636 return(NULL);
2637 }
2638 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002639 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002640 if (input == NULL) {
2641 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002642 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002643 return(NULL);
2644 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002645 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002646
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002647 input->line = 1;
2648 input->col = 1;
2649 input->base = cur;
2650 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002651
2652 inputPush(ctxt, input);
2653 return(ctxt);
2654}
2655
2656/********************************************************************************
2657 * *
2658 * User entry points *
2659 * *
2660 ********************************************************************************/
2661
2662/**
2663 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002664 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002665 * @encoding: a free form C string describing the HTML document encoding, or NULL
2666 * @sax: the SAX handler block
2667 * @userData: if using SAX, this pointer will be provided on callbacks.
2668 *
2669 * parse an HTML in-memory document and build a tree.
2670 * It use the given SAX function block to handle the parsing callback.
2671 * If sax is NULL, fallback to the default DOM tree building routines.
2672 *
2673 * Returns the resulting document tree
2674 */
2675
2676htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002677htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002678 htmlDocPtr ret;
2679 htmlParserCtxtPtr ctxt;
2680
2681 if (cur == NULL) return(NULL);
2682
2683
2684 ctxt = htmlCreateDocParserCtxt(cur, encoding);
2685 if (ctxt == NULL) return(NULL);
2686 if (sax != NULL) {
2687 ctxt->sax = sax;
2688 ctxt->userData = userData;
2689 }
2690
2691 htmlParseDocument(ctxt);
2692 ret = ctxt->myDoc;
2693 if (sax != NULL) {
2694 ctxt->sax = NULL;
2695 ctxt->userData = NULL;
2696 }
2697 htmlFreeParserCtxt(ctxt);
2698
2699 return(ret);
2700}
2701
2702/**
2703 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002704 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002705 * @encoding: a free form C string describing the HTML document encoding, or NULL
2706 *
2707 * parse an HTML in-memory document and build a tree.
2708 *
2709 * Returns the resulting document tree
2710 */
2711
2712htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002713htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002714 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
2715}
2716
2717
2718/**
2719 * htmlCreateFileParserCtxt :
2720 * @filename: the filename
2721 * @encoding: a free form C string describing the HTML document encoding, or NULL
2722 *
2723 * Create a parser context for a file content.
2724 * Automatic support for ZLIB/Compress compressed document is provided
2725 * by default if found at compile-time.
2726 *
2727 * Returns the new parser context or NULL
2728 */
2729htmlParserCtxtPtr
2730htmlCreateFileParserCtxt(const char *filename, const char *encoding)
2731{
2732 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002733 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002734 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002735 /* htmlCharEncoding enc; */
2736
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002737 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
2738 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002739
Daniel Veillard6454aec1999-09-02 22:04:43 +00002740 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002741 if (ctxt == NULL) {
2742 perror("malloc");
2743 return(NULL);
2744 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002745 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002746 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002747 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002748 if (inputStream == NULL) {
2749 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002750 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002751 return(NULL);
2752 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002753 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002754
Daniel Veillard6454aec1999-09-02 22:04:43 +00002755 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002756 inputStream->line = 1;
2757 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002758 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00002759 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002760
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002761 inputStream->base = inputStream->buf->buffer->content;
2762 inputStream->cur = inputStream->buf->buffer->content;
2763 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002764
2765 inputPush(ctxt, inputStream);
2766 return(ctxt);
2767}
2768
2769/**
2770 * htmlSAXParseFile :
2771 * @filename: the filename
2772 * @encoding: a free form C string describing the HTML document encoding, or NULL
2773 * @sax: the SAX handler block
2774 * @userData: if using SAX, this pointer will be provided on callbacks.
2775 *
2776 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2777 * compressed document is provided by default if found at compile-time.
2778 * It use the given SAX function block to handle the parsing callback.
2779 * If sax is NULL, fallback to the default DOM tree building routines.
2780 *
2781 * Returns the resulting document tree
2782 */
2783
2784htmlDocPtr
2785htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
2786 void *userData) {
2787 htmlDocPtr ret;
2788 htmlParserCtxtPtr ctxt;
2789
2790 ctxt = htmlCreateFileParserCtxt(filename, encoding);
2791 if (ctxt == NULL) return(NULL);
2792 if (sax != NULL) {
2793 ctxt->sax = sax;
2794 ctxt->userData = userData;
2795 }
2796
2797 htmlParseDocument(ctxt);
2798
2799 ret = ctxt->myDoc;
2800 if (sax != NULL) {
2801 ctxt->sax = NULL;
2802 ctxt->userData = NULL;
2803 }
2804 htmlFreeParserCtxt(ctxt);
2805
2806 return(ret);
2807}
2808
2809/**
2810 * htmlParseFile :
2811 * @filename: the filename
2812 * @encoding: a free form C string describing the HTML document encoding, or NULL
2813 *
2814 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2815 * compressed document is provided by default if found at compile-time.
2816 *
2817 * Returns the resulting document tree
2818 */
2819
2820htmlDocPtr
2821htmlParseFile(const char *filename, const char *encoding) {
2822 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
2823}