blob: a1fdac5cd8c65d5357915c33b26c6966b9f7e6f3 [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillardbe70ff71999-07-05 16:50:46 +000015#include <stdio.h>
Daniel Veillardbe70ff71999-07-05 16:50:46 +000016#include <string.h> /* for memset() only */
Daniel Veillard7f7d1111999-09-22 09:46:25 +000017#ifdef HAVE_CTYPE_H
18#include <ctype.h>
19#endif
20#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000021#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000022#endif
23#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000024#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000025#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000026#ifdef HAVE_FCNTL_H
27#include <fcntl.h>
28#endif
29#ifdef HAVE_UNISTD_H
30#include <unistd.h>
31#endif
32#ifdef HAVE_ZLIB_H
33#include <zlib.h>
34#endif
35
Daniel Veillard6454aec1999-09-02 22:04:43 +000036#include "xmlmemory.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000037#include "tree.h"
38#include "HTMLparser.h"
39#include "entities.h"
40#include "encoding.h"
41#include "valid.h"
42#include "parserInternals.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000043#include "xmlIO.h"
Daniel Veillard5e5c6231999-12-29 12:49:06 +000044#include "xml-error.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000045
46#define HTML_MAX_NAMELEN 1000
47#define INPUT_CHUNK 50
Daniel Veillard5e5c6231999-12-29 12:49:06 +000048#define HTML_PARSER_BIG_BUFFER_SIZE 1024
49#define HTML_PARSER_BUFFER_SIZE 100
Daniel Veillardbe70ff71999-07-05 16:50:46 +000050
Daniel Veillard82150d81999-07-07 07:32:15 +000051/* #define DEBUG */
Daniel Veillard5e5c6231999-12-29 12:49:06 +000052/* #define DEBUG_PUSH */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000053
54/************************************************************************
55 * *
56 * Parser stacks related functions and macros *
57 * *
58 ************************************************************************/
59
60/*
61 * Generic function for accessing stacks in the Parser Context
62 */
63
Daniel Veillarddbfd6411999-12-28 16:35:14 +000064#define PUSH_AND_POP(scope, type, name) \
65scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000066 if (ctxt->name##Nr >= ctxt->name##Max) { \
67 ctxt->name##Max *= 2; \
Daniel Veillard6454aec1999-09-02 22:04:43 +000068 ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000069 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
70 if (ctxt->name##Tab == NULL) { \
71 fprintf(stderr, "realloc failed !\n"); \
Daniel Veillard0142b842000-01-14 14:45:24 +000072 return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000073 } \
74 } \
75 ctxt->name##Tab[ctxt->name##Nr] = value; \
76 ctxt->name = value; \
77 return(ctxt->name##Nr++); \
78} \
Daniel Veillarddbfd6411999-12-28 16:35:14 +000079scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000080 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000081 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000082 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000083 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000084 if (ctxt->name##Nr > 0) \
85 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
86 else \
87 ctxt->name = NULL; \
88 ret = ctxt->name##Tab[ctxt->name##Nr]; \
89 ctxt->name##Tab[ctxt->name##Nr] = 0; \
90 return(ret); \
91} \
92
Daniel Veillarddbfd6411999-12-28 16:35:14 +000093PUSH_AND_POP(extern, xmlNodePtr, node)
94PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000095
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expand to expressions on a variable named
 * `ctxt`, which must therefore be in scope wherever they are used.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR  returns the current pointer to the xmlChar to be parsed.
 *   CUR      returns the current xmlChar value, i.e. an 8 bit value if
 *            compiled in ISO-Latin or UTF-8, and the current 16 bit value if
 *            compiled in UNICODE mode. This should be used internally by the
 *            parser only to compare to ASCII values, otherwise it would break
 *            when running with UTF-8 encoding.
 *   UPPER    returns the current xmlChar converted to uppercase with
 *            toupper().
 *   NXT(n)   returns the n'th next xmlChar. Same as CUR, it should be used
 *            only to compare on ASCII based substrings.
 *   UPP(n)   returns the n'th next xmlChar converted to uppercase. Same as
 *            CUR, it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skips n xmlChar, and must also be used only to skip ASCII
 *            defined strings within the parser. It adds n to both
 *            ctxt->nbChars and the input cursor and does not touch the
 *            line/column counters.
 *   SHRINK   calls xmlParserInputShrink() on the current input.
 *   GROW     calls xmlParserInputGrow() on the current input, asking for
 *            INPUT_CHUNK more.
 *   SKIP_BLANKS  applies NEXT for as long as the current xmlChar satisfies
 *            IS_BLANK().
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT  returns the current char value, with the full decoding of
 *            UTF-8 if we are using this mode. It returns an int.
 *   NEXT     skips to the next character, this does the proper decoding
 *            in UTF-8 mode. It also pops finished inputs on the fly.
 *   COPY(to) copies one char to *to, and increments CUR_PTR and to
 *            accordingly.
 *            NOTE(review): no COPY macro is defined in the code visible
 *            below -- confirm whether it still exists.
 */

#define CUR (*ctxt->input->cur)
#define UPPER (toupper(*ctxt->input->cur))
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
#define NXT(val) ctxt->input->cur[(val)]
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
#define CUR_PTR ctxt->input->cur
#define SHRINK xmlParserInputShrink(ctxt->input)
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* NEXT expands to a braced block, hence no separator after it here. */
#define SKIP_BLANKS \
    while (IS_BLANK(*(ctxt->input->cur))) NEXT

#ifndef USE_UTF_8
#define CURRENT (*ctxt->input->cur)
/*
 * NEXT: if the cursor sits on a 0 and xmlParserInputGrow() reports nothing
 * more (<= 0), the current input is popped with xmlPopInput(). Otherwise the
 * line/column counters are updated ('\n' starts a new line at column 1),
 * the cursor and ctxt->nbChars advance by one, and another grow is attempted
 * if the cursor now sits on a 0.
 */
#define NEXT { \
    if ((*ctxt->input->cur == 0) && \
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { \
            xmlPopInput(ctxt); \
    } else { \
        if (*(ctxt->input->cur) == '\n') { \
            ctxt->input->line++; ctxt->input->col = 1; \
        } else ctxt->input->col++; \
        ctxt->input->cur++; \
        ctxt->nbChars++; \
        if (*ctxt->input->cur == 0) \
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
    }}

#else
/* With USE_UTF_8 defined, CURRENT and NEXT are left undefined here. */
#endif
154
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000155
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000156
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000157/************************************************************************
158 * *
159 * The list of HTML elements and their properties *
160 * *
161 ************************************************************************/
162
/*
 * Description of the HTML 4.0 elements, one initializer per element, in
 * alphabetical order of the element name. htmlTagLookup() scans this table
 * linearly.
 *
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *  Empty:     1 for the elements whose end tag is forbidden
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
 *
 * NOTE(review): the description of "table" is the literal text "&#160;",
 * which looks like a leftover HTML character reference rather than a real
 * description -- confirm before relying on it.
 */
htmlElemDesc html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, "anchor " },
{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
{ "acronym", 0, 0, 0, 0, 0, "" },
{ "address", 0, 0, 0, 0, 0, "information on author " },
{ "applet", 0, 0, 0, 1, 1, "java applet " },
{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
{ "b", 0, 0, 0, 0, 0, "bold text style" },
{ "base", 0, 2, 1, 0, 0, "document base uri " },
{ "basefont", 0, 2, 1, 1, 1, "base font size " },
{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
{ "big", 0, 0, 0, 0, 0, "large text style" },
{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
{ "body", 1, 1, 0, 0, 0, "document body " },
{ "br", 0, 2, 1, 0, 0, "forced line break " },
{ "button", 0, 0, 0, 0, 0, "push button " },
{ "caption", 0, 0, 0, 0, 0, "table caption " },
{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
{ "cite", 0, 0, 0, 0, 0, "citation" },
{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
{ "col", 0, 2, 1, 0, 0, "table column " },
{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
{ "dd", 0, 1, 0, 0, 0, "definition description " },
{ "del", 0, 0, 0, 0, 0, "deleted text " },
{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
{ "dir", 0, 0, 0, 1, 1, "directory list" },
{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
{ "dl", 0, 0, 0, 0, 0, "definition list " },
{ "dt", 0, 1, 0, 0, 0, "definition term " },
{ "em", 0, 0, 0, 0, 0, "emphasis" },
{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
{ "font", 0, 0, 0, 1, 1, "local change to font " },
{ "form", 0, 0, 0, 0, 0, "interactive form " },
{ "frame", 0, 2, 1, 0, 2, "subwindow " },
{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
{ "h1", 0, 0, 0, 0, 0, "heading " },
{ "h2", 0, 0, 0, 0, 0, "heading " },
{ "h3", 0, 0, 0, 0, 0, "heading " },
{ "h4", 0, 0, 0, 0, 0, "heading " },
{ "h5", 0, 0, 0, 0, 0, "heading " },
{ "h6", 0, 0, 0, 0, 0, "heading " },
{ "head", 1, 1, 0, 0, 0, "document head " },
{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
{ "html", 1, 1, 0, 0, 0, "document root element " },
{ "i", 0, 0, 0, 0, 0, "italic text style" },
{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
{ "img", 0, 2, 1, 0, 0, "embedded image " },
{ "input", 0, 2, 1, 0, 0, "form control " },
{ "ins", 0, 0, 0, 0, 0, "inserted text" },
{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
{ "label", 0, 0, 0, 0, 0, "form field label text " },
{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
{ "li", 0, 1, 0, 0, 0, "list item " },
{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
{ "map", 0, 0, 0, 0, 0, "client-side image map " },
{ "menu", 0, 0, 0, 1, 1, "menu list " },
{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
{ "ol", 0, 0, 0, 0, 0, "ordered list " },
{ "optgroup", 0, 0, 0, 0, 0, "option group " },
{ "option", 0, 1, 0, 0, 0, "selectable choice " },
{ "p", 0, 1, 0, 0, 0, "paragraph " },
{ "param", 0, 2, 1, 0, 0, "named property value " },
{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
{ "script", 0, 0, 0, 0, 0, "script statements " },
{ "select", 0, 0, 0, 0, 0, "option selector " },
{ "small", 0, 0, 0, 0, 0, "small text style" },
{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
{ "style", 0, 0, 0, 0, 0, "style info " },
{ "sub", 0, 0, 0, 0, 0, "subscript" },
{ "sup", 0, 0, 0, 0, 0, "superscript " },
{ "table", 0, 0, 0, 0, 0, "&#160;" },
{ "tbody", 1, 1, 0, 0, 0, "table body " },
{ "td", 0, 1, 0, 0, 0, "table data cell" },
{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
{ "th", 0, 1, 0, 0, 0, "table header cell" },
{ "thead", 0, 1, 0, 0, 0, "table header " },
{ "title", 0, 0, 0, 0, 0, "document title " },
{ "tr", 0, 1, 0, 0, 0, "table row " },
{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
{ "u", 0, 0, 0, 1, 1, "underlined text style" },
{ "ul", 0, 0, 0, 0, 0, "unordered list " },
{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
};
266
/*
 * Start tags that imply the end of the current element.
 * The array is a sequence of NULL-terminated groups (one group per line
 * below), closed by one extra NULL: any tag of a group implies the end of
 * the current element if the type of that element is in the same group.
 */
char *htmlEquEnd[] = {
"dt", "dd", "li", "option", NULL,
"h1", "h2", "h3", "h4", "h5", "h6", NULL,
"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
NULL
};
/*
 * According to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
283
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups, closed by one extra
 * NULL. In each group the first string is the name of a start tag and the
 * following strings are the names of the open elements which that start tag
 * closes implicitly (this is how htmlCheckAutoClose() reads it).
 */
char *htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
 "dl", "ul", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
 "pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
 "xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div", "p", "head", NULL,
"noscript", "p", "head", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
 "listing", "xmp", "a", NULL,
"th", "th", "td", NULL,
"td", "th", "td", "p", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
 "pre", "listing", "xmp", "a", NULL,
NULL
};

/*
 * htmlStartCloseIndex holds one pointer per group of htmlStartClose (the
 * address of the group's first string); unused slots are NULL. It is filled
 * by htmlInitAutoClose().
 */
static char** htmlStartCloseIndex[100];
/* Tested by htmlInitAutoClose() and htmlCheckAutoClose() before indexing. */
static int htmlStartCloseIndexinitialized = 0;
345
346/************************************************************************
347 * *
348 * functions to handle HTML specific data *
349 * *
350 ************************************************************************/
351
352/**
353 * htmlInitAutoClose:
354 *
355 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
356 *
357 */
358void
359htmlInitAutoClose(void) {
360 int index, i = 0;
361
362 if (htmlStartCloseIndexinitialized) return;
363
364 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
365 index = 0;
366 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
367 htmlStartCloseIndex[index++] = &htmlStartClose[i];
368 while (htmlStartClose[i] != NULL) i++;
369 i++;
370 }
371}
372
373/**
374 * htmlTagLookup:
375 * @tag: The tag name
376 *
377 * Lookup the HTML tag in the ElementTable
378 *
379 * Returns the related htmlElemDescPtr or NULL if not found.
380 */
381htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000382htmlTagLookup(const xmlChar *tag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000383 int i = 0;
384
385 for (i = 0; i < (sizeof(html40ElementTable) /
386 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000387 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000388 return(&html40ElementTable[i]);
389 }
390 return(NULL);
391}
392
393/**
394 * htmlCheckAutoClose:
395 * @new: The new tag name
396 * @old: The old tag name
397 *
398 * Checks wether the new tag is one of the registered valid tags for closing old.
399 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
400 *
401 * Returns 0 if no, 1 if yes.
402 */
403int
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000404htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000405 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000406 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000407
408 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
409
410 /* inefficient, but not a big deal */
411 for (index = 0; index < 100;index++) {
412 close = htmlStartCloseIndex[index];
413 if (close == NULL) return(0);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000414 if (!xmlStrcmp(BAD_CAST *close, new)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000415 }
416
417 i = close - htmlStartClose;
418 i++;
419 while (htmlStartClose[i] != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000420 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000421 return(1);
422 }
423 i++;
424 }
425 return(0);
426}
427
428/**
429 * htmlAutoClose:
430 * @ctxt: an HTML parser context
431 * @new: The new tag name
432 *
433 * The HTmL DtD allows a tag to implicitely close other tags.
434 * The list is kept in htmlStartClose array. This function is
435 * called when a new tag has been detected and generates the
436 * appropriates closes if possible/needed.
437 */
438void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000439htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000440 xmlChar *oldname;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000441 while ((ctxt->name != NULL) &&
442 (htmlCheckAutoClose(new, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000443#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000444 fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000445#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000446 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000447 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000448 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000449 if (oldname != NULL) {
450#ifdef DEBUG
451 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
452#endif
453 xmlFree(oldname);
454 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000455 }
456}
457
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000458/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000459 * htmlAutoCloseTag:
460 * @doc: the HTML document
461 * @name: The tag name
462 * @elem: the HTML element
463 *
464 * The HTmL DtD allows a tag to implicitely close other tags.
465 * The list is kept in htmlStartClose array. This function checks
466 * if the element or one of it's children would autoclose the
467 * given tag.
468 *
469 * Returns 1 if autoclose, 0 otherwise
470 */
471int
472htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
473 htmlNodePtr child;
474
475 if (elem == NULL) return(1);
476 if (!xmlStrcmp(name, elem->name)) return(0);
477 if (htmlCheckAutoClose(elem->name, name)) return(1);
478 child = elem->childs;
479 while (child != NULL) {
480 if (htmlAutoCloseTag(doc, name, child)) return(1);
481 child = child->next;
482 }
483 return(0);
484}
485
486/**
487 * htmlIsAutoClosed:
488 * @doc: the HTML document
489 * @elem: the HTML element
490 *
491 * The HTmL DtD allows a tag to implicitely close other tags.
492 * The list is kept in htmlStartClose array. This function checks
493 * if a tag is autoclosed by one of it's child
494 *
495 * Returns 1 if autoclosed, 0 otherwise
496 */
497int
498htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
499 htmlNodePtr child;
500
501 if (elem == NULL) return(1);
502 child = elem->childs;
503 while (child != NULL) {
504 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
505 child = child->next;
506 }
507 return(0);
508}
509
510/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000511 * htmlAutoCloseOnClose:
512 * @ctxt: an HTML parser context
513 * @new: The new tag name
514 *
515 * The HTmL DtD allows an ending tag to implicitely close other tags.
516 */
517void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000518htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000519 htmlElemDescPtr info;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000520 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000521 int i;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000522
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000523#ifdef DEBUG
524 fprintf(stderr,"Close of %s stack: %d elements\n", new, ctxt->nameNr);
525 for (i = 0;i < ctxt->nameNr;i++)
526 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
527#endif
528
529 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
530 if (!xmlStrcmp(new, ctxt->nameTab[i])) break;
531 }
532 if (i < 0) return;
533
534 while (xmlStrcmp(new, ctxt->name)) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000535 info = htmlTagLookup(ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000536 if ((info == NULL) || (info->endTag == 1)) {
537#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000538 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000539#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000540 } else {
541 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
542 ctxt->sax->error(ctxt->userData,
543 "Opening and ending tag mismatch: %s and %s\n",
544 new, ctxt->name);
545 ctxt->wellFormed = 0;
546 }
547 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
548 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000549 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000550 if (oldname != NULL) {
551#ifdef DEBUG
552 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
553#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000554 xmlFree(oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000555 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000556 }
557}
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000558
559/************************************************************************
560 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000561 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000562 * *
563 ************************************************************************/
564
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000565
566htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000567/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000568 * the 4 absolute ones,
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000569 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000570{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
571{ 38, "amp", "ampersand, U+0026 ISOnum" },
572{ 60, "lt", "less-than sign, U+003C ISOnum" },
573{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000574
575/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000576 * A bunch still in the 128-255 range
577 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000578 */
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000579{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000580{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
581{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
582{ 162, "cent", "cent sign, U+00A2 ISOnum" },
583{ 163, "pound","pound sign, U+00A3 ISOnum" },
584{ 164, "curren","currency sign, U+00A4 ISOnum" },
585{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
586{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
587{ 167, "sect", "section sign, U+00A7 ISOnum" },
588{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
589{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
590{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
591{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
592{ 172, "not", "not sign, U+00AC ISOnum" },
593{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
594{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
595{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
596{ 176, "deg", "degree sign, U+00B0 ISOnum" },
597{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
598{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
599{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
600{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
601{ 181, "micro","micro sign, U+00B5 ISOnum" },
602{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000603{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000604{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
605{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
606{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000607{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000608{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
609{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
610{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
611{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
612{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
613{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
614{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
615{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
616{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
617{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
618{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
619{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
620{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
621{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
622{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
623{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
624{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
625{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
626{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
627{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
628{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
629{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
630{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
631{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
632{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
633{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
634{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
635{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000636{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000637{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
638{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
639{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
640{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
641{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
642{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
643{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
644{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
645{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
646{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
647{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
648{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
649{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
650{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
651{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
652{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
653{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
654{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
655{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
656{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
657{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
658{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
659{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
660{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
661{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
662{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
663{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
664{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
665{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
666{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
667{ 247, "divide","division sign, U+00F7 ISOnum" },
668{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
669{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
670{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
671{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
672{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
673{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
674{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
675{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000676
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000677/*
678 * Anything below should really be kept as entities references
679 */
680{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000681
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000682{ 913, "Alpha","greek capital letter alpha, U+0391" },
683{ 914, "Beta", "greek capital letter beta, U+0392" },
684{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
685{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
686{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
687{ 918, "Zeta", "greek capital letter zeta, U+0396" },
688{ 919, "Eta", "greek capital letter eta, U+0397" },
689{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
690{ 921, "Iota", "greek capital letter iota, U+0399" },
691{ 922, "Kappa","greek capital letter kappa, U+039A" },
692{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
693{ 924, "Mu", "greek capital letter mu, U+039C" },
694{ 925, "Nu", "greek capital letter nu, U+039D" },
695{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
696{ 927, "Omicron","greek capital letter omicron, U+039F" },
697{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
698{ 929, "Rho", "greek capital letter rho, U+03A1" },
699{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
700{ 932, "Tau", "greek capital letter tau, U+03A4" },
701{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
702{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
703{ 935, "Chi", "greek capital letter chi, U+03A7" },
704{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
705{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000706
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000707{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
708{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
709{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
710{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
711{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
712{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
713{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
714{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
715{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
716{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
717{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
718{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
719{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
720{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
721{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
722{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
723{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
724{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
725{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
726{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
727{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
728{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
729{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
730{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
731{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
732{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
733{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
734{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000735
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000736{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
737{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
738{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
739{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
740{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
741{ 8260, "frasl","fraction slash, U+2044 NEW" },
742
Daniel Veillardb05deb71999-08-10 19:04:08 +0000743{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000744{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
745{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
746{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
747{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
748{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
749{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
750{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
751{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
752{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
753{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
754{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
755{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
756{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
757{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
758{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
759
760
761{ 8704, "forall","for all, U+2200 ISOtech" },
762{ 8706, "part", "partial differential, U+2202 ISOtech" },
763{ 8707, "exist","there exists, U+2203 ISOtech" },
764{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
765{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
766{ 8712, "isin", "element of, U+2208 ISOtech" },
767{ 8713, "notin","not an element of, U+2209 ISOtech" },
768{ 8715, "ni", "contains as member, U+220B ISOtech" },
769{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
770{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
771{ 8722, "minus","minus sign, U+2212 ISOtech" },
772{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
773{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
774{ 8733, "prop", "proportional to, U+221D ISOtech" },
775{ 8734, "infin","infinity, U+221E ISOtech" },
776{ 8736, "ang", "angle, U+2220 ISOamso" },
777{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
778{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
779{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
780{ 8746, "cup", "union = cup, U+222A ISOtech" },
781{ 8747, "int", "integral, U+222B ISOtech" },
782{ 8756, "there4","therefore, U+2234 ISOtech" },
783{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
784{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
785{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
786{ 8800, "ne", "not equal to, U+2260 ISOtech" },
787{ 8801, "equiv","identical to, U+2261 ISOtech" },
788{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
789{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
790{ 8834, "sub", "subset of, U+2282 ISOtech" },
791{ 8835, "sup", "superset of, U+2283 ISOtech" },
792{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
793{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
794{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
795{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
796{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
797{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
798{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
799{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
800{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
801{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
802{ 8971, "rfloor","right floor, U+230B ISOamsc" },
803{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
804{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
805{ 9674, "loz", "lozenge, U+25CA ISOpub" },
806
807{ 9824, "spades","black spade suit, U+2660 ISOpub" },
808{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
809{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
810{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
811
812{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
813{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
814{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
815{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
816{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
817{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
818{ 732, "tilde","small tilde, U+02DC ISOdia" },
819
820{ 8194, "ensp", "en space, U+2002 ISOpub" },
821{ 8195, "emsp", "em space, U+2003 ISOpub" },
822{ 8201, "thinsp","thin space, U+2009 ISOpub" },
823{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
824{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
825{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
826{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
827{ 8211, "ndash","en dash, U+2013 ISOpub" },
828{ 8212, "mdash","em dash, U+2014 ISOpub" },
829{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
830{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
831{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
832{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
833{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
834{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
835{ 8224, "dagger","dagger, U+2020 ISOpub" },
836{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
837{ 8240, "permil","per mille sign, U+2030 ISOtech" },
838{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000839{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000840{ 8364, "euro", "euro sign, U+20AC NEW" }
841};
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000842
843/************************************************************************
844 * *
845 * Commodity functions to handle entities *
846 * *
847 ************************************************************************/
848
/*
 * Macro used to grow the current buffer.
 *
 * It expects two variables in the enclosing scope: a pointer named after
 * the macro argument and an integer named <argument>_size holding its
 * capacity in xmlChar units. The capacity is doubled and the storage is
 * reallocated accordingly.
 *
 * The result of xmlRealloc() is kept in a temporary so that the previous
 * storage is not lost when the reallocation fails: in that case the old
 * storage is handed to xmlFree() and the macro executes return(NULL) in
 * the *calling* function. It can therefore only be used inside functions
 * returning a pointer.
 *
 * Any pointer into the old storage must be recomputed by the caller after
 * the macro has been expanded.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *buffer##_grown;                                            \
    buffer##_size *= 2;                                                 \
    buffer##_grown = (xmlChar *)                                        \
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));            \
    if (buffer##_grown == NULL) {                                       \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = buffer##_grown;                                            \
}
860
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000861/**
862 * htmlEntityLookup:
863 * @name: the entity name
864 *
865 * Lookup the given entity in EntitiesTable
866 *
867 * TODO: the linear scan is really ugly, an hash table is really needed.
868 *
869 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
870 */
871htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000872htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000873 int i;
874
875 for (i = 0;i < (sizeof(html40EntitiesTable)/
876 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000877 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000878#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000879 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000880#endif
881 return(&html40EntitiesTable[i]);
882 }
883 }
884 return(NULL);
885}
886
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000887
888/**
889 * htmlDecodeEntities:
890 * @ctxt: the parser context
891 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000892 * @end: an end marker xmlChar, 0 if none
893 * @end2: an end marker xmlChar, 0 if none
894 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000895 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000896 * Subtitute the HTML entities by their value
897 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000898 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000899 *
900 * Returns A newly allocated string with the substitution done. The caller
901 * must deallocate it !
902 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000903xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000904htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000905 xmlChar end, xmlChar end2, xmlChar end3) {
906 xmlChar *buffer = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000907 int buffer_size = 0;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000908 xmlChar *out = NULL;
909 xmlChar *name = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000910
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000911 xmlChar *cur = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000912 htmlEntityDescPtr ent;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000913 int nbchars = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000914 unsigned int max = (unsigned int) len;
915
916 /*
917 * allocate a translation buffer.
918 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +0000919 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000920 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000921 if (buffer == NULL) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000922 perror("htmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000923 return(NULL);
924 }
925 out = buffer;
926
927 /*
928 * Ok loop until we reach one of the ending char or a size limit.
929 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000930 while ((nbchars < max) && (CUR != end) &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000931 (CUR != end2) && (CUR != end3)) {
932
933 if (CUR == '&') {
934 if (NXT(1) == '#') {
935 int val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000936 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000937 *out++ = val;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000938 nbchars += 3; /* !!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000939 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000940 ent = htmlParseEntityRef(ctxt, &name);
941 if (name != NULL) {
942 if ((ent == NULL) || (ent->value <= 0) ||
943 (ent->value >= 255)) {
944 *out++ = '&';
945 cur = name;
946 while (*cur != 0) {
947 if (out - buffer > buffer_size - 100) {
948 int index = out - buffer;
949
950 growBuffer(buffer);
951 out = &buffer[index];
952 }
953 *out++ = *cur++;
954 }
955 *out++ = ';';
956 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000957 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000958 *out++ = (xmlChar)ent->value;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000959 if (out - buffer > buffer_size - 100) {
960 int index = out - buffer;
961
962 growBuffer(buffer);
963 out = &buffer[index];
964 }
965 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000966 nbchars += 2 + xmlStrlen(name);
Daniel Veillard6454aec1999-09-02 22:04:43 +0000967 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000968 }
969 }
970 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000971 /* invalid for UTF-8 , use COPY(out); !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000972 *out++ = CUR;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000973 nbchars++;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000974 if (out - buffer > buffer_size - 100) {
975 int index = out - buffer;
976
977 growBuffer(buffer);
978 out = &buffer[index];
979 }
980 NEXT;
981 }
982 }
983 *out++ = 0;
984 return(buffer);
985}
986
987
988/************************************************************************
989 * *
990 * Commodity functions to handle encodings *
991 * *
992 ************************************************************************/
993
994/**
995 * htmlSwitchEncoding:
996 * @ctxt: the parser context
997 * @len: the len of @cur
998 *
999 * change the input functions when discovering the character encoding
1000 * of a given entity.
1001 *
1002 */
1003void
1004htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
1005{
1006 switch (enc) {
1007 case XML_CHAR_ENCODING_ERROR:
1008 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1009 ctxt->sax->error(ctxt->userData, "encoding unknown\n");
1010 ctxt->wellFormed = 0;
1011 break;
1012 case XML_CHAR_ENCODING_NONE:
1013 /* let's assume it's UTF-8 without the XML decl */
1014 return;
1015 case XML_CHAR_ENCODING_UTF8:
1016 /* default encoding, no conversion should be needed */
1017 return;
1018 case XML_CHAR_ENCODING_UTF16LE:
1019 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1020 ctxt->sax->error(ctxt->userData,
1021 "char encoding UTF16 little endian not supported\n");
1022 break;
1023 case XML_CHAR_ENCODING_UTF16BE:
1024 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1025 ctxt->sax->error(ctxt->userData,
1026 "char encoding UTF16 big endian not supported\n");
1027 break;
1028 case XML_CHAR_ENCODING_UCS4LE:
1029 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1030 ctxt->sax->error(ctxt->userData,
1031 "char encoding USC4 little endian not supported\n");
1032 break;
1033 case XML_CHAR_ENCODING_UCS4BE:
1034 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1035 ctxt->sax->error(ctxt->userData,
1036 "char encoding USC4 big endian not supported\n");
1037 break;
1038 case XML_CHAR_ENCODING_EBCDIC:
1039 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1040 ctxt->sax->error(ctxt->userData,
1041 "char encoding EBCDIC not supported\n");
1042 break;
1043 case XML_CHAR_ENCODING_UCS4_2143:
1044 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1045 ctxt->sax->error(ctxt->userData,
1046 "char encoding UCS4 2143 not supported\n");
1047 break;
1048 case XML_CHAR_ENCODING_UCS4_3412:
1049 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1050 ctxt->sax->error(ctxt->userData,
1051 "char encoding UCS4 3412 not supported\n");
1052 break;
1053 case XML_CHAR_ENCODING_UCS2:
1054 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1055 ctxt->sax->error(ctxt->userData,
1056 "char encoding UCS2 not supported\n");
1057 break;
1058 case XML_CHAR_ENCODING_8859_1:
1059 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1060 ctxt->sax->error(ctxt->userData,
1061 "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
1062 break;
1063 case XML_CHAR_ENCODING_8859_2:
1064 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1065 ctxt->sax->error(ctxt->userData,
1066 "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
1067 break;
1068 case XML_CHAR_ENCODING_8859_3:
1069 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1070 ctxt->sax->error(ctxt->userData,
1071 "char encoding ISO_8859_3 not supported\n");
1072 break;
1073 case XML_CHAR_ENCODING_8859_4:
1074 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1075 ctxt->sax->error(ctxt->userData,
1076 "char encoding ISO_8859_4 not supported\n");
1077 break;
1078 case XML_CHAR_ENCODING_8859_5:
1079 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1080 ctxt->sax->error(ctxt->userData,
1081 "char encoding ISO_8859_5 not supported\n");
1082 break;
1083 case XML_CHAR_ENCODING_8859_6:
1084 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1085 ctxt->sax->error(ctxt->userData,
1086 "char encoding ISO_8859_6 not supported\n");
1087 break;
1088 case XML_CHAR_ENCODING_8859_7:
1089 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1090 ctxt->sax->error(ctxt->userData,
1091 "char encoding ISO_8859_7 not supported\n");
1092 break;
1093 case XML_CHAR_ENCODING_8859_8:
1094 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1095 ctxt->sax->error(ctxt->userData,
1096 "char encoding ISO_8859_8 not supported\n");
1097 break;
1098 case XML_CHAR_ENCODING_8859_9:
1099 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1100 ctxt->sax->error(ctxt->userData,
1101 "char encoding ISO_8859_9 not supported\n");
1102 break;
1103 case XML_CHAR_ENCODING_2022_JP:
1104 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1105 ctxt->sax->error(ctxt->userData,
1106 "char encoding ISO-2022-JPnot supported\n");
1107 break;
1108 case XML_CHAR_ENCODING_SHIFT_JIS:
1109 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1110 ctxt->sax->error(ctxt->userData,
1111 "char encoding Shift_JISnot supported\n");
1112 break;
1113 case XML_CHAR_ENCODING_EUC_JP:
1114 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1115 ctxt->sax->error(ctxt->userData,
1116 "char encoding EUC-JPnot supported\n");
1117 break;
1118 }
1119}
1120
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001121/************************************************************************
1122 * *
1123 * Commodity functions to handle streams *
1124 * *
1125 ************************************************************************/
1126
1127/**
1128 * htmlFreeInputStream:
1129 * @input: an htmlParserInputPtr
1130 *
1131 * Free up an input stream.
1132 */
1133void
1134htmlFreeInputStream(htmlParserInputPtr input) {
1135 if (input == NULL) return;
1136
1137 if (input->filename != NULL) xmlFree((char *) input->filename);
1138 if (input->directory != NULL) xmlFree((char *) input->directory);
1139 if ((input->free != NULL) && (input->base != NULL))
1140 input->free((xmlChar *) input->base);
1141 if (input->buf != NULL)
1142 xmlFreeParserInputBuffer(input->buf);
1143 memset(input, -1, sizeof(htmlParserInput));
1144 xmlFree(input);
1145}
1146
1147/**
1148 * htmlNewInputStream:
1149 * @ctxt: an HTML parser context
1150 *
1151 * Create a new input stream structure
1152 * Returns the new input stream or NULL
1153 */
1154htmlParserInputPtr
1155htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1156 htmlParserInputPtr input;
1157
1158 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1159 if (input == NULL) {
1160 ctxt->errNo = XML_ERR_NO_MEMORY;
1161 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1162 ctxt->sax->error(ctxt->userData,
1163 "malloc: couldn't allocate a new input stream\n");
1164 ctxt->errNo = XML_ERR_NO_MEMORY;
1165 return(NULL);
1166 }
1167 input->filename = NULL;
1168 input->directory = NULL;
1169 input->base = NULL;
1170 input->cur = NULL;
1171 input->buf = NULL;
1172 input->line = 1;
1173 input->col = 1;
1174 input->buf = NULL;
1175 input->free = NULL;
1176 input->consumed = 0;
1177 input->length = 0;
1178 return(input);
1179}
1180
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001181
1182/************************************************************************
1183 * *
1184 * Commodity functions, cleanup needed ? *
1185 * *
1186 ************************************************************************/
1187
1188/**
1189 * areBlanks:
1190 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001191 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001192 * @len: the size of @str
1193 *
1194 * Is this a sequence of blank chars that one can ignore ?
1195 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001196 * Returns 1 if ignorable 0 otherwise.
1197 */
1198
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001199static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001200 int i;
1201 xmlNodePtr lastChild;
1202
1203 for (i = 0;i < len;i++)
1204 if (!(IS_BLANK(str[i]))) return(0);
1205
1206 if (CUR != '<') return(0);
1207 if (ctxt->node == NULL) return(0);
1208 lastChild = xmlGetLastChild(ctxt->node);
1209 if (lastChild == NULL) {
1210 if (ctxt->node->content != NULL) return(0);
1211 } else if (xmlNodeIsText(lastChild))
1212 return(0);
1213 return(1);
1214}
1215
1216/**
1217 * htmlHandleEntity:
1218 * @ctxt: an HTML parser context
1219 * @entity: an XML entity pointer.
1220 *
1221 * Default handling of an HTML entity, call the parser with the
1222 * substitution string
1223 */
1224
1225void
1226htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1227 int len;
1228
1229 if (entity->content == NULL) {
1230 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1231 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1232 entity->name);
1233 ctxt->wellFormed = 0;
1234 return;
1235 }
1236 len = xmlStrlen(entity->content);
1237
1238 /*
1239 * Just handle the content as a set of chars.
1240 */
1241 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1242 ctxt->sax->characters(ctxt->userData, entity->content, len);
1243
1244}
1245
1246/**
1247 * htmlNewDoc:
1248 * @URI: URI for the dtd, or NULL
1249 * @ExternalID: the external ID of the DTD, or NULL
1250 *
1251 * Returns a new document
1252 */
1253htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001254htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001255 xmlDocPtr cur;
1256
1257 /*
1258 * Allocate a new document and fill the fields.
1259 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001260 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001261 if (cur == NULL) {
1262 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1263 return(NULL);
1264 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001265 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001266
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001267 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001268 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001269 cur->intSubset = NULL;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001270 if ((ExternalID == NULL) &&
1271 (URI == NULL))
1272 xmlCreateIntSubset(cur, BAD_CAST "HTML",
1273 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1274 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1275 else
1276 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001277 cur->name = NULL;
1278 cur->root = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001279 cur->extSubset = NULL;
1280 cur->oldNs = NULL;
1281 cur->encoding = NULL;
1282 cur->standalone = 1;
1283 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001284 cur->ids = NULL;
1285 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001286#ifndef XML_WITHOUT_CORBA
1287 cur->_private = NULL;
1288 cur->vepv = NULL;
1289#endif
1290 return(cur);
1291}
1292
1293
1294/************************************************************************
1295 * *
1296 * The parser itself *
1297 * Relates to http://www.w3.org/TR/html40 *
1298 * *
1299 ************************************************************************/
1300
1301/************************************************************************
1302 * *
1303 * The parser itself *
1304 * *
1305 ************************************************************************/
1306
1307/**
1308 * htmlParseHTMLName:
1309 * @ctxt: an HTML parser context
1310 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001311 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001312 * since HTML names are not case-sensitive.
1313 *
1314 * Returns the Tag Name parsed or NULL
1315 */
1316
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001317xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001318htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001319 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001320 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001321 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001322
1323 if (!IS_LETTER(CUR) && (CUR != '_') &&
1324 (CUR != ':')) return(NULL);
1325
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001326 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1327 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001328 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001329 else loc[i] = CUR;
1330 i++;
1331
1332 NEXT;
1333 }
1334
1335 ret = xmlStrndup(loc, i);
1336
1337 return(ret);
1338}
1339
1340/**
1341 * htmlParseName:
1342 * @ctxt: an HTML parser context
1343 *
1344 * parse an HTML name, this routine is case sensistive.
1345 *
1346 * Returns the Name parsed or NULL
1347 */
1348
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001349xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001350htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001351 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001352 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001353
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001354 GROW;
1355 if (!IS_LETTER(CUR) && (CUR != '_')) {
1356 return(NULL);
1357 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001358
1359 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1360 (CUR == '.') || (CUR == '-') ||
1361 (CUR == '_') || (CUR == ':') ||
1362 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001363 (IS_EXTENDER(CUR))) {
1364 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001365 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001366 if (len >= HTML_MAX_NAMELEN) {
1367 fprintf(stderr,
1368 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1369 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1370 (CUR == '.') || (CUR == '-') ||
1371 (CUR == '_') || (CUR == ':') ||
1372 (IS_COMBINING(CUR)) ||
1373 (IS_EXTENDER(CUR)))
1374 NEXT;
1375 break;
1376 }
1377 }
1378 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001379}
1380
1381/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001382 * htmlParseHTMLAttribute:
1383 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001384 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001385 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001386 * parse an HTML attribute value till the stop (quote), if
1387 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001388 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001389 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001390 */
1391
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001392xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001393htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001394#if 0
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001395 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001396 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001397
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001398 GROW;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001399 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1400 if ((stop == 0) && (IS_BLANK(CUR))) break;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001401 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001402 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001403 if (len >= HTML_MAX_NAMELEN) {
1404 fprintf(stderr,
1405 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1406 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001407 (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001408 (CUR != '\'') && (CUR != '"'))
1409 NEXT;
1410 break;
1411 }
1412 }
1413 return(xmlStrndup(buf, len));
Daniel Veillard71b656e2000-01-05 14:46:17 +00001414#else
1415 xmlChar *buffer = NULL;
1416 int buffer_size = 0;
1417 xmlChar *out = NULL;
1418 xmlChar *name = NULL;
1419
1420 xmlChar *cur = NULL;
1421 htmlEntityDescPtr ent;
1422
1423 /*
1424 * allocate a translation buffer.
1425 */
1426 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1427 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1428 if (buffer == NULL) {
1429 perror("htmlParseHTMLAttribute: malloc failed");
1430 return(NULL);
1431 }
1432 out = buffer;
1433
1434 /*
1435 * Ok loop until we reach one of the ending chars
1436 */
1437 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1438 if ((stop == 0) && (IS_BLANK(CUR))) break;
1439 if (CUR == '&') {
1440 if (NXT(1) == '#') {
1441 int val = htmlParseCharRef(ctxt);
1442 *out++ = val;
1443 } else {
1444 ent = htmlParseEntityRef(ctxt, &name);
1445 if (name == NULL) {
1446 *out++ = '&';
1447 if (out - buffer > buffer_size - 100) {
1448 int index = out - buffer;
1449
1450 growBuffer(buffer);
1451 out = &buffer[index];
1452 }
1453 } else if ((ent == NULL) || (ent->value <= 0) ||
1454 (ent->value >= 255)) {
1455 *out++ = '&';
1456 cur = name;
1457 while (*cur != 0) {
1458 if (out - buffer > buffer_size - 100) {
1459 int index = out - buffer;
1460
1461 growBuffer(buffer);
1462 out = &buffer[index];
1463 }
1464 *out++ = *cur++;
1465 }
1466 xmlFree(name);
1467 } else {
1468 *out++ = ent->value;
1469 if (out - buffer > buffer_size - 100) {
1470 int index = out - buffer;
1471
1472 growBuffer(buffer);
1473 out = &buffer[index];
1474 }
1475 xmlFree(name);
1476 }
1477 }
1478 } else {
1479 *out++ = CUR;
1480 if (out - buffer > buffer_size - 100) {
1481 int index = out - buffer;
1482
1483 growBuffer(buffer);
1484 out = &buffer[index];
1485 }
1486 NEXT;
1487 }
1488 }
1489 *out++ = 0;
1490 return(buffer);
1491#endif
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001492}
1493
1494/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001495 * htmlParseNmtoken:
1496 * @ctxt: an HTML parser context
1497 *
1498 * parse an HTML Nmtoken.
1499 *
1500 * Returns the Nmtoken parsed or NULL
1501 */
1502
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001503xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001504htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001505 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001506 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001507
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001508 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001509 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1510 (CUR == '.') || (CUR == '-') ||
1511 (CUR == '_') || (CUR == ':') ||
1512 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001513 (IS_EXTENDER(CUR))) {
1514 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001515 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001516 if (len >= HTML_MAX_NAMELEN) {
1517 fprintf(stderr,
1518 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1519 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1520 (CUR == '.') || (CUR == '-') ||
1521 (CUR == '_') || (CUR == ':') ||
1522 (IS_COMBINING(CUR)) ||
1523 (IS_EXTENDER(CUR)))
1524 NEXT;
1525 break;
1526 }
1527 }
1528 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001529}
1530
1531/**
1532 * htmlParseEntityRef:
1533 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001534 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001535 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001536 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001537 *
1538 * [68] EntityRef ::= '&' Name ';'
1539 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001540 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1541 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001542 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001543htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001544htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1545 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001546 htmlEntityDescPtr ent = NULL;
1547 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001548
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001549 if (CUR == '&') {
1550 NEXT;
1551 name = htmlParseName(ctxt);
1552 if (name == NULL) {
1553 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1554 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1555 ctxt->wellFormed = 0;
1556 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001557 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001558 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001559 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001560
1561 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001562 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001563 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001564 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00001565 if (ent != NULL) /* OK that's ugly !!! */
1566 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001567 } else {
1568 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1569 ctxt->sax->error(ctxt->userData,
1570 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00001571 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001572 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001573 }
1574 }
1575 return(ent);
1576}
1577
1578/**
1579 * htmlParseAttValue:
1580 * @ctxt: an HTML parser context
1581 *
1582 * parse a value for an attribute
1583 * Note: the parser won't do substitution of entities here, this
1584 * will be handled later in xmlStringGetNodeList, unless it was
1585 * asked for ctxt->replaceEntities != 0
1586 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001587 * Returns the AttValue parsed or NULL.
1588 */
1589
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001590xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001591htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001592 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001593
1594 if (CUR == '"') {
1595 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001596 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001597 if (CUR != '"') {
1598 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1599 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1600 ctxt->wellFormed = 0;
1601 } else
1602 NEXT;
1603 } else if (CUR == '\'') {
1604 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001605 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001606 if (CUR != '\'') {
1607 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1608 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1609 ctxt->wellFormed = 0;
1610 } else
1611 NEXT;
1612 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001613 /*
1614 * That's an HTMLism, the attribute value may not be quoted
1615 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001616 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001617 if (ret == NULL) {
1618 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1619 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1620 ctxt->wellFormed = 0;
1621 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001622 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001623 return(ret);
1624}
1625
1626/**
1627 * htmlParseSystemLiteral:
1628 * @ctxt: an HTML parser context
1629 *
1630 * parse an HTML Literal
1631 *
1632 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1633 *
1634 * Returns the SystemLiteral parsed or NULL
1635 */
1636
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001637xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001638htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001639 const xmlChar *q;
1640 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001641
1642 if (CUR == '"') {
1643 NEXT;
1644 q = CUR_PTR;
1645 while ((IS_CHAR(CUR)) && (CUR != '"'))
1646 NEXT;
1647 if (!IS_CHAR(CUR)) {
1648 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1649 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1650 ctxt->wellFormed = 0;
1651 } else {
1652 ret = xmlStrndup(q, CUR_PTR - q);
1653 NEXT;
1654 }
1655 } else if (CUR == '\'') {
1656 NEXT;
1657 q = CUR_PTR;
1658 while ((IS_CHAR(CUR)) && (CUR != '\''))
1659 NEXT;
1660 if (!IS_CHAR(CUR)) {
1661 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1662 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1663 ctxt->wellFormed = 0;
1664 } else {
1665 ret = xmlStrndup(q, CUR_PTR - q);
1666 NEXT;
1667 }
1668 } else {
1669 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1670 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1671 ctxt->wellFormed = 0;
1672 }
1673
1674 return(ret);
1675}
1676
1677/**
1678 * htmlParsePubidLiteral:
1679 * @ctxt: an HTML parser context
1680 *
1681 * parse an HTML public literal
1682 *
1683 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1684 *
1685 * Returns the PubidLiteral parsed or NULL.
1686 */
1687
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001688xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001689htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001690 const xmlChar *q;
1691 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001692 /*
1693 * Name ::= (Letter | '_') (NameChar)*
1694 */
1695 if (CUR == '"') {
1696 NEXT;
1697 q = CUR_PTR;
1698 while (IS_PUBIDCHAR(CUR)) NEXT;
1699 if (CUR != '"') {
1700 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1701 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1702 ctxt->wellFormed = 0;
1703 } else {
1704 ret = xmlStrndup(q, CUR_PTR - q);
1705 NEXT;
1706 }
1707 } else if (CUR == '\'') {
1708 NEXT;
1709 q = CUR_PTR;
1710 while ((IS_LETTER(CUR)) && (CUR != '\''))
1711 NEXT;
1712 if (!IS_LETTER(CUR)) {
1713 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1714 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1715 ctxt->wellFormed = 0;
1716 } else {
1717 ret = xmlStrndup(q, CUR_PTR - q);
1718 NEXT;
1719 }
1720 } else {
1721 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1722 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1723 ctxt->wellFormed = 0;
1724 }
1725
1726 return(ret);
1727}
1728
1729/**
1730 * htmlParseCharData:
1731 * @ctxt: an HTML parser context
1732 * @cdata: int indicating whether we are within a CDATA section
1733 *
1734 * parse a CharData section.
1735 * if we are within a CDATA section ']]>' marks an end of section.
1736 *
1737 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1738 */
1739
1740void
1741htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001742 xmlChar *buf = NULL;
1743 int len = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001744 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001745 xmlChar q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001746
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001747 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1748 if (buf == NULL) {
1749 fprintf(stderr, "malloc of %d byte failed\n", size);
1750 return;
1751 }
1752
1753 q = CUR;
1754 while ((IS_CHAR(q)) && (q != '<') &&
1755 (q != '&')) {
1756 if ((q == ']') && (NXT(1) == ']') &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001757 (NXT(2) == '>')) {
1758 if (cdata) break;
1759 else {
1760 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1761 ctxt->sax->error(ctxt->userData,
1762 "Sequence ']]>' not allowed in content\n");
1763 ctxt->wellFormed = 0;
1764 }
1765 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001766 if (len + 1 >= size) {
1767 size *= 2;
1768 buf = xmlRealloc(buf, size * sizeof(xmlChar));
1769 if (buf == NULL) {
1770 fprintf(stderr, "realloc of %d byte failed\n", size);
1771 return;
1772 }
1773 }
1774 buf[len++] = q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001775 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001776 q = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001777 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001778 if (len == 0) {
1779 xmlFree(buf);
1780 return;
1781 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001782
1783 /*
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001784 * Ok the buffer is to be consumed as chars.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001785 */
1786 if (ctxt->sax != NULL) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001787 if (areBlanks(ctxt, buf, len)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001788 if (ctxt->sax->ignorableWhitespace != NULL)
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001789 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001790 } else {
1791 if (ctxt->sax->characters != NULL)
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001792 ctxt->sax->characters(ctxt->userData, buf, len);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001793 }
1794 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001795 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001796}
1797
1798/**
1799 * htmlParseExternalID:
1800 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001801 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001802 * @strict: indicate whether we should restrict parsing to only
1803 * production [75], see NOTE below
1804 *
1805 * Parse an External ID or a Public ID
1806 *
1807 * NOTE: Productions [75] and [83] interract badly since [75] can generate
1808 * 'PUBLIC' S PubidLiteral S SystemLiteral
1809 *
1810 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1811 * | 'PUBLIC' S PubidLiteral S SystemLiteral
1812 *
1813 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1814 *
1815 * Returns the function returns SystemLiteral and in the second
1816 * case publicID receives PubidLiteral, is strict is off
1817 * it is possible to return NULL and have publicID set.
1818 */
1819
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001820xmlChar *
1821htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1822 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001823
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001824 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1825 (UPP(2) == 'S') && (UPP(3) == 'T') &&
1826 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001827 SKIP(6);
1828 if (!IS_BLANK(CUR)) {
1829 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1830 ctxt->sax->error(ctxt->userData,
1831 "Space required after 'SYSTEM'\n");
1832 ctxt->wellFormed = 0;
1833 }
1834 SKIP_BLANKS;
1835 URI = htmlParseSystemLiteral(ctxt);
1836 if (URI == NULL) {
1837 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1838 ctxt->sax->error(ctxt->userData,
1839 "htmlParseExternalID: SYSTEM, no URI\n");
1840 ctxt->wellFormed = 0;
1841 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001842 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1843 (UPP(2) == 'B') && (UPP(3) == 'L') &&
1844 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001845 SKIP(6);
1846 if (!IS_BLANK(CUR)) {
1847 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1848 ctxt->sax->error(ctxt->userData,
1849 "Space required after 'PUBLIC'\n");
1850 ctxt->wellFormed = 0;
1851 }
1852 SKIP_BLANKS;
1853 *publicID = htmlParsePubidLiteral(ctxt);
1854 if (*publicID == NULL) {
1855 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1856 ctxt->sax->error(ctxt->userData,
1857 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1858 ctxt->wellFormed = 0;
1859 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001860 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001861 if ((CUR == '"') || (CUR == '\'')) {
1862 URI = htmlParseSystemLiteral(ctxt);
1863 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001864 }
1865 return(URI);
1866}
1867
1868/**
1869 * htmlParseComment:
1870 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001871 *
1872 * Parse an XML (SGML) comment <!-- .... -->
1873 *
1874 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1875 */
1876void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001877htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001878 xmlChar *buf = NULL;
1879 int len = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001880 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001881 register xmlChar s, r, q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001882
1883 /*
1884 * Check that there is a comment right here.
1885 */
1886 if ((CUR != '<') || (NXT(1) != '!') ||
1887 (NXT(2) != '-') || (NXT(3) != '-')) return;
1888
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001889 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1890 if (buf == NULL) {
1891 fprintf(stderr, "malloc of %d byte failed\n", size);
1892 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001893 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001894 q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
1895 SKIP(4);
1896 s = CUR;
1897
1898 while (IS_CHAR(s) &&
1899 ((s != '>') || (r != '-') || (q != '-'))) {
1900 if (len + 1 >= size) {
1901 size *= 2;
1902 buf = xmlRealloc(buf, size * sizeof(xmlChar));
1903 if (buf == NULL) {
1904 fprintf(stderr, "realloc of %d byte failed\n", size);
1905 return;
1906 }
1907 }
1908 buf[len++] = s;
1909 NEXT;
1910 q = r;
1911 r = s;
1912 s = CUR;
1913 }
1914 buf[len - 2] = 0;
1915 if (!IS_CHAR(s)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001916 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001917 ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001918 ctxt->wellFormed = 0;
1919 } else {
1920 NEXT;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001921 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
1922 ctxt->sax->comment(ctxt->userData, buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001923 }
1924 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001925 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001926}
1927
1928/**
1929 * htmlParseCharRef:
1930 * @ctxt: an HTML parser context
1931 *
1932 * parse Reference declarations
1933 *
1934 * [66] CharRef ::= '&#' [0-9]+ ';' |
1935 * '&#x' [0-9a-fA-F]+ ';'
1936 *
1937 * Returns the value parsed (as an int)
1938 */
1939int
1940htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1941 int val = 0;
1942
1943 if ((CUR == '&') && (NXT(1) == '#') &&
1944 (NXT(2) == 'x')) {
1945 SKIP(3);
1946 while (CUR != ';') {
1947 if ((CUR >= '0') && (CUR <= '9'))
1948 val = val * 16 + (CUR - '0');
1949 else if ((CUR >= 'a') && (CUR <= 'f'))
1950 val = val * 16 + (CUR - 'a') + 10;
1951 else if ((CUR >= 'A') && (CUR <= 'F'))
1952 val = val * 16 + (CUR - 'A') + 10;
1953 else {
1954 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1955 ctxt->sax->error(ctxt->userData,
1956 "htmlParseCharRef: invalid hexadecimal value\n");
1957 ctxt->wellFormed = 0;
1958 val = 0;
1959 break;
1960 }
1961 NEXT;
1962 }
1963 if (CUR == ';')
1964 NEXT;
1965 } else if ((CUR == '&') && (NXT(1) == '#')) {
1966 SKIP(2);
1967 while (CUR != ';') {
1968 if ((CUR >= '0') && (CUR <= '9'))
1969 val = val * 10 + (CUR - '0');
1970 else {
1971 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1972 ctxt->sax->error(ctxt->userData,
1973 "htmlParseCharRef: invalid decimal value\n");
1974 ctxt->wellFormed = 0;
1975 val = 0;
1976 break;
1977 }
1978 NEXT;
1979 }
1980 if (CUR == ';')
1981 NEXT;
1982 } else {
1983 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1984 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
1985 ctxt->wellFormed = 0;
1986 }
1987 /*
1988 * Check the value IS_CHAR ...
1989 */
1990 if (IS_CHAR(val)) {
1991 return(val);
1992 } else {
1993 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001994 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001995 val);
1996 ctxt->wellFormed = 0;
1997 }
1998 return(0);
1999}
2000
2001
2002/**
2003 * htmlParseDocTypeDecl :
2004 * @ctxt: an HTML parser context
2005 *
2006 * parse a DOCTYPE declaration
2007 *
2008 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2009 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2010 */
2011
2012void
2013htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002014 xmlChar *name;
2015 xmlChar *ExternalID = NULL;
2016 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002017
2018 /*
2019 * We know that '<!DOCTYPE' has been detected.
2020 */
2021 SKIP(9);
2022
2023 SKIP_BLANKS;
2024
2025 /*
2026 * Parse the DOCTYPE name.
2027 */
2028 name = htmlParseName(ctxt);
2029 if (name == NULL) {
2030 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2031 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2032 ctxt->wellFormed = 0;
2033 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002034 /*
2035 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2036 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002037
2038 SKIP_BLANKS;
2039
2040 /*
2041 * Check for SystemID and ExternalID
2042 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002043 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002044 SKIP_BLANKS;
2045
2046 /*
2047 * We should be at the end of the DOCTYPE declaration.
2048 */
2049 if (CUR != '>') {
2050 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2051 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2052 ctxt->wellFormed = 0;
2053 /* We shouldn't try to resynchronize ... */
2054 } else {
2055 }
2056 NEXT;
2057
2058 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002059 * Create the document accordingly to the DOCTYPE
2060 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002061 if (ctxt->myDoc != NULL)
2062 xmlFreeDoc(ctxt->myDoc);
2063
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002064 ctxt->myDoc = htmlNewDoc(URI, ExternalID);
2065
2066 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002067 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002068 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002069 if (URI != NULL) xmlFree(URI);
2070 if (ExternalID != NULL) xmlFree(ExternalID);
2071 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002072}
2073
2074/**
2075 * htmlParseAttribute:
2076 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002077 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002078 *
2079 * parse an attribute
2080 *
2081 * [41] Attribute ::= Name Eq AttValue
2082 *
2083 * [25] Eq ::= S? '=' S?
2084 *
2085 * With namespace:
2086 *
2087 * [NS 11] Attribute ::= QName Eq AttValue
2088 *
2089 * Also the case QName == xmlns:??? is handled independently as a namespace
2090 * definition.
2091 *
2092 * Returns the attribute name, and the value in *value.
2093 */
2094
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002095xmlChar *
2096htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002097 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002098
2099 *value = NULL;
2100 name = htmlParseName(ctxt);
2101 if (name == NULL) {
2102 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2103 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2104 ctxt->wellFormed = 0;
2105 return(NULL);
2106 }
2107
2108 /*
2109 * read the value
2110 */
2111 SKIP_BLANKS;
2112 if (CUR == '=') {
2113 NEXT;
2114 SKIP_BLANKS;
2115 val = htmlParseAttValue(ctxt);
2116 } else {
Daniel Veillard4a53eca1999-12-12 13:03:50 +00002117 /* TODO : some attribute must have values, some may not */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002119 ctxt->sax->warning(ctxt->userData,
2120 "No value for attribute %s\n", name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002121 }
2122
2123 *value = val;
2124 return(name);
2125}
2126
2127/**
2128 * htmlParseStartTag:
2129 * @ctxt: an HTML parser context
2130 *
2131 * parse a start of tag either for rule element or
2132 * EmptyElement. In both case we don't parse the tag closing chars.
2133 *
2134 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2135 *
2136 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2137 *
2138 * With namespace:
2139 *
2140 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2141 *
2142 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2143 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002144 */
2145
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002146void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002147htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002148 xmlChar *name;
2149 xmlChar *attname;
2150 xmlChar *attvalue;
2151 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002152 int nbatts = 0;
2153 int maxatts = 0;
2154 int i;
2155
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002156 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002157 NEXT;
2158
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002159 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002160 name = htmlParseHTMLName(ctxt);
2161 if (name == NULL) {
2162 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2163 ctxt->sax->error(ctxt->userData,
2164 "htmlParseStartTag: invalid element name\n");
2165 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002166 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002167 }
2168
2169 /*
2170 * Check for auto-closure of HTML elements.
2171 */
2172 htmlAutoClose(ctxt, name);
2173
2174 /*
2175 * Now parse the attributes, it ends up with the ending
2176 *
2177 * (S Attribute)* S?
2178 */
2179 SKIP_BLANKS;
2180 while ((IS_CHAR(CUR)) &&
2181 (CUR != '>') &&
2182 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002183 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002184
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002185 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002186 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002187 if (attname != NULL) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002188 /*
2189 * Well formedness requires at most one declaration of an attribute
2190 */
2191 for (i = 0; i < nbatts;i += 2) {
2192 if (!xmlStrcmp(atts[i], attname)) {
2193 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002194 ctxt->sax->error(ctxt->userData,
2195 "Attribute %s redefined\n",
2196 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002197 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002198 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002199 if (attvalue != NULL)
2200 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002201 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002202 }
2203 }
2204
2205 /*
2206 * Add the pair to atts
2207 */
2208 if (atts == NULL) {
2209 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002210 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002211 if (atts == NULL) {
2212 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002213 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002214 if (name != NULL) xmlFree(name);
2215 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002216 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002217 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002218 maxatts *= 2;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002219 atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002220 if (atts == NULL) {
2221 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002222 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002223 if (name != NULL) xmlFree(name);
2224 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002225 }
2226 }
2227 atts[nbatts++] = attname;
2228 atts[nbatts++] = attvalue;
2229 atts[nbatts] = NULL;
2230 atts[nbatts + 1] = NULL;
2231 }
2232
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002233failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002234 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002235 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002236 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2237 ctxt->sax->error(ctxt->userData,
2238 "htmlParseStartTag: problem parsing attributes\n");
2239 ctxt->wellFormed = 0;
2240 break;
2241 }
2242 }
2243
2244 /*
2245 * SAX: Start of Element !
2246 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002247 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002248#ifdef DEBUG
2249 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2250#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002251 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2252 ctxt->sax->startElement(ctxt->userData, name, atts);
2253
2254 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002255 for (i = 0;i < nbatts;i++) {
2256 if (atts[i] != NULL)
2257 xmlFree((xmlChar *) atts[i]);
2258 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002259 xmlFree(atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002260 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002261 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002262}
2263
2264/**
2265 * htmlParseEndTag:
2266 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002267 *
2268 * parse an end of tag
2269 *
2270 * [42] ETag ::= '</' Name S? '>'
2271 *
2272 * With namespace
2273 *
2274 * [NS 9] ETag ::= '</' QName S? '>'
2275 */
2276
2277void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002278htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002279 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002280 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002281 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002282
2283 if ((CUR != '<') || (NXT(1) != '/')) {
2284 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2285 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2286 ctxt->wellFormed = 0;
2287 return;
2288 }
2289 SKIP(2);
2290
2291 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002292 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002293
2294 /*
2295 * We should definitely be at the ending "S? '>'" part
2296 */
2297 SKIP_BLANKS;
2298 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2299 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2300 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2301 ctxt->wellFormed = 0;
2302 } else
2303 NEXT;
2304
2305 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002306 * If the name read is not one of the element in the parsing stack
2307 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002308 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002309 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2310 if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002311 }
2312 if (i < 0) {
2313 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002314 ctxt->sax->error(ctxt->userData,
2315 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002316 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002317 ctxt->wellFormed = 0;
2318 return;
2319 }
2320
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002321
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002322 /*
2323 * Check for auto-closure of HTML elements.
2324 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002325
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002326 htmlAutoCloseOnClose(ctxt, name);
2327
2328 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002329 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002330 * With the exception that the autoclose may have popped stuff out
2331 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002332 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002333 if (xmlStrcmp(name, ctxt->name)) {
2334#ifdef DEBUG
2335 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2336#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002337 if ((ctxt->name != NULL) &&
2338 (xmlStrcmp(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002339 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2340 ctxt->sax->error(ctxt->userData,
2341 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002342 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002343 ctxt->wellFormed = 0;
2344 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002345 }
2346
2347 /*
2348 * SAX: End of Tag
2349 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002350 oldname = ctxt->name;
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002351 if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002352 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2353 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002354 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002355 if (oldname != NULL) {
2356#ifdef DEBUG
2357 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2358#endif
2359 xmlFree(oldname);
2360#ifdef DEBUG
2361 } else {
2362 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2363#endif
2364 }
2365 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002366
2367 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00002368 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002369
2370 return;
2371}
2372
2373
2374/**
2375 * htmlParseReference:
2376 * @ctxt: an HTML parser context
2377 *
2378 * parse and handle entity references in content,
2379 * this will end-up in a call to character() since this is either a
2380 * CharRef, or a predefined entity.
2381 */
2382void
2383htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002384 htmlEntityDescPtr ent;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002385 xmlChar out[2];
2386 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002387 int val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002388 if (CUR != '&') return;
2389
2390 if (NXT(1) == '#') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002391 val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +00002392 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002393 out[0] = val;
2394 out[1] = 0;
2395 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2396 ctxt->sax->characters(ctxt->userData, out, 1);
2397 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002398 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002399 if (name == NULL) {
2400 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
2401 return;
2402 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002403 if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2404 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002405 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002406 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00002407 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002408 }
2409 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002410 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002411 out[0] = ent->value;
2412 out[1] = 0;
2413 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2414 ctxt->sax->characters(ctxt->userData, out, 1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002415 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002416 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002417 }
2418}
2419
2420/**
2421 * htmlParseContent:
2422 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002423 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002424 *
2425 * Parse a content: comment, sub-element, reference or text.
2426 *
2427 */
2428
2429void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002430htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002431 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002432 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002433
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002434 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002435 depth = ctxt->nameNr;
2436 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002437 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002438
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002439 GROW;
2440 /*
2441 * Our tag or one of it's parent or children is ending.
2442 */
2443 if ((CUR == '<') && (NXT(1) == '/')) {
2444 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002445 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002446 return;
2447 }
2448
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002449 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002450 * Has this node been popped out during parsing of
2451 * the next element
2452 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002453 if ((xmlStrcmp(currentNode, ctxt->name)) &&
2454 (depth >= ctxt->nameNr)) {
2455 if (currentNode != NULL) xmlFree(currentNode);
2456 return;
2457 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002458
2459 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002460 * First case : a comment
2461 */
2462 if ((CUR == '<') && (NXT(1) == '!') &&
2463 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002464 htmlParseComment(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002465 }
2466
2467 /*
2468 * Second case : a sub-element.
2469 */
2470 else if (CUR == '<') {
2471 htmlParseElement(ctxt);
2472 }
2473
2474 /*
2475 * Third case : a reference. If if has not been resolved,
2476 * parsing returns it's Name, create the node
2477 */
2478 else if (CUR == '&') {
2479 htmlParseReference(ctxt);
2480 }
2481
2482 /*
2483 * Last case, text. Note that References are handled directly.
2484 */
2485 else {
2486 htmlParseCharData(ctxt, 0);
2487 }
2488
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002489 if (cons == ctxt->nbChars) {
Daniel Veillard35008381999-10-25 13:15:52 +00002490 if (ctxt->node != NULL) {
2491 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2492 ctxt->sax->error(ctxt->userData,
2493 "detected an error in element content\n");
2494 ctxt->wellFormed = 0;
2495 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002496 break;
2497 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002498
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002499 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002500 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002501 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002502}
2503
2504/**
2505 * htmlParseElement:
2506 * @ctxt: an HTML parser context
2507 *
2508 * parse an HTML element, this is highly recursive
2509 *
2510 * [39] element ::= EmptyElemTag | STag content ETag
2511 *
2512 * [41] Attribute ::= Name Eq AttValue
2513 */
2514
2515void
2516htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002517 const xmlChar *openTag = CUR_PTR;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002518 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00002519 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002520 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002521 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002522 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002523 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002524
2525 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002526 if (ctxt->record_info) {
2527 node_info.begin_pos = ctxt->input->consumed +
2528 (CUR_PTR - ctxt->input->base);
2529 node_info.begin_line = ctxt->input->line;
2530 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002531
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002532 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002533 htmlParseStartTag(ctxt);
2534 name = ctxt->name;
2535#ifdef DEBUG
2536 if (oldname == NULL)
2537 fprintf(stderr, "Start of element %s\n", name);
2538 else if (name == NULL)
2539 fprintf(stderr, "Start of element failed, was %s\n", oldname);
2540 else
2541 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2542#endif
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002543 if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002544 (name == NULL)) {
2545 if (CUR == '>')
2546 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002547 if (oldname != NULL)
2548 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002549 return;
2550 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002551 if (oldname != NULL)
2552 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002553
2554 /*
2555 * Lookup the info for that element.
2556 */
2557 info = htmlTagLookup(name);
2558 if (info == NULL) {
2559 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2560 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2561 name);
2562 ctxt->wellFormed = 0;
2563 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002564/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002565 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2566 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2567 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002568 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002569 }
2570
2571 /*
2572 * Check for an Empty Element labelled the XML/SGML way
2573 */
2574 if ((CUR == '/') && (NXT(1) == '>')) {
2575 SKIP(2);
2576 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2577 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002578 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002579#ifdef DEBUG
2580 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2581#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002582 if (oldname != NULL)
2583 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002584 return;
2585 }
2586
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002587 if (CUR == '>') {
2588 NEXT;
2589 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002590 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2591 ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2592 openTag);
2593 ctxt->wellFormed = 0;
2594
2595 /*
2596 * end of parsing of this node.
2597 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002598 if (!xmlStrcmp(name, ctxt->name)) {
2599 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002600 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002601#ifdef DEBUG
2602 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2603#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002604 if (oldname != NULL)
2605 xmlFree(oldname);
2606 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002607
2608 /*
2609 * Capture end position and add node
2610 */
2611 if ( currentNode != NULL && ctxt->record_info ) {
2612 node_info.end_pos = ctxt->input->consumed +
2613 (CUR_PTR - ctxt->input->base);
2614 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002615 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002616 xmlParserAddNodeInfo(ctxt, &node_info);
2617 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002618 return;
2619 }
2620
2621 /*
2622 * Check for an Empty Element from DTD definition
2623 */
2624 if ((info != NULL) && (info->empty)) {
2625 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2626 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002627 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002628#ifdef DEBUG
2629 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2630#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002631 if (oldname != NULL)
2632 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002633 return;
2634 }
2635
2636 /*
2637 * Parse the content of the element:
2638 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002639 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002640 depth = ctxt->nameNr;
2641 while (IS_CHAR(CUR)) {
2642 htmlParseContent(ctxt);
2643 if (ctxt->nameNr < depth) break;
2644 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002645
2646 if (!IS_CHAR(CUR)) {
2647 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2648 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002649 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002650 ctxt->wellFormed = 0;
2651
2652 /*
2653 * end of parsing of this node.
2654 */
2655 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002656 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002657#ifdef DEBUG
2658 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2659#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002660 if (oldname != NULL)
2661 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002662 if (currentNode != NULL)
2663 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002664 return;
2665 }
2666
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002667 /*
2668 * Capture end position and add node
2669 */
2670 if ( currentNode != NULL && ctxt->record_info ) {
2671 node_info.end_pos = ctxt->input->consumed +
2672 (CUR_PTR - ctxt->input->base);
2673 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002674 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002675 xmlParserAddNodeInfo(ctxt, &node_info);
2676 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002677 if (currentNode != NULL)
2678 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002679}
2680
2681/**
2682 * htmlParseDocument :
2683 * @ctxt: an HTML parser context
2684 *
2685 * parse an HTML document (and build a tree if using the standard SAX
2686 * interface).
2687 *
2688 * Returns 0, -1 in case of error. the parser context is augmented
2689 * as a result of the parsing.
2690 */
2691
2692int
2693htmlParseDocument(htmlParserCtxtPtr ctxt) {
2694 htmlDefaultSAXHandlerInit();
2695 ctxt->html = 1;
2696
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002697 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002698 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00002699 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002700 */
2701 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2702 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2703
2704 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002705 * Wipe out everything which is before the first '<'
2706 */
Daniel Veillard35008381999-10-25 13:15:52 +00002707 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002708 if (CUR == 0) {
2709 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2710 ctxt->sax->error(ctxt->userData, "Document is empty\n");
2711 ctxt->wellFormed = 0;
2712 }
2713
Daniel Veillard35008381999-10-25 13:15:52 +00002714 /*
2715 * Parse possible comments before any content
2716 */
2717 while ((CUR == '<') && (NXT(1) == '!') &&
2718 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002719 if (ctxt->myDoc == NULL)
2720 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2721 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00002722 SKIP_BLANKS;
2723 }
2724
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002725
2726 /*
2727 * Then possibly doc type declaration(s) and more Misc
2728 * (doctypedecl Misc*)?
2729 */
2730 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002731 (UPP(2) == 'D') && (UPP(3) == 'O') &&
2732 (UPP(4) == 'C') && (UPP(5) == 'T') &&
2733 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2734 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002735 htmlParseDocTypeDecl(ctxt);
2736 }
2737 SKIP_BLANKS;
2738
2739 /*
2740 * Create the document if not done already.
2741 */
2742 if (ctxt->myDoc == NULL) {
2743 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2744 }
2745
2746 /*
2747 * Time to start parsing the tree itself
2748 */
Daniel Veillard35008381999-10-25 13:15:52 +00002749 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002750
2751 /*
2752 * SAX: end of the document processing.
2753 */
2754 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2755 ctxt->sax->endDocument(ctxt->userData);
2756 if (! ctxt->wellFormed) return(-1);
2757 return(0);
2758}
2759
2760
Daniel Veillarddbfd6411999-12-28 16:35:14 +00002761/************************************************************************
2762 * *
2763 * Parser contexts handling *
2764 * *
2765 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002766
2767/**
2768 * xmlInitParserCtxt:
2769 * @ctxt: an HTML parser context
2770 *
2771 * Initialize a parser context
2772 */
2773
2774void
2775htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2776{
2777 htmlSAXHandler *sax;
2778
Daniel Veillard35008381999-10-25 13:15:52 +00002779 if (ctxt == NULL) return;
2780 memset(ctxt, 0, sizeof(htmlParserCtxt));
2781
Daniel Veillard6454aec1999-09-02 22:04:43 +00002782 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002783 if (sax == NULL) {
2784 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2785 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002786 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002787
2788 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002789 ctxt->inputTab = (htmlParserInputPtr *)
2790 xmlMalloc(5 * sizeof(htmlParserInputPtr));
2791 if (ctxt->inputTab == NULL) {
2792 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2793 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002794 ctxt->inputNr = 0;
2795 ctxt->inputMax = 5;
2796 ctxt->input = NULL;
2797 ctxt->version = NULL;
2798 ctxt->encoding = NULL;
2799 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00002800 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002801
2802 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002803 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002804 ctxt->nodeNr = 0;
2805 ctxt->nodeMax = 10;
2806 ctxt->node = NULL;
2807
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002808 /* Allocate the Name stack */
2809 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
2810 ctxt->nameNr = 0;
2811 ctxt->nameMax = 10;
2812 ctxt->name = NULL;
2813
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002814 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2815 else {
2816 ctxt->sax = sax;
2817 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2818 }
2819 ctxt->userData = ctxt;
2820 ctxt->myDoc = NULL;
2821 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002822 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002823 ctxt->html = 1;
2824 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00002825 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002826 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00002827 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002828 xmlInitNodeInfoSeq(&ctxt->node_seq);
2829}
2830
2831/**
2832 * htmlFreeParserCtxt:
2833 * @ctxt: an HTML parser context
2834 *
2835 * Free all the memory used by a parser context. However the parsed
2836 * document in ctxt->myDoc is not freed.
2837 */
2838
2839void
2840htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2841{
2842 htmlParserInputPtr input;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002843 xmlChar *oldname;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002844
2845 if (ctxt == NULL) return;
2846
2847 while ((input = inputPop(ctxt)) != NULL) {
2848 xmlFreeInputStream(input);
2849 }
2850
Daniel Veillard6454aec1999-09-02 22:04:43 +00002851 if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002852 while ((oldname = htmlnamePop(ctxt)) != NULL) {
2853 xmlFree(oldname);
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002854 }
2855 if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002856 if (ctxt->directory != NULL) xmlFree(ctxt->directory);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002857 if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
2858 if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002859 if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
Daniel Veillard6454aec1999-09-02 22:04:43 +00002860 xmlFree(ctxt->sax);
2861 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002862}
2863
2864/**
2865 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002866 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002867 * @encoding: a free form C string describing the HTML document encoding, or NULL
2868 *
2869 * Create a parser context for an HTML document.
2870 *
2871 * Returns the new parser context or NULL
2872 */
2873htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002874htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002875 htmlParserCtxtPtr ctxt;
2876 htmlParserInputPtr input;
2877 /* htmlCharEncoding enc; */
2878
Daniel Veillard6454aec1999-09-02 22:04:43 +00002879 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002880 if (ctxt == NULL) {
2881 perror("malloc");
2882 return(NULL);
2883 }
2884 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002885 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002886 if (input == NULL) {
2887 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002888 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002889 return(NULL);
2890 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002891 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002892
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002893 input->line = 1;
2894 input->col = 1;
2895 input->base = cur;
2896 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002897
2898 inputPush(ctxt, input);
2899 return(ctxt);
2900}
2901
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002902/************************************************************************
2903 * *
2904 * Progressive parsing interfaces *
2905 * *
2906 ************************************************************************/
2907
2908/**
2909 * htmlParseLookupSequence:
2910 * @ctxt: an HTML parser context
2911 * @first: the first char to lookup
2912 * @next: the next char to lookup or zero
2913 * @third: the next char to lookup or zero
2914 *
2915 * Try to find if a sequence (first, next, third) or just (first next) or
2916 * (first) is available in the input stream.
2917 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
2918 * to avoid rescanning sequences of bytes, it DOES change the state of the
2919 * parser, do not use liberally.
2920 * This is basically similar to xmlParseLookupSequence()
2921 *
2922 * Returns the index to the current parsing point if the full sequence
2923 * is available, -1 otherwise.
2924 */
2925int
2926htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
2927 xmlChar next, xmlChar third) {
2928 int base, len;
2929 htmlParserInputPtr in;
2930 const xmlChar *buf;
2931
2932 in = ctxt->input;
2933 if (in == NULL) return(-1);
2934 base = in->cur - in->base;
2935 if (base < 0) return(-1);
2936 if (ctxt->checkIndex > base)
2937 base = ctxt->checkIndex;
2938 if (in->buf == NULL) {
2939 buf = in->base;
2940 len = in->length;
2941 } else {
2942 buf = in->buf->buffer->content;
2943 len = in->buf->buffer->use;
2944 }
2945 /* take into account the sequence length */
2946 if (third) len -= 2;
2947 else if (next) len --;
2948 for (;base < len;base++) {
2949 if (buf[base] == first) {
2950 if (third != 0) {
2951 if ((buf[base + 1] != next) ||
2952 (buf[base + 2] != third)) continue;
2953 } else if (next != 0) {
2954 if (buf[base + 1] != next) continue;
2955 }
2956 ctxt->checkIndex = 0;
2957#ifdef DEBUG_PUSH
2958 if (next == 0)
2959 fprintf(stderr, "HPP: lookup '%c' found at %d\n",
2960 first, base);
2961 else if (third == 0)
2962 fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
2963 first, next, base);
2964 else
2965 fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
2966 first, next, third, base);
2967#endif
2968 return(base - (in->cur - in->base));
2969 }
2970 }
2971 ctxt->checkIndex = base;
2972#ifdef DEBUG_PUSH
2973 if (next == 0)
2974 fprintf(stderr, "HPP: lookup '%c' failed\n", first);
2975 else if (third == 0)
2976 fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
2977 else
2978 fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
2979#endif
2980 return(-1);
2981}
2982
2983/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00002984 * htmlParseTryOrFinish:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002985 * @ctxt: an HTML parser context
Daniel Veillard71b656e2000-01-05 14:46:17 +00002986 * @terminate: last chunk indicator
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002987 *
2988 * Try to progress on parsing
2989 *
2990 * Returns zero if no parsing was possible
2991 */
2992int
Daniel Veillard71b656e2000-01-05 14:46:17 +00002993htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002994 int ret = 0;
2995 htmlParserInputPtr in;
2996 int avail;
2997 xmlChar cur, next;
2998
2999#ifdef DEBUG_PUSH
3000 switch (ctxt->instate) {
3001 case XML_PARSER_EOF:
3002 fprintf(stderr, "HPP: try EOF\n"); break;
3003 case XML_PARSER_START:
3004 fprintf(stderr, "HPP: try START\n"); break;
3005 case XML_PARSER_MISC:
3006 fprintf(stderr, "HPP: try MISC\n");break;
3007 case XML_PARSER_COMMENT:
3008 fprintf(stderr, "HPP: try COMMENT\n");break;
3009 case XML_PARSER_PROLOG:
3010 fprintf(stderr, "HPP: try PROLOG\n");break;
3011 case XML_PARSER_START_TAG:
3012 fprintf(stderr, "HPP: try START_TAG\n");break;
3013 case XML_PARSER_CONTENT:
3014 fprintf(stderr, "HPP: try CONTENT\n");break;
3015 case XML_PARSER_CDATA_SECTION:
3016 fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3017 case XML_PARSER_END_TAG:
3018 fprintf(stderr, "HPP: try END_TAG\n");break;
3019 case XML_PARSER_ENTITY_DECL:
3020 fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3021 case XML_PARSER_ENTITY_VALUE:
3022 fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3023 case XML_PARSER_ATTRIBUTE_VALUE:
3024 fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3025 case XML_PARSER_DTD:
3026 fprintf(stderr, "HPP: try DTD\n");break;
3027 case XML_PARSER_EPILOG:
3028 fprintf(stderr, "HPP: try EPILOG\n");break;
3029 case XML_PARSER_PI:
3030 fprintf(stderr, "HPP: try PI\n");break;
3031 }
3032#endif
3033
3034 while (1) {
3035
3036 in = ctxt->input;
3037 if (in == NULL) break;
3038 if (in->buf == NULL)
3039 avail = in->length - (in->cur - in->base);
3040 else
3041 avail = in->buf->buffer->use - (in->cur - in->base);
3042 if (avail < 1)
3043 goto done;
3044 switch (ctxt->instate) {
3045 case XML_PARSER_EOF:
3046 /*
3047 * Document parsing is done !
3048 */
3049 goto done;
3050 case XML_PARSER_START:
3051 /*
3052 * Very first chars read from the document flow.
3053 */
3054 cur = in->cur[0];
3055 if (IS_BLANK(cur)) {
3056 SKIP_BLANKS;
3057 if (in->buf == NULL)
3058 avail = in->length - (in->cur - in->base);
3059 else
3060 avail = in->buf->buffer->use - (in->cur - in->base);
3061 }
3062 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3063 ctxt->sax->setDocumentLocator(ctxt->userData,
3064 &xmlDefaultSAXLocator);
3065 cur = in->cur[0];
3066 next = in->cur[1];
3067 if ((cur == '<') && (next == '!') &&
3068 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3069 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3070 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3071 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003072 if ((!terminate) &&
3073 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003074 goto done;
3075#ifdef DEBUG_PUSH
3076 fprintf(stderr, "HPP: Parsing internal subset\n");
3077#endif
3078 htmlParseDocTypeDecl(ctxt);
3079 ctxt->instate = XML_PARSER_PROLOG;
3080#ifdef DEBUG_PUSH
3081 fprintf(stderr, "HPP: entering PROLOG\n");
3082#endif
3083 } else {
3084 ctxt->myDoc = htmlNewDoc(NULL, NULL);
3085 ctxt->instate = XML_PARSER_MISC;
3086 }
3087#ifdef DEBUG_PUSH
3088 fprintf(stderr, "HPP: entering MISC\n");
3089#endif
3090 break;
3091 case XML_PARSER_MISC:
3092 SKIP_BLANKS;
3093 if (in->buf == NULL)
3094 avail = in->length - (in->cur - in->base);
3095 else
3096 avail = in->buf->buffer->use - (in->cur - in->base);
3097 if (avail < 2)
3098 goto done;
3099 cur = in->cur[0];
3100 next = in->cur[1];
3101 if ((cur == '<') && (next == '!') &&
3102 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003103 if ((!terminate) &&
3104 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003105 goto done;
3106#ifdef DEBUG_PUSH
3107 fprintf(stderr, "HPP: Parsing Comment\n");
3108#endif
3109 htmlParseComment(ctxt);
3110 ctxt->instate = XML_PARSER_MISC;
3111 } else if ((cur == '<') && (next == '!') &&
3112 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3113 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3114 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3115 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003116 if ((!terminate) &&
3117 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003118 goto done;
3119#ifdef DEBUG_PUSH
3120 fprintf(stderr, "HPP: Parsing internal subset\n");
3121#endif
3122 htmlParseDocTypeDecl(ctxt);
3123 ctxt->instate = XML_PARSER_PROLOG;
3124#ifdef DEBUG_PUSH
3125 fprintf(stderr, "HPP: entering PROLOG\n");
3126#endif
3127 } else if ((cur == '<') && (next == '!') &&
3128 (avail < 9)) {
3129 goto done;
3130 } else {
3131 ctxt->instate = XML_PARSER_START_TAG;
3132#ifdef DEBUG_PUSH
3133 fprintf(stderr, "HPP: entering START_TAG\n");
3134#endif
3135 }
3136 break;
3137 case XML_PARSER_PROLOG:
3138 SKIP_BLANKS;
3139 if (in->buf == NULL)
3140 avail = in->length - (in->cur - in->base);
3141 else
3142 avail = in->buf->buffer->use - (in->cur - in->base);
3143 if (avail < 2)
3144 goto done;
3145 cur = in->cur[0];
3146 next = in->cur[1];
3147 if ((cur == '<') && (next == '!') &&
3148 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003149 if ((!terminate) &&
3150 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003151 goto done;
3152#ifdef DEBUG_PUSH
3153 fprintf(stderr, "HPP: Parsing Comment\n");
3154#endif
3155 htmlParseComment(ctxt);
3156 ctxt->instate = XML_PARSER_PROLOG;
3157 } else if ((cur == '<') && (next == '!') &&
3158 (avail < 4)) {
3159 goto done;
3160 } else {
3161 ctxt->instate = XML_PARSER_START_TAG;
3162#ifdef DEBUG_PUSH
3163 fprintf(stderr, "HPP: entering START_TAG\n");
3164#endif
3165 }
3166 break;
3167 case XML_PARSER_EPILOG:
3168 SKIP_BLANKS;
3169 if (in->buf == NULL)
3170 avail = in->length - (in->cur - in->base);
3171 else
3172 avail = in->buf->buffer->use - (in->cur - in->base);
3173 if (avail < 2)
3174 goto done;
3175 cur = in->cur[0];
3176 next = in->cur[1];
3177 if ((cur == '<') && (next == '!') &&
3178 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003179 if ((!terminate) &&
3180 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003181 goto done;
3182#ifdef DEBUG_PUSH
3183 fprintf(stderr, "HPP: Parsing Comment\n");
3184#endif
3185 htmlParseComment(ctxt);
3186 ctxt->instate = XML_PARSER_EPILOG;
3187 } else if ((cur == '<') && (next == '!') &&
3188 (avail < 4)) {
3189 goto done;
3190 } else {
3191 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3192 ctxt->sax->error(ctxt->userData,
3193 "Extra content at the end of the document\n");
3194 ctxt->wellFormed = 0;
3195 ctxt->errNo = XML_ERR_DOCUMENT_END;
3196 ctxt->instate = XML_PARSER_EOF;
3197#ifdef DEBUG_PUSH
3198 fprintf(stderr, "HPP: entering EOF\n");
3199#endif
3200 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3201 ctxt->sax->endDocument(ctxt->userData);
3202 goto done;
3203 }
3204 break;
3205 case XML_PARSER_START_TAG: {
3206 xmlChar *name, *oldname;
3207 int depth = ctxt->nameNr;
3208 htmlElemDescPtr info;
3209
3210 if (avail < 2)
3211 goto done;
3212 cur = in->cur[0];
3213 if (cur != '<') {
3214 ctxt->instate = XML_PARSER_CONTENT;
3215#ifdef DEBUG_PUSH
3216 fprintf(stderr, "HPP: entering CONTENT\n");
3217#endif
3218 break;
3219 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00003220 if ((!terminate) &&
3221 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003222 goto done;
3223
3224 oldname = xmlStrdup(ctxt->name);
3225 htmlParseStartTag(ctxt);
3226 name = ctxt->name;
3227#ifdef DEBUG
3228 if (oldname == NULL)
3229 fprintf(stderr, "Start of element %s\n", name);
3230 else if (name == NULL)
3231 fprintf(stderr, "Start of element failed, was %s\n",
3232 oldname);
3233 else
3234 fprintf(stderr, "Start of element %s, was %s\n",
3235 name, oldname);
3236#endif
3237 if (((depth == ctxt->nameNr) &&
3238 (!xmlStrcmp(oldname, ctxt->name))) ||
3239 (name == NULL)) {
3240 if (CUR == '>')
3241 NEXT;
3242 if (oldname != NULL)
3243 xmlFree(oldname);
3244 break;
3245 }
3246 if (oldname != NULL)
3247 xmlFree(oldname);
3248
3249 /*
3250 * Lookup the info for that element.
3251 */
3252 info = htmlTagLookup(name);
3253 if (info == NULL) {
3254 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3255 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3256 name);
3257 ctxt->wellFormed = 0;
3258 } else if (info->depr) {
3259 /***************************
3260 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3261 ctxt->sax->warning(ctxt->userData,
3262 "Tag %s is deprecated\n",
3263 name);
3264 ***************************/
3265 }
3266
3267 /*
3268 * Check for an Empty Element labelled the XML/SGML way
3269 */
3270 if ((CUR == '/') && (NXT(1) == '>')) {
3271 SKIP(2);
3272 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3273 ctxt->sax->endElement(ctxt->userData, name);
3274 oldname = htmlnamePop(ctxt);
3275#ifdef DEBUG
3276 fprintf(stderr,"End of tag the XML way: popping out %s\n",
3277 oldname);
3278#endif
3279 if (oldname != NULL)
3280 xmlFree(oldname);
3281 ctxt->instate = XML_PARSER_CONTENT;
3282#ifdef DEBUG_PUSH
3283 fprintf(stderr, "HPP: entering CONTENT\n");
3284#endif
3285 break;
3286 }
3287
3288 if (CUR == '>') {
3289 NEXT;
3290 } else {
3291 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3292 ctxt->sax->error(ctxt->userData,
3293 "Couldn't find end of Start Tag %s\n",
3294 name);
3295 ctxt->wellFormed = 0;
3296
3297 /*
3298 * end of parsing of this node.
3299 */
3300 if (!xmlStrcmp(name, ctxt->name)) {
3301 nodePop(ctxt);
3302 oldname = htmlnamePop(ctxt);
3303#ifdef DEBUG
3304 fprintf(stderr,
3305 "End of start tag problem: popping out %s\n", oldname);
3306#endif
3307 if (oldname != NULL)
3308 xmlFree(oldname);
3309 }
3310
3311 ctxt->instate = XML_PARSER_CONTENT;
3312#ifdef DEBUG_PUSH
3313 fprintf(stderr, "HPP: entering CONTENT\n");
3314#endif
3315 break;
3316 }
3317
3318 /*
3319 * Check for an Empty Element from DTD definition
3320 */
3321 if ((info != NULL) && (info->empty)) {
3322 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3323 ctxt->sax->endElement(ctxt->userData, name);
3324 oldname = htmlnamePop(ctxt);
3325#ifdef DEBUG
3326 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3327#endif
3328 if (oldname != NULL)
3329 xmlFree(oldname);
3330 }
3331 ctxt->instate = XML_PARSER_CONTENT;
3332#ifdef DEBUG_PUSH
3333 fprintf(stderr, "HPP: entering CONTENT\n");
3334#endif
3335 break;
3336 }
3337 case XML_PARSER_CONTENT:
3338 /*
3339 * Handle preparsed entities and charRef
3340 */
3341 if (ctxt->token != 0) {
3342 xmlChar cur[2] = { 0 , 0 } ;
3343
3344 cur[0] = (xmlChar) ctxt->token;
3345 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3346 ctxt->sax->characters(ctxt->userData, cur, 1);
3347 ctxt->token = 0;
3348 ctxt->checkIndex = 0;
3349 }
3350 if (avail < 2)
3351 goto done;
3352 cur = in->cur[0];
3353 next = in->cur[1];
3354 if ((cur == '<') && (next == '!') &&
3355 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003356 if ((!terminate) &&
3357 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003358 goto done;
3359#ifdef DEBUG_PUSH
3360 fprintf(stderr, "HPP: Parsing Comment\n");
3361#endif
3362 htmlParseComment(ctxt);
3363 ctxt->instate = XML_PARSER_CONTENT;
3364 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
3365 goto done;
3366 } else if ((cur == '<') && (next == '/')) {
3367 ctxt->instate = XML_PARSER_END_TAG;
3368 ctxt->checkIndex = 0;
3369#ifdef DEBUG_PUSH
3370 fprintf(stderr, "HPP: entering END_TAG\n");
3371#endif
3372 break;
3373 } else if (cur == '<') {
3374 ctxt->instate = XML_PARSER_START_TAG;
3375 ctxt->checkIndex = 0;
3376#ifdef DEBUG_PUSH
3377 fprintf(stderr, "HPP: entering START_TAG\n");
3378#endif
3379 break;
3380 } else if (cur == '&') {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003381 if ((!terminate) &&
3382 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003383 goto done;
3384#ifdef DEBUG_PUSH
3385 fprintf(stderr, "HPP: Parsing Reference\n");
3386#endif
3387 /* TODO: check generation of subtrees if noent !!! */
3388 htmlParseReference(ctxt);
3389 } else {
3390 /* TODO Avoid the extra copy, handle directly !!!!!! */
3391 /*
3392 * Goal of the following test is :
3393 * - minimize calls to the SAX 'character' callback
3394 * when they are mergeable
3395 */
3396 if ((ctxt->inputNr == 1) &&
3397 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003398 if ((!terminate) &&
3399 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003400 goto done;
3401 }
3402 ctxt->checkIndex = 0;
3403#ifdef DEBUG_PUSH
3404 fprintf(stderr, "HPP: Parsing char data\n");
3405#endif
3406 htmlParseCharData(ctxt, 0);
3407 }
3408 break;
3409 case XML_PARSER_END_TAG:
3410 if (avail < 2)
3411 goto done;
Daniel Veillard71b656e2000-01-05 14:46:17 +00003412 if ((!terminate) &&
3413 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003414 goto done;
3415 htmlParseEndTag(ctxt);
3416 if (ctxt->nameNr == 0) {
3417 ctxt->instate = XML_PARSER_EPILOG;
3418 } else {
3419 ctxt->instate = XML_PARSER_CONTENT;
3420 }
3421 ctxt->checkIndex = 0;
3422#ifdef DEBUG_PUSH
3423 fprintf(stderr, "HPP: entering CONTENT\n");
3424#endif
3425 break;
3426 case XML_PARSER_CDATA_SECTION:
3427 fprintf(stderr, "HPP: internal error, state == CDATA\n");
3428 ctxt->instate = XML_PARSER_CONTENT;
3429 ctxt->checkIndex = 0;
3430#ifdef DEBUG_PUSH
3431 fprintf(stderr, "HPP: entering CONTENT\n");
3432#endif
3433 break;
3434 case XML_PARSER_DTD:
3435 fprintf(stderr, "HPP: internal error, state == DTD\n");
3436 ctxt->instate = XML_PARSER_CONTENT;
3437 ctxt->checkIndex = 0;
3438#ifdef DEBUG_PUSH
3439 fprintf(stderr, "HPP: entering CONTENT\n");
3440#endif
3441 break;
3442 case XML_PARSER_COMMENT:
3443 fprintf(stderr, "HPP: internal error, state == COMMENT\n");
3444 ctxt->instate = XML_PARSER_CONTENT;
3445 ctxt->checkIndex = 0;
3446#ifdef DEBUG_PUSH
3447 fprintf(stderr, "HPP: entering CONTENT\n");
3448#endif
3449 break;
3450 case XML_PARSER_PI:
3451 fprintf(stderr, "HPP: internal error, state == PI\n");
3452 ctxt->instate = XML_PARSER_CONTENT;
3453 ctxt->checkIndex = 0;
3454#ifdef DEBUG_PUSH
3455 fprintf(stderr, "HPP: entering CONTENT\n");
3456#endif
3457 break;
3458 case XML_PARSER_ENTITY_DECL:
3459 fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
3460 ctxt->instate = XML_PARSER_CONTENT;
3461 ctxt->checkIndex = 0;
3462#ifdef DEBUG_PUSH
3463 fprintf(stderr, "HPP: entering CONTENT\n");
3464#endif
3465 break;
3466 case XML_PARSER_ENTITY_VALUE:
3467 fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
3468 ctxt->instate = XML_PARSER_CONTENT;
3469 ctxt->checkIndex = 0;
3470#ifdef DEBUG_PUSH
3471 fprintf(stderr, "HPP: entering DTD\n");
3472#endif
3473 break;
3474 case XML_PARSER_ATTRIBUTE_VALUE:
3475 fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
3476 ctxt->instate = XML_PARSER_START_TAG;
3477 ctxt->checkIndex = 0;
3478#ifdef DEBUG_PUSH
3479 fprintf(stderr, "HPP: entering START_TAG\n");
3480#endif
3481 break;
3482 }
3483 }
3484done:
3485#ifdef DEBUG_PUSH
3486 fprintf(stderr, "HPP: done %d\n", ret);
3487#endif
3488 return(ret);
3489}
3490
3491/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003492 * htmlParseTry:
3493 * @ctxt: an HTML parser context
3494 *
3495 * Try to progress on parsing
3496 *
3497 * Returns zero if no parsing was possible
3498 */
3499int
3500htmlParseTry(htmlParserCtxtPtr ctxt) {
3501 return(htmlParseTryOrFinish(ctxt, 0));
3502}
3503
3504/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003505 * htmlParseChunk:
3506 * @ctxt: an XML parser context
3507 * @chunk: an char array
3508 * @size: the size in byte of the chunk
3509 * @terminate: last chunk indicator
3510 *
3511 * Parse a Chunk of memory
3512 *
3513 * Returns zero if no error, the xmlParserErrors otherwise.
3514 */
3515int
3516htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
3517 int terminate) {
3518 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3519 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
3520 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
3521 int cur = ctxt->input->cur - ctxt->input->base;
3522
3523 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3524 ctxt->input->base = ctxt->input->buf->buffer->content + base;
3525 ctxt->input->cur = ctxt->input->base + cur;
3526#ifdef DEBUG_PUSH
3527 fprintf(stderr, "HPP: pushed %d\n", size);
3528#endif
3529
Daniel Veillard71b656e2000-01-05 14:46:17 +00003530 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003531 } else if (ctxt->instate != XML_PARSER_EOF)
Daniel Veillard71b656e2000-01-05 14:46:17 +00003532 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003533 if (terminate) {
3534 if ((ctxt->instate != XML_PARSER_EOF) &&
3535 (ctxt->instate != XML_PARSER_EPILOG) &&
3536 (ctxt->instate != XML_PARSER_MISC)) {
3537 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3538 ctxt->sax->error(ctxt->userData,
3539 "Extra content at the end of the document\n");
3540 ctxt->wellFormed = 0;
3541 ctxt->errNo = XML_ERR_DOCUMENT_END;
3542 }
3543 if (ctxt->instate != XML_PARSER_EOF) {
3544 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3545 ctxt->sax->endDocument(ctxt->userData);
3546 }
3547 ctxt->instate = XML_PARSER_EOF;
3548 }
3549 return((xmlParserErrors) ctxt->errNo);
3550}
3551
3552/************************************************************************
3553 * *
3554 * User entry points *
3555 * *
3556 ************************************************************************/
3557
3558/**
3559 * htmlCreatePushParserCtxt :
3560 * @sax: a SAX handler
3561 * @user_data: The user data returned on SAX callbacks
3562 * @chunk: a pointer to an array of chars
3563 * @size: number of chars in the array
3564 * @filename: an optional file name or URI
3565 * @enc: an optional encoding
3566 *
3567 * Create a parser context for using the HTML parser in push mode
3568 * To allow content encoding detection, @size should be >= 4
3569 * The value of @filename is used for fetching external entities
3570 * and error/warning reports.
3571 *
3572 * Returns the new parser context or NULL
3573 */
3574htmlParserCtxtPtr
3575htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
3576 const char *chunk, int size, const char *filename,
3577 xmlCharEncoding enc) {
3578 htmlParserCtxtPtr ctxt;
3579 htmlParserInputPtr inputStream;
3580 xmlParserInputBufferPtr buf;
3581
3582 buf = xmlAllocParserInputBuffer(enc);
3583 if (buf == NULL) return(NULL);
3584
3585 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3586 if (ctxt == NULL) {
3587 xmlFree(buf);
3588 return(NULL);
3589 }
3590 memset(ctxt, 0, sizeof(htmlParserCtxt));
3591 htmlInitParserCtxt(ctxt);
3592 if (sax != NULL) {
3593 if (ctxt->sax != &htmlDefaultSAXHandler)
3594 xmlFree(ctxt->sax);
3595 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
3596 if (ctxt->sax == NULL) {
3597 xmlFree(buf);
3598 xmlFree(ctxt);
3599 return(NULL);
3600 }
3601 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
3602 if (user_data != NULL)
3603 ctxt->userData = user_data;
3604 }
3605 if (filename == NULL) {
3606 ctxt->directory = NULL;
3607 } else {
3608 ctxt->directory = xmlParserGetDirectory(filename);
3609 }
3610
3611 inputStream = htmlNewInputStream(ctxt);
3612 if (inputStream == NULL) {
3613 xmlFreeParserCtxt(ctxt);
3614 return(NULL);
3615 }
3616
3617 if (filename == NULL)
3618 inputStream->filename = NULL;
3619 else
3620 inputStream->filename = xmlMemStrdup(filename);
3621 inputStream->buf = buf;
3622 inputStream->base = inputStream->buf->buffer->content;
3623 inputStream->cur = inputStream->buf->buffer->content;
3624
3625 inputPush(ctxt, inputStream);
3626
3627 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3628 (ctxt->input->buf != NULL)) {
3629 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3630#ifdef DEBUG_PUSH
3631 fprintf(stderr, "HPP: pushed %d\n", size);
3632#endif
3633 }
3634
3635 return(ctxt);
3636}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003637
3638/**
3639 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003640 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003641 * @encoding: a free form C string describing the HTML document encoding, or NULL
3642 * @sax: the SAX handler block
3643 * @userData: if using SAX, this pointer will be provided on callbacks.
3644 *
3645 * parse an HTML in-memory document and build a tree.
3646 * It use the given SAX function block to handle the parsing callback.
3647 * If sax is NULL, fallback to the default DOM tree building routines.
3648 *
3649 * Returns the resulting document tree
3650 */
3651
3652htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003653htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003654 htmlDocPtr ret;
3655 htmlParserCtxtPtr ctxt;
3656
3657 if (cur == NULL) return(NULL);
3658
3659
3660 ctxt = htmlCreateDocParserCtxt(cur, encoding);
3661 if (ctxt == NULL) return(NULL);
3662 if (sax != NULL) {
3663 ctxt->sax = sax;
3664 ctxt->userData = userData;
3665 }
3666
3667 htmlParseDocument(ctxt);
3668 ret = ctxt->myDoc;
3669 if (sax != NULL) {
3670 ctxt->sax = NULL;
3671 ctxt->userData = NULL;
3672 }
3673 htmlFreeParserCtxt(ctxt);
3674
3675 return(ret);
3676}
3677
3678/**
3679 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003680 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003681 * @encoding: a free form C string describing the HTML document encoding, or NULL
3682 *
3683 * parse an HTML in-memory document and build a tree.
3684 *
3685 * Returns the resulting document tree
3686 */
3687
3688htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003689htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003690 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
3691}
3692
3693
3694/**
3695 * htmlCreateFileParserCtxt :
3696 * @filename: the filename
3697 * @encoding: a free form C string describing the HTML document encoding, or NULL
3698 *
3699 * Create a parser context for a file content.
3700 * Automatic support for ZLIB/Compress compressed document is provided
3701 * by default if found at compile-time.
3702 *
3703 * Returns the new parser context or NULL
3704 */
3705htmlParserCtxtPtr
3706htmlCreateFileParserCtxt(const char *filename, const char *encoding)
3707{
3708 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003709 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003710 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003711 /* htmlCharEncoding enc; */
3712
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003713 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
3714 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003715
Daniel Veillard6454aec1999-09-02 22:04:43 +00003716 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003717 if (ctxt == NULL) {
3718 perror("malloc");
3719 return(NULL);
3720 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003721 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003722 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003723 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003724 if (inputStream == NULL) {
3725 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003726 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003727 return(NULL);
3728 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003729 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003730
Daniel Veillard6454aec1999-09-02 22:04:43 +00003731 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003732 inputStream->line = 1;
3733 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003734 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00003735 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003736
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003737 inputStream->base = inputStream->buf->buffer->content;
3738 inputStream->cur = inputStream->buf->buffer->content;
3739 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003740
3741 inputPush(ctxt, inputStream);
3742 return(ctxt);
3743}
3744
3745/**
3746 * htmlSAXParseFile :
3747 * @filename: the filename
3748 * @encoding: a free form C string describing the HTML document encoding, or NULL
3749 * @sax: the SAX handler block
3750 * @userData: if using SAX, this pointer will be provided on callbacks.
3751 *
3752 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3753 * compressed document is provided by default if found at compile-time.
3754 * It use the given SAX function block to handle the parsing callback.
3755 * If sax is NULL, fallback to the default DOM tree building routines.
3756 *
3757 * Returns the resulting document tree
3758 */
3759
3760htmlDocPtr
3761htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
3762 void *userData) {
3763 htmlDocPtr ret;
3764 htmlParserCtxtPtr ctxt;
3765
3766 ctxt = htmlCreateFileParserCtxt(filename, encoding);
3767 if (ctxt == NULL) return(NULL);
3768 if (sax != NULL) {
3769 ctxt->sax = sax;
3770 ctxt->userData = userData;
3771 }
3772
3773 htmlParseDocument(ctxt);
3774
3775 ret = ctxt->myDoc;
3776 if (sax != NULL) {
3777 ctxt->sax = NULL;
3778 ctxt->userData = NULL;
3779 }
3780 htmlFreeParserCtxt(ctxt);
3781
3782 return(ret);
3783}
3784
3785/**
3786 * htmlParseFile :
3787 * @filename: the filename
3788 * @encoding: a free form C string describing the HTML document encoding, or NULL
3789 *
3790 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3791 * compressed document is provided by default if found at compile-time.
3792 *
3793 * Returns the resulting document tree
3794 */
3795
3796htmlDocPtr
3797htmlParseFile(const char *filename, const char *encoding) {
3798 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
3799}