blob: aafaec6d7c9d6cfb7d3c0d83c7e32895f2590536 [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillardbe70ff71999-07-05 16:50:46 +000015#include <stdio.h>
Daniel Veillardbe70ff71999-07-05 16:50:46 +000016#include <string.h> /* for memset() only */
Daniel Veillard7f7d1111999-09-22 09:46:25 +000017#ifdef HAVE_CTYPE_H
18#include <ctype.h>
19#endif
20#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000021#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000022#endif
23#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000024#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000025#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000026#ifdef HAVE_FCNTL_H
27#include <fcntl.h>
28#endif
29#ifdef HAVE_UNISTD_H
30#include <unistd.h>
31#endif
32#ifdef HAVE_ZLIB_H
33#include <zlib.h>
34#endif
35
Daniel Veillard6454aec1999-09-02 22:04:43 +000036#include "xmlmemory.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000037#include "tree.h"
38#include "HTMLparser.h"
39#include "entities.h"
40#include "encoding.h"
41#include "valid.h"
42#include "parserInternals.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000043#include "xmlIO.h"
Daniel Veillard5e5c6231999-12-29 12:49:06 +000044#include "xml-error.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000045
46#define HTML_MAX_NAMELEN 1000
47#define INPUT_CHUNK 50
Daniel Veillard5e5c6231999-12-29 12:49:06 +000048#define HTML_PARSER_BIG_BUFFER_SIZE 1024
49#define HTML_PARSER_BUFFER_SIZE 100
Daniel Veillardbe70ff71999-07-05 16:50:46 +000050
Daniel Veillard82150d81999-07-07 07:32:15 +000051/* #define DEBUG */
Daniel Veillard5e5c6231999-12-29 12:49:06 +000052/* #define DEBUG_PUSH */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000053
54/************************************************************************
55 * *
56 * Parser stacks related functions and macros *
57 * *
58 ************************************************************************/
59
60/*
61 * Generic function for accessing stacks in the Parser Context
62 */
63
Daniel Veillarddbfd6411999-12-28 16:35:14 +000064#define PUSH_AND_POP(scope, type, name) \
65scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000066 if (ctxt->name##Nr >= ctxt->name##Max) { \
67 ctxt->name##Max *= 2; \
Daniel Veillard6454aec1999-09-02 22:04:43 +000068 ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000069 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
70 if (ctxt->name##Tab == NULL) { \
71 fprintf(stderr, "realloc failed !\n"); \
Daniel Veillard0142b842000-01-14 14:45:24 +000072 return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000073 } \
74 } \
75 ctxt->name##Tab[ctxt->name##Nr] = value; \
76 ctxt->name = value; \
77 return(ctxt->name##Nr++); \
78} \
Daniel Veillarddbfd6411999-12-28 16:35:14 +000079scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000080 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000081 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000082 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000083 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000084 if (ctxt->name##Nr > 0) \
85 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
86 else \
87 ctxt->name = NULL; \
88 ret = ctxt->name##Tab[ctxt->name##Nr]; \
89 ctxt->name##Tab[ctxt->name##Nr] = 0; \
90 return(ret); \
91} \
92
Daniel Veillarddbfd6411999-12-28 16:35:14 +000093PUSH_AND_POP(extern, xmlNodePtr, node)
94PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000095
96/*
97 * Macros for accessing the content. Those should be used only by the parser,
98 * and not exported.
99 *
100 * Dirty macros, i.e. one need to make assumption on the context to use them
101 *
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000102 * CUR_PTR return the current pointer to the xmlChar to be parsed.
103 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000104 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
105 * in UNICODE mode. This should be used internally by the parser
106 * only to compare to ASCII values otherwise it would break when
107 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare against ASCII-based substrings.
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000110 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000111 * it should be used only to compare on ASCII based substring.
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000112 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000113 * strings within the parser.
114 *
115 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
116 *
117 * CURRENT Returns the current char value, with the full decoding of
118 * UTF-8 if we are using this mode. It returns an int.
119 * NEXT Skip to the next character, this does the proper decoding
120 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000121 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
122 */
123
/*
 * All of these macros expect a variable named `ctxt` (the parser context)
 * to be in scope at the point of use.
 */

/* Value at the current input position, converted to int. */
#define CUR ((int) (*ctxt->input->cur))

/* Value at the current input position, passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the current input position by (val) and add (val) to
 * ctxt->nbChars. Parenthesized so that the comma expression behaves as a
 * single expression wherever the macro is used.
 */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* Value located (val) positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself (still usable as an lvalue). */
#define CUR_PTR (ctxt->input->cur)

/* Calls xmlParserInputShrink() on the current input. */
#define SHRINK xmlParserInputShrink(ctxt->input)

/* Calls xmlParserInputGrow() on the current input with INPUT_CHUNK. */
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Value at the current input position, converted to int (same as CUR). */
#define CURRENT ((int) (*ctxt->input->cur))

/*
 * NOTE(review): NEXT and SKIP_BLANKS carry their own trailing ';'. It is
 * kept here because call sites may rely on it; as a consequence neither
 * macro can be used inside an expression or as the body of an unbraced
 * if/else.
 */
#define NEXT htmlNextChar(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt);
145
146/**
147 * htmlNextChar:
148 * @ctxt: the HTML parser context
149 *
150 * Skip to the next char input char.
151 */
152
153void
154htmlNextChar(htmlParserCtxtPtr ctxt) {
155 if ((*ctxt->input->cur == 0) &&
156 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
157 xmlPopInput(ctxt);
158 } else {
159 if (*(ctxt->input->cur) == '\n') {
160 ctxt->input->line++; ctxt->input->col = 1;
161 } else ctxt->input->col++;
162 ctxt->input->cur++;
163 ctxt->nbChars++;
164 if (*ctxt->input->cur == 0)
165 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
166 }
167}
168
169/**
170 * htmlSkipBlankChars:
171 * @ctxt: the HTML parser context
172 *
173 * skip all blanks character found at that point in the input streams.
174 *
175 * Returns the number of space chars skipped
176 */
177
178int
179htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
180 int res = 0;
181
182 while (IS_BLANK(*(ctxt->input->cur))) {
183 if ((*ctxt->input->cur == 0) &&
184 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
185 xmlPopInput(ctxt);
186 } else {
187 if (*(ctxt->input->cur) == '\n') {
188 ctxt->input->line++; ctxt->input->col = 1;
189 } else ctxt->input->col++;
190 ctxt->input->cur++;
191 ctxt->nbChars++;
192 if (*ctxt->input->cur == 0)
193 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
194 }
195 res++;
196 }
197 return(res);
198}
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000199
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000200
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000201
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000202/************************************************************************
203 * *
204 * The list of HTML elements and their properties *
205 * *
206 ************************************************************************/
207
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000208/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000209 * Start Tag: 1 means the start tag can be ommited
210 * End Tag: 1 means the end tag can be ommited
211 * 2 means it's forbidden (empty elements)
212 * Depr: this element is deprecated
213 * DTD: 1 means that this element is valid only in the Loose DTD
214 * 2 means that this element is valid only in the Frameset DTD
215 *
216 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000217 */
218htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000219{ "a", 0, 0, 0, 0, 0, "anchor " },
220{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
221{ "acronym", 0, 0, 0, 0, 0, "" },
222{ "address", 0, 0, 0, 0, 0, "information on author " },
223{ "applet", 0, 0, 0, 1, 1, "java applet " },
224{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
225{ "b", 0, 0, 0, 0, 0, "bold text style" },
226{ "base", 0, 2, 1, 0, 0, "document base uri " },
227{ "basefont", 0, 2, 1, 1, 1, "base font size " },
228{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
229{ "big", 0, 0, 0, 0, 0, "large text style" },
230{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
231{ "body", 1, 1, 0, 0, 0, "document body " },
232{ "br", 0, 2, 1, 0, 0, "forced line break " },
233{ "button", 0, 0, 0, 0, 0, "push button " },
234{ "caption", 0, 0, 0, 0, 0, "table caption " },
235{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
236{ "cite", 0, 0, 0, 0, 0, "citation" },
237{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
238{ "col", 0, 2, 1, 0, 0, "table column " },
239{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
240{ "dd", 0, 1, 0, 0, 0, "definition description " },
241{ "del", 0, 0, 0, 0, 0, "deleted text " },
242{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
243{ "dir", 0, 0, 0, 1, 1, "directory list" },
244{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
245{ "dl", 0, 0, 0, 0, 0, "definition list " },
246{ "dt", 0, 1, 0, 0, 0, "definition term " },
247{ "em", 0, 0, 0, 0, 0, "emphasis" },
248{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
249{ "font", 0, 0, 0, 1, 1, "local change to font " },
250{ "form", 0, 0, 0, 0, 0, "interactive form " },
251{ "frame", 0, 2, 1, 0, 2, "subwindow " },
252{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
253{ "h1", 0, 0, 0, 0, 0, "heading " },
254{ "h2", 0, 0, 0, 0, 0, "heading " },
255{ "h3", 0, 0, 0, 0, 0, "heading " },
256{ "h4", 0, 0, 0, 0, 0, "heading " },
257{ "h5", 0, 0, 0, 0, 0, "heading " },
258{ "h6", 0, 0, 0, 0, 0, "heading " },
259{ "head", 1, 1, 0, 0, 0, "document head " },
260{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
261{ "html", 1, 1, 0, 0, 0, "document root element " },
262{ "i", 0, 0, 0, 0, 0, "italic text style" },
263{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
264{ "img", 0, 2, 1, 0, 0, "embedded image " },
265{ "input", 0, 2, 1, 0, 0, "form control " },
266{ "ins", 0, 0, 0, 0, 0, "inserted text" },
267{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
268{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
269{ "label", 0, 0, 0, 0, 0, "form field label text " },
270{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
271{ "li", 0, 1, 0, 0, 0, "list item " },
272{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
273{ "map", 0, 0, 0, 0, 0, "client-side image map " },
274{ "menu", 0, 0, 0, 1, 1, "menu list " },
275{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
276{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
277{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
278{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
279{ "ol", 0, 0, 0, 0, 0, "ordered list " },
280{ "optgroup", 0, 0, 0, 0, 0, "option group " },
281{ "option", 0, 1, 0, 0, 0, "selectable choice " },
282{ "p", 0, 1, 0, 0, 0, "paragraph " },
283{ "param", 0, 2, 1, 0, 0, "named property value " },
284{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
285{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
286{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
287{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
288{ "script", 0, 0, 0, 0, 0, "script statements " },
289{ "select", 0, 0, 0, 0, 0, "option selector " },
290{ "small", 0, 0, 0, 0, 0, "small text style" },
291{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
292{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
293{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
294{ "style", 0, 0, 0, 0, 0, "style info " },
295{ "sub", 0, 0, 0, 0, 0, "subscript" },
296{ "sup", 0, 0, 0, 0, 0, "superscript " },
297{ "table", 0, 0, 0, 0, 0, "&#160;" },
298{ "tbody", 1, 1, 0, 0, 0, "table body " },
299{ "td", 0, 1, 0, 0, 0, "table data cell" },
300{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
301{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
302{ "th", 0, 1, 0, 0, 0, "table header cell" },
303{ "thead", 0, 1, 0, 0, 0, "table header " },
304{ "title", 0, 0, 0, 0, 0, "document title " },
305{ "tr", 0, 1, 0, 0, 0, "table row " },
306{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
307{ "u", 0, 0, 0, 1, 1, "underlined text style" },
308{ "ul", 0, 0, 0, 0, 0, "unordered list " },
309{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000310};
311
/*
 * Start tags that imply the end of a current element.
 * The table is a list of NULL-terminated groups, itself terminated by an
 * extra NULL: any tag of a group implies the end of the current element if
 * the type of that element belongs to the same group.
 */
char *htmlEquEnd[] = {
    /* list and option items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block level containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of the table */
    NULL
};
/*
 * According to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
328
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups, closed by one extra
 * NULL. The first string of a group is the name of the start tag being
 * opened; the following strings are the names of the open elements which
 * that start tag closes (see htmlCheckAutoClose()).
 */
char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", NULL,
    "td",         "th", "td", "p", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};

/*
 * Index over htmlStartClose: each used slot points at the first string of
 * one group, and the first unused slot is NULL.
 */
static char** htmlStartCloseIndex[100];
/* Set to 1 once htmlStartCloseIndex has been filled. */
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *									*
 * 		functions to handle HTML specific data			*
 *									*
 ************************************************************************/

/**
 * htmlInitAutoClose:
 *
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 * The index is built only once: the function records that it ran and any
 * later call returns immediately.
 * The last slot of the index is never filled, so the index always ends
 * with a NULL entry.
 */
void
htmlInitAutoClose(void) {
    const int nbSlots = (int) (sizeof(htmlStartCloseIndex) /
                               sizeof(htmlStartCloseIndex[0]));
    int slot, i = 0;

    if (htmlStartCloseIndexinitialized) return;

    for (slot = 0; slot < nbSlots; slot++) htmlStartCloseIndex[slot] = NULL;

    slot = 0;
    while ((htmlStartClose[i] != NULL) && (slot < nbSlots - 1)) {
        /* Record the head of the group ... */
        htmlStartCloseIndex[slot++] = &htmlStartClose[i];
        /* ... then jump past its NULL terminator. */
        while (htmlStartClose[i] != NULL) i++;
        i++;
    }

    /* Without this the index would be rebuilt on every lookup. */
    htmlStartCloseIndexinitialized = 1;
}
417
418/**
419 * htmlTagLookup:
420 * @tag: The tag name
421 *
422 * Lookup the HTML tag in the ElementTable
423 *
424 * Returns the related htmlElemDescPtr or NULL if not found.
425 */
426htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000427htmlTagLookup(const xmlChar *tag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000428 int i = 0;
429
430 for (i = 0; i < (sizeof(html40ElementTable) /
431 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000432 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000433 return(&html40ElementTable[i]);
434 }
435 return(NULL);
436}
437
438/**
439 * htmlCheckAutoClose:
440 * @new: The new tag name
441 * @old: The old tag name
442 *
443 * Checks wether the new tag is one of the registered valid tags for closing old.
444 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
445 *
446 * Returns 0 if no, 1 if yes.
447 */
448int
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000449htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000450 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000451 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000452
453 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
454
455 /* inefficient, but not a big deal */
456 for (index = 0; index < 100;index++) {
457 close = htmlStartCloseIndex[index];
458 if (close == NULL) return(0);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000459 if (!xmlStrcmp(BAD_CAST *close, new)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000460 }
461
462 i = close - htmlStartClose;
463 i++;
464 while (htmlStartClose[i] != NULL) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000465 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000466 return(1);
467 }
468 i++;
469 }
470 return(0);
471}
472
473/**
474 * htmlAutoClose:
475 * @ctxt: an HTML parser context
476 * @new: The new tag name
477 *
478 * The HTmL DtD allows a tag to implicitely close other tags.
479 * The list is kept in htmlStartClose array. This function is
480 * called when a new tag has been detected and generates the
481 * appropriates closes if possible/needed.
482 */
483void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000484htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000485 xmlChar *oldname;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000486 while ((ctxt->name != NULL) &&
487 (htmlCheckAutoClose(new, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000488#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000489 fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000490#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000491 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000492 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000493 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000494 if (oldname != NULL) {
495#ifdef DEBUG
496 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
497#endif
498 xmlFree(oldname);
499 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000500 }
501}
502
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000503/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000504 * htmlAutoCloseTag:
505 * @doc: the HTML document
506 * @name: The tag name
507 * @elem: the HTML element
508 *
509 * The HTmL DtD allows a tag to implicitely close other tags.
510 * The list is kept in htmlStartClose array. This function checks
511 * if the element or one of it's children would autoclose the
512 * given tag.
513 *
514 * Returns 1 if autoclose, 0 otherwise
515 */
516int
517htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
518 htmlNodePtr child;
519
520 if (elem == NULL) return(1);
521 if (!xmlStrcmp(name, elem->name)) return(0);
522 if (htmlCheckAutoClose(elem->name, name)) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000523 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000524 while (child != NULL) {
525 if (htmlAutoCloseTag(doc, name, child)) return(1);
526 child = child->next;
527 }
528 return(0);
529}
530
531/**
532 * htmlIsAutoClosed:
533 * @doc: the HTML document
534 * @elem: the HTML element
535 *
536 * The HTmL DtD allows a tag to implicitely close other tags.
537 * The list is kept in htmlStartClose array. This function checks
538 * if a tag is autoclosed by one of it's child
539 *
540 * Returns 1 if autoclosed, 0 otherwise
541 */
542int
543htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
544 htmlNodePtr child;
545
546 if (elem == NULL) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000547 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000548 while (child != NULL) {
549 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
550 child = child->next;
551 }
552 return(0);
553}
554
555/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000556 * htmlAutoCloseOnClose:
557 * @ctxt: an HTML parser context
558 * @new: The new tag name
559 *
560 * The HTmL DtD allows an ending tag to implicitely close other tags.
561 */
562void
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000563htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000564 htmlElemDescPtr info;
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000565 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000566 int i;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000567
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000568#ifdef DEBUG
569 fprintf(stderr,"Close of %s stack: %d elements\n", new, ctxt->nameNr);
570 for (i = 0;i < ctxt->nameNr;i++)
571 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
572#endif
573
574 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
575 if (!xmlStrcmp(new, ctxt->nameTab[i])) break;
576 }
577 if (i < 0) return;
578
579 while (xmlStrcmp(new, ctxt->name)) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000580 info = htmlTagLookup(ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000581 if ((info == NULL) || (info->endTag == 1)) {
582#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000583 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000584#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000585 } else {
586 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
587 ctxt->sax->error(ctxt->userData,
588 "Opening and ending tag mismatch: %s and %s\n",
589 new, ctxt->name);
590 ctxt->wellFormed = 0;
591 }
592 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
593 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000594 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000595 if (oldname != NULL) {
596#ifdef DEBUG
597 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
598#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000599 xmlFree(oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000600 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000601 }
602}
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000603
604/************************************************************************
605 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000606 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000607 * *
608 ************************************************************************/
609
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000610
611htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000612/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000613 * the 4 absolute ones,
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000614 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000615{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
616{ 38, "amp", "ampersand, U+0026 ISOnum" },
617{ 60, "lt", "less-than sign, U+003C ISOnum" },
618{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000619
620/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000621 * A bunch still in the 128-255 range
622 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000623 */
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000624{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000625{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
626{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
627{ 162, "cent", "cent sign, U+00A2 ISOnum" },
628{ 163, "pound","pound sign, U+00A3 ISOnum" },
629{ 164, "curren","currency sign, U+00A4 ISOnum" },
630{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
631{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
632{ 167, "sect", "section sign, U+00A7 ISOnum" },
633{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
634{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
635{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
636{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
637{ 172, "not", "not sign, U+00AC ISOnum" },
638{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
639{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
640{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
641{ 176, "deg", "degree sign, U+00B0 ISOnum" },
642{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
643{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
644{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
645{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
646{ 181, "micro","micro sign, U+00B5 ISOnum" },
647{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000648{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000649{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
650{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
651{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000652{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000653{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
654{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
655{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
656{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
657{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
658{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
659{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
660{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
661{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
662{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
663{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
664{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
665{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
666{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
667{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
668{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
669{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
670{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
671{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
672{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
673{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
674{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
675{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
676{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
677{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
678{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
679{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
680{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000681{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000682{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
683{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
684{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
685{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
686{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
687{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
688{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
689{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
690{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
691{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
692{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
693{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
694{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
695{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
696{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
697{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
698{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
699{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
700{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
701{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
702{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
703{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
704{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
705{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
706{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
707{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
708{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
709{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
710{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
711{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
712{ 247, "divide","division sign, U+00F7 ISOnum" },
713{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
714{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
715{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
716{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
717{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
718{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
719{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
720{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000721
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000722/*
723 * Anything below should really be kept as entities references
724 */
725{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000726
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000727{ 913, "Alpha","greek capital letter alpha, U+0391" },
728{ 914, "Beta", "greek capital letter beta, U+0392" },
729{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
730{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
731{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
732{ 918, "Zeta", "greek capital letter zeta, U+0396" },
733{ 919, "Eta", "greek capital letter eta, U+0397" },
734{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
735{ 921, "Iota", "greek capital letter iota, U+0399" },
736{ 922, "Kappa","greek capital letter kappa, U+039A" },
737{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
738{ 924, "Mu", "greek capital letter mu, U+039C" },
739{ 925, "Nu", "greek capital letter nu, U+039D" },
740{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
741{ 927, "Omicron","greek capital letter omicron, U+039F" },
742{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
743{ 929, "Rho", "greek capital letter rho, U+03A1" },
744{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
745{ 932, "Tau", "greek capital letter tau, U+03A4" },
746{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
747{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
748{ 935, "Chi", "greek capital letter chi, U+03A7" },
749{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
750{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000751
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000752{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
753{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
754{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
755{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
756{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
757{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
758{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
759{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
760{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
761{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
762{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
763{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
764{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
765{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
766{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
767{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
768{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
769{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
770{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
771{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
772{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
773{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
774{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
775{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
776{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
777{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
778{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
779{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000780
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000781{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
782{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
783{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
784{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
785{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
786{ 8260, "frasl","fraction slash, U+2044 NEW" },
787
Daniel Veillardb05deb71999-08-10 19:04:08 +0000788{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000789{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
790{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
791{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
792{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
793{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
794{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
795{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
796{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
797{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
798{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
799{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
800{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
801{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
802{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
803{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
804
805
806{ 8704, "forall","for all, U+2200 ISOtech" },
807{ 8706, "part", "partial differential, U+2202 ISOtech" },
808{ 8707, "exist","there exists, U+2203 ISOtech" },
809{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
810{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
811{ 8712, "isin", "element of, U+2208 ISOtech" },
812{ 8713, "notin","not an element of, U+2209 ISOtech" },
813{ 8715, "ni", "contains as member, U+220B ISOtech" },
814{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
815{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
816{ 8722, "minus","minus sign, U+2212 ISOtech" },
817{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
818{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
819{ 8733, "prop", "proportional to, U+221D ISOtech" },
820{ 8734, "infin","infinity, U+221E ISOtech" },
821{ 8736, "ang", "angle, U+2220 ISOamso" },
822{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
823{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
824{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
825{ 8746, "cup", "union = cup, U+222A ISOtech" },
826{ 8747, "int", "integral, U+222B ISOtech" },
827{ 8756, "there4","therefore, U+2234 ISOtech" },
828{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
829{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
830{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
831{ 8800, "ne", "not equal to, U+2260 ISOtech" },
832{ 8801, "equiv","identical to, U+2261 ISOtech" },
833{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
834{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
835{ 8834, "sub", "subset of, U+2282 ISOtech" },
836{ 8835, "sup", "superset of, U+2283 ISOtech" },
837{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
838{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
839{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
840{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
841{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
842{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
843{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
844{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
845{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
846{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
847{ 8971, "rfloor","right floor, U+230B ISOamsc" },
848{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
849{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
850{ 9674, "loz", "lozenge, U+25CA ISOpub" },
851
852{ 9824, "spades","black spade suit, U+2660 ISOpub" },
853{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
854{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
855{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
856
857{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
858{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
859{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
860{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
861{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
862{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
863{ 732, "tilde","small tilde, U+02DC ISOdia" },
864
865{ 8194, "ensp", "en space, U+2002 ISOpub" },
866{ 8195, "emsp", "em space, U+2003 ISOpub" },
867{ 8201, "thinsp","thin space, U+2009 ISOpub" },
868{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
869{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
870{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
871{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
872{ 8211, "ndash","en dash, U+2013 ISOpub" },
873{ 8212, "mdash","em dash, U+2014 ISOpub" },
874{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
875{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
876{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
877{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
878{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
879{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
880{ 8224, "dagger","dagger, U+2020 ISOpub" },
881{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
882{ 8240, "permil","per mille sign, U+2030 ISOtech" },
883{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000884{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000885{ 8364, "euro", "euro sign, U+20AC NEW" }
886};
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000887
888/************************************************************************
889 * *
890 * Commodity functions to handle entities *
891 * *
892 ************************************************************************/
893
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> accordingly.  It expects
 * an integer variable named <buffer>_size to be in scope and can only be
 * used inside a function returning a pointer, since it does return(NULL)
 * when the reallocation fails.
 *
 * The result of xmlRealloc() goes through a temporary: on failure the
 * original block is still valid, so it is released here instead of being
 * leaked by overwriting the only pointer to it.
 *
 * The expansion is kept as a plain brace block (not do/while(0)) so that
 * existing call sites keep compiling whether or not they add a ';'.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
				buffer##_size * sizeof(xmlChar));	\
    if (buffer##_tmp == NULL) {						\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
905
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000906/**
907 * htmlEntityLookup:
908 * @name: the entity name
909 *
910 * Lookup the given entity in EntitiesTable
911 *
912 * TODO: the linear scan is really ugly, an hash table is really needed.
913 *
914 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
915 */
916htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000917htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000918 int i;
919
920 for (i = 0;i < (sizeof(html40EntitiesTable)/
921 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000922 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000923#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000924 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000925#endif
926 return(&html40EntitiesTable[i]);
927 }
928 }
929 return(NULL);
930}
931
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000932
933/**
934 * htmlDecodeEntities:
935 * @ctxt: the parser context
936 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000937 * @end: an end marker xmlChar, 0 if none
938 * @end2: an end marker xmlChar, 0 if none
939 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000940 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000941 * Subtitute the HTML entities by their value
942 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000943 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000944 *
945 * Returns A newly allocated string with the substitution done. The caller
946 * must deallocate it !
947 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000948xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000949htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000950 xmlChar end, xmlChar end2, xmlChar end3) {
951 xmlChar *buffer = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000952 int buffer_size = 0;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000953 xmlChar *out = NULL;
954 xmlChar *name = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000955
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000956 xmlChar *cur = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000957 htmlEntityDescPtr ent;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000958 int nbchars = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000959 unsigned int max = (unsigned int) len;
960
961 /*
962 * allocate a translation buffer.
963 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +0000964 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000965 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000966 if (buffer == NULL) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000967 perror("htmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000968 return(NULL);
969 }
970 out = buffer;
971
972 /*
973 * Ok loop until we reach one of the ending char or a size limit.
974 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000975 while ((nbchars < max) && (CUR != end) &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000976 (CUR != end2) && (CUR != end3)) {
977
978 if (CUR == '&') {
979 if (NXT(1) == '#') {
980 int val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +0000981 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000982 *out++ = val;
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000983 nbchars += 3; /* !!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000984 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000985 ent = htmlParseEntityRef(ctxt, &name);
986 if (name != NULL) {
987 if ((ent == NULL) || (ent->value <= 0) ||
988 (ent->value >= 255)) {
989 *out++ = '&';
990 cur = name;
991 while (*cur != 0) {
992 if (out - buffer > buffer_size - 100) {
993 int index = out - buffer;
994
995 growBuffer(buffer);
996 out = &buffer[index];
997 }
998 *out++ = *cur++;
999 }
1000 *out++ = ';';
1001 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001002 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001003 *out++ = (xmlChar)ent->value;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001004 if (out - buffer > buffer_size - 100) {
1005 int index = out - buffer;
1006
1007 growBuffer(buffer);
1008 out = &buffer[index];
1009 }
1010 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001011 nbchars += 2 + xmlStrlen(name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00001012 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001013 }
1014 }
1015 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001016 /* invalid for UTF-8 , use COPY(out); !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001017 *out++ = CUR;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001018 nbchars++;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001019 if (out - buffer > buffer_size - 100) {
1020 int index = out - buffer;
1021
1022 growBuffer(buffer);
1023 out = &buffer[index];
1024 }
1025 NEXT;
1026 }
1027 }
1028 *out++ = 0;
1029 return(buffer);
1030}
1031
1032
1033/************************************************************************
1034 * *
1035 * Commodity functions to handle encodings *
1036 * *
1037 ************************************************************************/
1038
1039/**
1040 * htmlSwitchEncoding:
1041 * @ctxt: the parser context
1042 * @len: the len of @cur
1043 *
1044 * change the input functions when discovering the character encoding
1045 * of a given entity.
1046 *
1047 */
1048void
1049htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
1050{
1051 switch (enc) {
1052 case XML_CHAR_ENCODING_ERROR:
1053 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1054 ctxt->sax->error(ctxt->userData, "encoding unknown\n");
1055 ctxt->wellFormed = 0;
1056 break;
1057 case XML_CHAR_ENCODING_NONE:
1058 /* let's assume it's UTF-8 without the XML decl */
1059 return;
1060 case XML_CHAR_ENCODING_UTF8:
1061 /* default encoding, no conversion should be needed */
1062 return;
1063 case XML_CHAR_ENCODING_UTF16LE:
1064 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1065 ctxt->sax->error(ctxt->userData,
1066 "char encoding UTF16 little endian not supported\n");
1067 break;
1068 case XML_CHAR_ENCODING_UTF16BE:
1069 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1070 ctxt->sax->error(ctxt->userData,
1071 "char encoding UTF16 big endian not supported\n");
1072 break;
1073 case XML_CHAR_ENCODING_UCS4LE:
1074 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1075 ctxt->sax->error(ctxt->userData,
1076 "char encoding USC4 little endian not supported\n");
1077 break;
1078 case XML_CHAR_ENCODING_UCS4BE:
1079 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1080 ctxt->sax->error(ctxt->userData,
1081 "char encoding USC4 big endian not supported\n");
1082 break;
1083 case XML_CHAR_ENCODING_EBCDIC:
1084 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1085 ctxt->sax->error(ctxt->userData,
1086 "char encoding EBCDIC not supported\n");
1087 break;
1088 case XML_CHAR_ENCODING_UCS4_2143:
1089 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1090 ctxt->sax->error(ctxt->userData,
1091 "char encoding UCS4 2143 not supported\n");
1092 break;
1093 case XML_CHAR_ENCODING_UCS4_3412:
1094 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1095 ctxt->sax->error(ctxt->userData,
1096 "char encoding UCS4 3412 not supported\n");
1097 break;
1098 case XML_CHAR_ENCODING_UCS2:
1099 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1100 ctxt->sax->error(ctxt->userData,
1101 "char encoding UCS2 not supported\n");
1102 break;
1103 case XML_CHAR_ENCODING_8859_1:
1104 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1105 ctxt->sax->error(ctxt->userData,
1106 "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
1107 break;
1108 case XML_CHAR_ENCODING_8859_2:
1109 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1110 ctxt->sax->error(ctxt->userData,
1111 "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
1112 break;
1113 case XML_CHAR_ENCODING_8859_3:
1114 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1115 ctxt->sax->error(ctxt->userData,
1116 "char encoding ISO_8859_3 not supported\n");
1117 break;
1118 case XML_CHAR_ENCODING_8859_4:
1119 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1120 ctxt->sax->error(ctxt->userData,
1121 "char encoding ISO_8859_4 not supported\n");
1122 break;
1123 case XML_CHAR_ENCODING_8859_5:
1124 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1125 ctxt->sax->error(ctxt->userData,
1126 "char encoding ISO_8859_5 not supported\n");
1127 break;
1128 case XML_CHAR_ENCODING_8859_6:
1129 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1130 ctxt->sax->error(ctxt->userData,
1131 "char encoding ISO_8859_6 not supported\n");
1132 break;
1133 case XML_CHAR_ENCODING_8859_7:
1134 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1135 ctxt->sax->error(ctxt->userData,
1136 "char encoding ISO_8859_7 not supported\n");
1137 break;
1138 case XML_CHAR_ENCODING_8859_8:
1139 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1140 ctxt->sax->error(ctxt->userData,
1141 "char encoding ISO_8859_8 not supported\n");
1142 break;
1143 case XML_CHAR_ENCODING_8859_9:
1144 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1145 ctxt->sax->error(ctxt->userData,
1146 "char encoding ISO_8859_9 not supported\n");
1147 break;
1148 case XML_CHAR_ENCODING_2022_JP:
1149 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1150 ctxt->sax->error(ctxt->userData,
1151 "char encoding ISO-2022-JPnot supported\n");
1152 break;
1153 case XML_CHAR_ENCODING_SHIFT_JIS:
1154 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1155 ctxt->sax->error(ctxt->userData,
1156 "char encoding Shift_JISnot supported\n");
1157 break;
1158 case XML_CHAR_ENCODING_EUC_JP:
1159 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1160 ctxt->sax->error(ctxt->userData,
1161 "char encoding EUC-JPnot supported\n");
1162 break;
1163 }
1164}
1165
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001166/************************************************************************
1167 * *
1168 * Commodity functions to handle streams *
1169 * *
1170 ************************************************************************/
1171
1172/**
1173 * htmlFreeInputStream:
1174 * @input: an htmlParserInputPtr
1175 *
1176 * Free up an input stream.
1177 */
1178void
1179htmlFreeInputStream(htmlParserInputPtr input) {
1180 if (input == NULL) return;
1181
1182 if (input->filename != NULL) xmlFree((char *) input->filename);
1183 if (input->directory != NULL) xmlFree((char *) input->directory);
1184 if ((input->free != NULL) && (input->base != NULL))
1185 input->free((xmlChar *) input->base);
1186 if (input->buf != NULL)
1187 xmlFreeParserInputBuffer(input->buf);
1188 memset(input, -1, sizeof(htmlParserInput));
1189 xmlFree(input);
1190}
1191
1192/**
1193 * htmlNewInputStream:
1194 * @ctxt: an HTML parser context
1195 *
1196 * Create a new input stream structure
1197 * Returns the new input stream or NULL
1198 */
1199htmlParserInputPtr
1200htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1201 htmlParserInputPtr input;
1202
1203 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1204 if (input == NULL) {
1205 ctxt->errNo = XML_ERR_NO_MEMORY;
1206 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1207 ctxt->sax->error(ctxt->userData,
1208 "malloc: couldn't allocate a new input stream\n");
1209 ctxt->errNo = XML_ERR_NO_MEMORY;
1210 return(NULL);
1211 }
1212 input->filename = NULL;
1213 input->directory = NULL;
1214 input->base = NULL;
1215 input->cur = NULL;
1216 input->buf = NULL;
1217 input->line = 1;
1218 input->col = 1;
1219 input->buf = NULL;
1220 input->free = NULL;
1221 input->consumed = 0;
1222 input->length = 0;
1223 return(input);
1224}
1225
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001226
1227/************************************************************************
1228 * *
1229 * Commodity functions, cleanup needed ? *
1230 * *
1231 ************************************************************************/
1232
1233/**
1234 * areBlanks:
1235 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001236 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001237 * @len: the size of @str
1238 *
1239 * Is this a sequence of blank chars that one can ignore ?
1240 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001241 * Returns 1 if ignorable 0 otherwise.
1242 */
1243
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001244static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001245 int i;
1246 xmlNodePtr lastChild;
1247
1248 for (i = 0;i < len;i++)
1249 if (!(IS_BLANK(str[i]))) return(0);
1250
1251 if (CUR != '<') return(0);
1252 if (ctxt->node == NULL) return(0);
1253 lastChild = xmlGetLastChild(ctxt->node);
1254 if (lastChild == NULL) {
1255 if (ctxt->node->content != NULL) return(0);
1256 } else if (xmlNodeIsText(lastChild))
1257 return(0);
1258 return(1);
1259}
1260
1261/**
1262 * htmlHandleEntity:
1263 * @ctxt: an HTML parser context
1264 * @entity: an XML entity pointer.
1265 *
1266 * Default handling of an HTML entity, call the parser with the
1267 * substitution string
1268 */
1269
1270void
1271htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1272 int len;
1273
1274 if (entity->content == NULL) {
1275 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1276 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1277 entity->name);
1278 ctxt->wellFormed = 0;
1279 return;
1280 }
1281 len = xmlStrlen(entity->content);
1282
1283 /*
1284 * Just handle the content as a set of chars.
1285 */
1286 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1287 ctxt->sax->characters(ctxt->userData, entity->content, len);
1288
1289}
1290
1291/**
1292 * htmlNewDoc:
1293 * @URI: URI for the dtd, or NULL
1294 * @ExternalID: the external ID of the DTD, or NULL
1295 *
1296 * Returns a new document
1297 */
1298htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001299htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001300 xmlDocPtr cur;
1301
1302 /*
1303 * Allocate a new document and fill the fields.
1304 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001305 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001306 if (cur == NULL) {
1307 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1308 return(NULL);
1309 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001310 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001311
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001312 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001313 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001314 cur->intSubset = NULL;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001315 if ((ExternalID == NULL) &&
1316 (URI == NULL))
1317 xmlCreateIntSubset(cur, BAD_CAST "HTML",
1318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1319 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1320 else
1321 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001322 cur->name = NULL;
Daniel Veillardcf461992000-03-14 18:30:20 +00001323 cur->children = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001324 cur->extSubset = NULL;
1325 cur->oldNs = NULL;
1326 cur->encoding = NULL;
1327 cur->standalone = 1;
1328 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001329 cur->ids = NULL;
1330 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001331#ifndef XML_WITHOUT_CORBA
1332 cur->_private = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001333#endif
1334 return(cur);
1335}
1336
1337
1338/************************************************************************
1339 * *
1340 * The parser itself *
1341 * Relates to http://www.w3.org/TR/html40 *
1342 * *
1343 ************************************************************************/
1344
1345/************************************************************************
1346 * *
1347 * The parser itself *
1348 * *
1349 ************************************************************************/
1350
1351/**
1352 * htmlParseHTMLName:
1353 * @ctxt: an HTML parser context
1354 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001355 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001356 * since HTML names are not case-sensitive.
1357 *
1358 * Returns the Tag Name parsed or NULL
1359 */
1360
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001361xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001362htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001363 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001364 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001365 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001366
1367 if (!IS_LETTER(CUR) && (CUR != '_') &&
1368 (CUR != ':')) return(NULL);
1369
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001370 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1371 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001372 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001373 else loc[i] = CUR;
1374 i++;
1375
1376 NEXT;
1377 }
1378
1379 ret = xmlStrndup(loc, i);
1380
1381 return(ret);
1382}
1383
1384/**
1385 * htmlParseName:
1386 * @ctxt: an HTML parser context
1387 *
1388 * parse an HTML name, this routine is case sensistive.
1389 *
1390 * Returns the Name parsed or NULL
1391 */
1392
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001393xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001394htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001395 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001396 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001397
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001398 GROW;
1399 if (!IS_LETTER(CUR) && (CUR != '_')) {
1400 return(NULL);
1401 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001402
1403 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1404 (CUR == '.') || (CUR == '-') ||
1405 (CUR == '_') || (CUR == ':') ||
1406 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001407 (IS_EXTENDER(CUR))) {
1408 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001409 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001410 if (len >= HTML_MAX_NAMELEN) {
1411 fprintf(stderr,
1412 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1413 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1414 (CUR == '.') || (CUR == '-') ||
1415 (CUR == '_') || (CUR == ':') ||
1416 (IS_COMBINING(CUR)) ||
1417 (IS_EXTENDER(CUR)))
1418 NEXT;
1419 break;
1420 }
1421 }
1422 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001423}
1424
1425/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001426 * htmlParseHTMLAttribute:
1427 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001428 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001429 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001430 * parse an HTML attribute value till the stop (quote), if
1431 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001432 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001433 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001434 */
1435
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001436xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001437htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001438#if 0
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001439 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001440 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001441
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001442 GROW;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001443 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1444 if ((stop == 0) && (IS_BLANK(CUR))) break;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001445 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001446 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001447 if (len >= HTML_MAX_NAMELEN) {
1448 fprintf(stderr,
1449 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1450 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001451 (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001452 (CUR != '\'') && (CUR != '"'))
1453 NEXT;
1454 break;
1455 }
1456 }
1457 return(xmlStrndup(buf, len));
Daniel Veillard71b656e2000-01-05 14:46:17 +00001458#else
1459 xmlChar *buffer = NULL;
1460 int buffer_size = 0;
1461 xmlChar *out = NULL;
1462 xmlChar *name = NULL;
1463
1464 xmlChar *cur = NULL;
1465 htmlEntityDescPtr ent;
1466
1467 /*
1468 * allocate a translation buffer.
1469 */
1470 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1471 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1472 if (buffer == NULL) {
1473 perror("htmlParseHTMLAttribute: malloc failed");
1474 return(NULL);
1475 }
1476 out = buffer;
1477
1478 /*
1479 * Ok loop until we reach one of the ending chars
1480 */
1481 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1482 if ((stop == 0) && (IS_BLANK(CUR))) break;
1483 if (CUR == '&') {
1484 if (NXT(1) == '#') {
1485 int val = htmlParseCharRef(ctxt);
1486 *out++ = val;
1487 } else {
1488 ent = htmlParseEntityRef(ctxt, &name);
1489 if (name == NULL) {
1490 *out++ = '&';
1491 if (out - buffer > buffer_size - 100) {
1492 int index = out - buffer;
1493
1494 growBuffer(buffer);
1495 out = &buffer[index];
1496 }
1497 } else if ((ent == NULL) || (ent->value <= 0) ||
1498 (ent->value >= 255)) {
1499 *out++ = '&';
1500 cur = name;
1501 while (*cur != 0) {
1502 if (out - buffer > buffer_size - 100) {
1503 int index = out - buffer;
1504
1505 growBuffer(buffer);
1506 out = &buffer[index];
1507 }
1508 *out++ = *cur++;
1509 }
1510 xmlFree(name);
1511 } else {
1512 *out++ = ent->value;
1513 if (out - buffer > buffer_size - 100) {
1514 int index = out - buffer;
1515
1516 growBuffer(buffer);
1517 out = &buffer[index];
1518 }
1519 xmlFree(name);
1520 }
1521 }
1522 } else {
1523 *out++ = CUR;
1524 if (out - buffer > buffer_size - 100) {
1525 int index = out - buffer;
1526
1527 growBuffer(buffer);
1528 out = &buffer[index];
1529 }
1530 NEXT;
1531 }
1532 }
1533 *out++ = 0;
1534 return(buffer);
1535#endif
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001536}
1537
1538/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001539 * htmlParseNmtoken:
1540 * @ctxt: an HTML parser context
1541 *
1542 * parse an HTML Nmtoken.
1543 *
1544 * Returns the Nmtoken parsed or NULL
1545 */
1546
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001547xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001548htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001549 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001550 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001551
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001552 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001553 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1554 (CUR == '.') || (CUR == '-') ||
1555 (CUR == '_') || (CUR == ':') ||
1556 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001557 (IS_EXTENDER(CUR))) {
1558 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001559 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001560 if (len >= HTML_MAX_NAMELEN) {
1561 fprintf(stderr,
1562 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1563 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1564 (CUR == '.') || (CUR == '-') ||
1565 (CUR == '_') || (CUR == ':') ||
1566 (IS_COMBINING(CUR)) ||
1567 (IS_EXTENDER(CUR)))
1568 NEXT;
1569 break;
1570 }
1571 }
1572 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001573}
1574
1575/**
1576 * htmlParseEntityRef:
1577 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001578 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001579 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001580 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001581 *
1582 * [68] EntityRef ::= '&' Name ';'
1583 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001584 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1585 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001586 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001587htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001588htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1589 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001590 htmlEntityDescPtr ent = NULL;
1591 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001592
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001593 if (CUR == '&') {
1594 NEXT;
1595 name = htmlParseName(ctxt);
1596 if (name == NULL) {
1597 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1598 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1599 ctxt->wellFormed = 0;
1600 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001601 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001602 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001603 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001604
1605 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001606 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001607 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001608 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00001609 if (ent != NULL) /* OK that's ugly !!! */
1610 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001611 } else {
1612 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1613 ctxt->sax->error(ctxt->userData,
1614 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00001615 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001616 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001617 }
1618 }
1619 return(ent);
1620}
1621
1622/**
1623 * htmlParseAttValue:
1624 * @ctxt: an HTML parser context
1625 *
1626 * parse a value for an attribute
1627 * Note: the parser won't do substitution of entities here, this
1628 * will be handled later in xmlStringGetNodeList, unless it was
1629 * asked for ctxt->replaceEntities != 0
1630 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001631 * Returns the AttValue parsed or NULL.
1632 */
1633
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001634xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001635htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001636 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001637
1638 if (CUR == '"') {
1639 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001640 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001641 if (CUR != '"') {
1642 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1643 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1644 ctxt->wellFormed = 0;
1645 } else
1646 NEXT;
1647 } else if (CUR == '\'') {
1648 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001649 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001650 if (CUR != '\'') {
1651 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1652 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1653 ctxt->wellFormed = 0;
1654 } else
1655 NEXT;
1656 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001657 /*
1658 * That's an HTMLism, the attribute value may not be quoted
1659 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001660 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001661 if (ret == NULL) {
1662 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1663 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1664 ctxt->wellFormed = 0;
1665 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001666 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001667 return(ret);
1668}
1669
1670/**
1671 * htmlParseSystemLiteral:
1672 * @ctxt: an HTML parser context
1673 *
1674 * parse an HTML Literal
1675 *
1676 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1677 *
1678 * Returns the SystemLiteral parsed or NULL
1679 */
1680
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001681xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001682htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001683 const xmlChar *q;
1684 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001685
1686 if (CUR == '"') {
1687 NEXT;
1688 q = CUR_PTR;
1689 while ((IS_CHAR(CUR)) && (CUR != '"'))
1690 NEXT;
1691 if (!IS_CHAR(CUR)) {
1692 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1693 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1694 ctxt->wellFormed = 0;
1695 } else {
1696 ret = xmlStrndup(q, CUR_PTR - q);
1697 NEXT;
1698 }
1699 } else if (CUR == '\'') {
1700 NEXT;
1701 q = CUR_PTR;
1702 while ((IS_CHAR(CUR)) && (CUR != '\''))
1703 NEXT;
1704 if (!IS_CHAR(CUR)) {
1705 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1706 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1707 ctxt->wellFormed = 0;
1708 } else {
1709 ret = xmlStrndup(q, CUR_PTR - q);
1710 NEXT;
1711 }
1712 } else {
1713 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardcf461992000-03-14 18:30:20 +00001714 ctxt->sax->error(ctxt->userData,
1715 "SystemLiteral \" or ' expected\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001716 ctxt->wellFormed = 0;
1717 }
1718
1719 return(ret);
1720}
1721
1722/**
1723 * htmlParsePubidLiteral:
1724 * @ctxt: an HTML parser context
1725 *
1726 * parse an HTML public literal
1727 *
1728 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1729 *
1730 * Returns the PubidLiteral parsed or NULL.
1731 */
1732
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001733xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001734htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001735 const xmlChar *q;
1736 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001737 /*
1738 * Name ::= (Letter | '_') (NameChar)*
1739 */
1740 if (CUR == '"') {
1741 NEXT;
1742 q = CUR_PTR;
1743 while (IS_PUBIDCHAR(CUR)) NEXT;
1744 if (CUR != '"') {
1745 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1746 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1747 ctxt->wellFormed = 0;
1748 } else {
1749 ret = xmlStrndup(q, CUR_PTR - q);
1750 NEXT;
1751 }
1752 } else if (CUR == '\'') {
1753 NEXT;
1754 q = CUR_PTR;
1755 while ((IS_LETTER(CUR)) && (CUR != '\''))
1756 NEXT;
1757 if (!IS_LETTER(CUR)) {
1758 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1759 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1760 ctxt->wellFormed = 0;
1761 } else {
1762 ret = xmlStrndup(q, CUR_PTR - q);
1763 NEXT;
1764 }
1765 } else {
1766 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1767 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1768 ctxt->wellFormed = 0;
1769 }
1770
1771 return(ret);
1772}
1773
1774/**
1775 * htmlParseCharData:
1776 * @ctxt: an HTML parser context
1777 * @cdata: int indicating whether we are within a CDATA section
1778 *
1779 * parse a CharData section.
1780 * if we are within a CDATA section ']]>' marks an end of section.
1781 *
1782 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1783 */
1784
1785void
1786htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001787 xmlChar *buf = NULL;
1788 int len = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001789 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001790 xmlChar q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001791
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001792 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1793 if (buf == NULL) {
1794 fprintf(stderr, "malloc of %d byte failed\n", size);
1795 return;
1796 }
1797
1798 q = CUR;
1799 while ((IS_CHAR(q)) && (q != '<') &&
1800 (q != '&')) {
1801 if ((q == ']') && (NXT(1) == ']') &&
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001802 (NXT(2) == '>')) {
1803 if (cdata) break;
1804 else {
1805 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1806 ctxt->sax->error(ctxt->userData,
1807 "Sequence ']]>' not allowed in content\n");
1808 ctxt->wellFormed = 0;
1809 }
1810 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001811 if (len + 1 >= size) {
1812 size *= 2;
1813 buf = xmlRealloc(buf, size * sizeof(xmlChar));
1814 if (buf == NULL) {
1815 fprintf(stderr, "realloc of %d byte failed\n", size);
1816 return;
1817 }
1818 }
1819 buf[len++] = q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001820 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001821 q = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001822 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001823 if (len == 0) {
1824 xmlFree(buf);
1825 return;
1826 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001827
1828 /*
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001829 * Ok the buffer is to be consumed as chars.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001830 */
1831 if (ctxt->sax != NULL) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001832 if (areBlanks(ctxt, buf, len)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001833 if (ctxt->sax->ignorableWhitespace != NULL)
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001834 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001835 } else {
1836 if (ctxt->sax->characters != NULL)
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001837 ctxt->sax->characters(ctxt->userData, buf, len);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001838 }
1839 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001840 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001841}
1842
1843/**
1844 * htmlParseExternalID:
1845 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001846 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001847 * @strict: indicate whether we should restrict parsing to only
1848 * production [75], see NOTE below
1849 *
1850 * Parse an External ID or a Public ID
1851 *
1852 * NOTE: Productions [75] and [83] interract badly since [75] can generate
1853 * 'PUBLIC' S PubidLiteral S SystemLiteral
1854 *
1855 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1856 * | 'PUBLIC' S PubidLiteral S SystemLiteral
1857 *
1858 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1859 *
1860 * Returns the function returns SystemLiteral and in the second
1861 * case publicID receives PubidLiteral, is strict is off
1862 * it is possible to return NULL and have publicID set.
1863 */
1864
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001865xmlChar *
1866htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1867 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001868
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001869 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1870 (UPP(2) == 'S') && (UPP(3) == 'T') &&
1871 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001872 SKIP(6);
1873 if (!IS_BLANK(CUR)) {
1874 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1875 ctxt->sax->error(ctxt->userData,
1876 "Space required after 'SYSTEM'\n");
1877 ctxt->wellFormed = 0;
1878 }
1879 SKIP_BLANKS;
1880 URI = htmlParseSystemLiteral(ctxt);
1881 if (URI == NULL) {
1882 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1883 ctxt->sax->error(ctxt->userData,
1884 "htmlParseExternalID: SYSTEM, no URI\n");
1885 ctxt->wellFormed = 0;
1886 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001887 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1888 (UPP(2) == 'B') && (UPP(3) == 'L') &&
1889 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001890 SKIP(6);
1891 if (!IS_BLANK(CUR)) {
1892 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1893 ctxt->sax->error(ctxt->userData,
1894 "Space required after 'PUBLIC'\n");
1895 ctxt->wellFormed = 0;
1896 }
1897 SKIP_BLANKS;
1898 *publicID = htmlParsePubidLiteral(ctxt);
1899 if (*publicID == NULL) {
1900 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1901 ctxt->sax->error(ctxt->userData,
1902 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1903 ctxt->wellFormed = 0;
1904 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001905 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001906 if ((CUR == '"') || (CUR == '\'')) {
1907 URI = htmlParseSystemLiteral(ctxt);
1908 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001909 }
1910 return(URI);
1911}
1912
1913/**
1914 * htmlParseComment:
1915 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001916 *
1917 * Parse an XML (SGML) comment <!-- .... -->
1918 *
1919 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1920 */
1921void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001922htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001923 xmlChar *buf = NULL;
1924 int len = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001925 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001926 register xmlChar s, r, q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001927
1928 /*
1929 * Check that there is a comment right here.
1930 */
1931 if ((CUR != '<') || (NXT(1) != '!') ||
1932 (NXT(2) != '-') || (NXT(3) != '-')) return;
1933
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001934 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1935 if (buf == NULL) {
1936 fprintf(stderr, "malloc of %d byte failed\n", size);
1937 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001938 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001939 q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
1940 SKIP(4);
1941 s = CUR;
1942
1943 while (IS_CHAR(s) &&
1944 ((s != '>') || (r != '-') || (q != '-'))) {
1945 if (len + 1 >= size) {
1946 size *= 2;
1947 buf = xmlRealloc(buf, size * sizeof(xmlChar));
1948 if (buf == NULL) {
1949 fprintf(stderr, "realloc of %d byte failed\n", size);
1950 return;
1951 }
1952 }
1953 buf[len++] = s;
1954 NEXT;
1955 q = r;
1956 r = s;
1957 s = CUR;
1958 }
1959 buf[len - 2] = 0;
1960 if (!IS_CHAR(s)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001961 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001962 ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001963 ctxt->wellFormed = 0;
1964 } else {
1965 NEXT;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001966 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
1967 ctxt->sax->comment(ctxt->userData, buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001968 }
1969 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001970 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001971}
1972
1973/**
1974 * htmlParseCharRef:
1975 * @ctxt: an HTML parser context
1976 *
1977 * parse Reference declarations
1978 *
1979 * [66] CharRef ::= '&#' [0-9]+ ';' |
1980 * '&#x' [0-9a-fA-F]+ ';'
1981 *
1982 * Returns the value parsed (as an int)
1983 */
1984int
1985htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1986 int val = 0;
1987
1988 if ((CUR == '&') && (NXT(1) == '#') &&
1989 (NXT(2) == 'x')) {
1990 SKIP(3);
1991 while (CUR != ';') {
1992 if ((CUR >= '0') && (CUR <= '9'))
1993 val = val * 16 + (CUR - '0');
1994 else if ((CUR >= 'a') && (CUR <= 'f'))
1995 val = val * 16 + (CUR - 'a') + 10;
1996 else if ((CUR >= 'A') && (CUR <= 'F'))
1997 val = val * 16 + (CUR - 'A') + 10;
1998 else {
1999 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2000 ctxt->sax->error(ctxt->userData,
2001 "htmlParseCharRef: invalid hexadecimal value\n");
2002 ctxt->wellFormed = 0;
2003 val = 0;
2004 break;
2005 }
2006 NEXT;
2007 }
2008 if (CUR == ';')
2009 NEXT;
2010 } else if ((CUR == '&') && (NXT(1) == '#')) {
2011 SKIP(2);
2012 while (CUR != ';') {
2013 if ((CUR >= '0') && (CUR <= '9'))
2014 val = val * 10 + (CUR - '0');
2015 else {
2016 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2017 ctxt->sax->error(ctxt->userData,
2018 "htmlParseCharRef: invalid decimal value\n");
2019 ctxt->wellFormed = 0;
2020 val = 0;
2021 break;
2022 }
2023 NEXT;
2024 }
2025 if (CUR == ';')
2026 NEXT;
2027 } else {
2028 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2029 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2030 ctxt->wellFormed = 0;
2031 }
2032 /*
2033 * Check the value IS_CHAR ...
2034 */
2035 if (IS_CHAR(val)) {
2036 return(val);
2037 } else {
2038 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002039 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002040 val);
2041 ctxt->wellFormed = 0;
2042 }
2043 return(0);
2044}
2045
2046
2047/**
2048 * htmlParseDocTypeDecl :
2049 * @ctxt: an HTML parser context
2050 *
2051 * parse a DOCTYPE declaration
2052 *
2053 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2054 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2055 */
2056
2057void
2058htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002059 xmlChar *name;
2060 xmlChar *ExternalID = NULL;
2061 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002062
2063 /*
2064 * We know that '<!DOCTYPE' has been detected.
2065 */
2066 SKIP(9);
2067
2068 SKIP_BLANKS;
2069
2070 /*
2071 * Parse the DOCTYPE name.
2072 */
2073 name = htmlParseName(ctxt);
2074 if (name == NULL) {
2075 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2076 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2077 ctxt->wellFormed = 0;
2078 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002079 /*
2080 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2081 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002082
2083 SKIP_BLANKS;
2084
2085 /*
2086 * Check for SystemID and ExternalID
2087 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002088 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002089 SKIP_BLANKS;
2090
2091 /*
2092 * We should be at the end of the DOCTYPE declaration.
2093 */
2094 if (CUR != '>') {
2095 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2096 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2097 ctxt->wellFormed = 0;
2098 /* We shouldn't try to resynchronize ... */
2099 } else {
2100 }
2101 NEXT;
2102
2103 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002104 * Create the document accordingly to the DOCTYPE
2105 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002106 if (ctxt->myDoc != NULL)
2107 xmlFreeDoc(ctxt->myDoc);
2108
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002109 ctxt->myDoc = htmlNewDoc(URI, ExternalID);
2110
2111 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002112 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002113 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002114 if (URI != NULL) xmlFree(URI);
2115 if (ExternalID != NULL) xmlFree(ExternalID);
2116 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002117}
2118
2119/**
2120 * htmlParseAttribute:
2121 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002122 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002123 *
2124 * parse an attribute
2125 *
2126 * [41] Attribute ::= Name Eq AttValue
2127 *
2128 * [25] Eq ::= S? '=' S?
2129 *
2130 * With namespace:
2131 *
2132 * [NS 11] Attribute ::= QName Eq AttValue
2133 *
2134 * Also the case QName == xmlns:??? is handled independently as a namespace
2135 * definition.
2136 *
2137 * Returns the attribute name, and the value in *value.
2138 */
2139
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002140xmlChar *
2141htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002142 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002143
2144 *value = NULL;
2145 name = htmlParseName(ctxt);
2146 if (name == NULL) {
2147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2148 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2149 ctxt->wellFormed = 0;
2150 return(NULL);
2151 }
2152
2153 /*
2154 * read the value
2155 */
2156 SKIP_BLANKS;
2157 if (CUR == '=') {
2158 NEXT;
2159 SKIP_BLANKS;
2160 val = htmlParseAttValue(ctxt);
2161 } else {
Daniel Veillard4a53eca1999-12-12 13:03:50 +00002162 /* TODO : some attribute must have values, some may not */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002163 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002164 ctxt->sax->warning(ctxt->userData,
2165 "No value for attribute %s\n", name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002166 }
2167
2168 *value = val;
2169 return(name);
2170}
2171
2172/**
2173 * htmlParseStartTag:
2174 * @ctxt: an HTML parser context
2175 *
2176 * parse a start of tag either for rule element or
2177 * EmptyElement. In both case we don't parse the tag closing chars.
2178 *
2179 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2180 *
2181 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2182 *
2183 * With namespace:
2184 *
2185 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2186 *
2187 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2188 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002189 */
2190
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002191void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002192htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002193 xmlChar *name;
2194 xmlChar *attname;
2195 xmlChar *attvalue;
2196 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002197 int nbatts = 0;
2198 int maxatts = 0;
2199 int i;
2200
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002201 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002202 NEXT;
2203
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002204 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002205 name = htmlParseHTMLName(ctxt);
2206 if (name == NULL) {
2207 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2208 ctxt->sax->error(ctxt->userData,
2209 "htmlParseStartTag: invalid element name\n");
2210 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002211 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002212 }
2213
2214 /*
2215 * Check for auto-closure of HTML elements.
2216 */
2217 htmlAutoClose(ctxt, name);
2218
2219 /*
2220 * Now parse the attributes, it ends up with the ending
2221 *
2222 * (S Attribute)* S?
2223 */
2224 SKIP_BLANKS;
2225 while ((IS_CHAR(CUR)) &&
2226 (CUR != '>') &&
2227 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002228 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002229
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002230 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002231 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002232 if (attname != NULL) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002233 /*
2234 * Well formedness requires at most one declaration of an attribute
2235 */
2236 for (i = 0; i < nbatts;i += 2) {
2237 if (!xmlStrcmp(atts[i], attname)) {
2238 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002239 ctxt->sax->error(ctxt->userData,
2240 "Attribute %s redefined\n",
2241 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002242 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002243 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002244 if (attvalue != NULL)
2245 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002246 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002247 }
2248 }
2249
2250 /*
2251 * Add the pair to atts
2252 */
2253 if (atts == NULL) {
2254 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002255 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002256 if (atts == NULL) {
2257 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002258 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002259 if (name != NULL) xmlFree(name);
2260 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002261 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002262 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002263 maxatts *= 2;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002264 atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002265 if (atts == NULL) {
2266 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002267 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002268 if (name != NULL) xmlFree(name);
2269 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002270 }
2271 }
2272 atts[nbatts++] = attname;
2273 atts[nbatts++] = attvalue;
2274 atts[nbatts] = NULL;
2275 atts[nbatts + 1] = NULL;
2276 }
2277
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002278failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002279 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002280 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002281 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2282 ctxt->sax->error(ctxt->userData,
2283 "htmlParseStartTag: problem parsing attributes\n");
2284 ctxt->wellFormed = 0;
2285 break;
2286 }
2287 }
2288
2289 /*
2290 * SAX: Start of Element !
2291 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002292 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002293#ifdef DEBUG
2294 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2295#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002296 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2297 ctxt->sax->startElement(ctxt->userData, name, atts);
2298
2299 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002300 for (i = 0;i < nbatts;i++) {
2301 if (atts[i] != NULL)
2302 xmlFree((xmlChar *) atts[i]);
2303 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002304 xmlFree(atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002305 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002306 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002307}
2308
2309/**
2310 * htmlParseEndTag:
2311 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002312 *
2313 * parse an end of tag
2314 *
2315 * [42] ETag ::= '</' Name S? '>'
2316 *
2317 * With namespace
2318 *
2319 * [NS 9] ETag ::= '</' QName S? '>'
2320 */
2321
2322void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002323htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002324 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002325 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002326 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002327
2328 if ((CUR != '<') || (NXT(1) != '/')) {
2329 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2330 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2331 ctxt->wellFormed = 0;
2332 return;
2333 }
2334 SKIP(2);
2335
2336 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002337 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002338
2339 /*
2340 * We should definitely be at the ending "S? '>'" part
2341 */
2342 SKIP_BLANKS;
2343 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2344 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2345 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2346 ctxt->wellFormed = 0;
2347 } else
2348 NEXT;
2349
2350 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002351 * If the name read is not one of the element in the parsing stack
2352 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002353 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002354 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2355 if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002356 }
2357 if (i < 0) {
2358 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002359 ctxt->sax->error(ctxt->userData,
2360 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002361 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002362 ctxt->wellFormed = 0;
2363 return;
2364 }
2365
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002366
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002367 /*
2368 * Check for auto-closure of HTML elements.
2369 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002370
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002371 htmlAutoCloseOnClose(ctxt, name);
2372
2373 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002374 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002375 * With the exception that the autoclose may have popped stuff out
2376 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002377 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002378 if (xmlStrcmp(name, ctxt->name)) {
2379#ifdef DEBUG
2380 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2381#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002382 if ((ctxt->name != NULL) &&
2383 (xmlStrcmp(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002384 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2385 ctxt->sax->error(ctxt->userData,
2386 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002387 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002388 ctxt->wellFormed = 0;
2389 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002390 }
2391
2392 /*
2393 * SAX: End of Tag
2394 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002395 oldname = ctxt->name;
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002396 if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002397 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2398 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002399 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002400 if (oldname != NULL) {
2401#ifdef DEBUG
2402 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2403#endif
2404 xmlFree(oldname);
2405#ifdef DEBUG
2406 } else {
2407 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2408#endif
2409 }
2410 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002411
2412 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00002413 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002414
2415 return;
2416}
2417
2418
2419/**
2420 * htmlParseReference:
2421 * @ctxt: an HTML parser context
2422 *
2423 * parse and handle entity references in content,
2424 * this will end-up in a call to character() since this is either a
2425 * CharRef, or a predefined entity.
2426 */
2427void
2428htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002429 htmlEntityDescPtr ent;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002430 xmlChar out[2];
2431 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002432 int val;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002433 if (CUR != '&') return;
2434
2435 if (NXT(1) == '#') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002436 val = htmlParseCharRef(ctxt);
Daniel Veillardb96e6431999-08-29 21:02:19 +00002437 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002438 out[0] = val;
2439 out[1] = 0;
2440 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2441 ctxt->sax->characters(ctxt->userData, out, 1);
2442 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002443 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002444 if (name == NULL) {
2445 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
2446 return;
2447 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002448 if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2449 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002450 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002451 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00002452 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002453 }
2454 } else {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002455 /* invalid for UTF-8 variable encoding !!!!! */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002456 out[0] = ent->value;
2457 out[1] = 0;
2458 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2459 ctxt->sax->characters(ctxt->userData, out, 1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002460 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002461 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002462 }
2463}
2464
2465/**
2466 * htmlParseContent:
2467 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002468 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002469 *
2470 * Parse a content: comment, sub-element, reference or text.
2471 *
2472 */
2473
2474void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002475htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002476 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002477 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002478
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002479 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002480 depth = ctxt->nameNr;
2481 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002482 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002483
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002484 GROW;
2485 /*
2486 * Our tag or one of it's parent or children is ending.
2487 */
2488 if ((CUR == '<') && (NXT(1) == '/')) {
2489 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002490 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002491 return;
2492 }
2493
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002494 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002495 * Has this node been popped out during parsing of
2496 * the next element
2497 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002498 if ((xmlStrcmp(currentNode, ctxt->name)) &&
2499 (depth >= ctxt->nameNr)) {
2500 if (currentNode != NULL) xmlFree(currentNode);
2501 return;
2502 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002503
2504 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002505 * First case : a comment
2506 */
2507 if ((CUR == '<') && (NXT(1) == '!') &&
2508 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002509 htmlParseComment(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002510 }
2511
2512 /*
2513 * Second case : a sub-element.
2514 */
2515 else if (CUR == '<') {
2516 htmlParseElement(ctxt);
2517 }
2518
2519 /*
2520 * Third case : a reference. If if has not been resolved,
2521 * parsing returns it's Name, create the node
2522 */
2523 else if (CUR == '&') {
2524 htmlParseReference(ctxt);
2525 }
2526
2527 /*
2528 * Last case, text. Note that References are handled directly.
2529 */
2530 else {
2531 htmlParseCharData(ctxt, 0);
2532 }
2533
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002534 if (cons == ctxt->nbChars) {
Daniel Veillard35008381999-10-25 13:15:52 +00002535 if (ctxt->node != NULL) {
2536 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2537 ctxt->sax->error(ctxt->userData,
2538 "detected an error in element content\n");
2539 ctxt->wellFormed = 0;
2540 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002541 break;
2542 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002543
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002544 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002545 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002546 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002547}
2548
2549/**
2550 * htmlParseElement:
2551 * @ctxt: an HTML parser context
2552 *
2553 * parse an HTML element, this is highly recursive
2554 *
2555 * [39] element ::= EmptyElemTag | STag content ETag
2556 *
2557 * [41] Attribute ::= Name Eq AttValue
2558 */
2559
2560void
2561htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002562 const xmlChar *openTag = CUR_PTR;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002563 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00002564 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002565 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002566 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002567 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002568 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002569
2570 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002571 if (ctxt->record_info) {
2572 node_info.begin_pos = ctxt->input->consumed +
2573 (CUR_PTR - ctxt->input->base);
2574 node_info.begin_line = ctxt->input->line;
2575 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002576
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002577 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002578 htmlParseStartTag(ctxt);
2579 name = ctxt->name;
2580#ifdef DEBUG
2581 if (oldname == NULL)
2582 fprintf(stderr, "Start of element %s\n", name);
2583 else if (name == NULL)
2584 fprintf(stderr, "Start of element failed, was %s\n", oldname);
2585 else
2586 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2587#endif
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002588 if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002589 (name == NULL)) {
2590 if (CUR == '>')
2591 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002592 if (oldname != NULL)
2593 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002594 return;
2595 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002596 if (oldname != NULL)
2597 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002598
2599 /*
2600 * Lookup the info for that element.
2601 */
2602 info = htmlTagLookup(name);
2603 if (info == NULL) {
2604 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2605 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2606 name);
2607 ctxt->wellFormed = 0;
2608 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002609/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002610 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2611 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2612 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002613 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002614 }
2615
2616 /*
2617 * Check for an Empty Element labelled the XML/SGML way
2618 */
2619 if ((CUR == '/') && (NXT(1) == '>')) {
2620 SKIP(2);
2621 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2622 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002623 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002624#ifdef DEBUG
2625 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2626#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002627 if (oldname != NULL)
2628 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002629 return;
2630 }
2631
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002632 if (CUR == '>') {
2633 NEXT;
2634 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002635 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2636 ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2637 openTag);
2638 ctxt->wellFormed = 0;
2639
2640 /*
2641 * end of parsing of this node.
2642 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002643 if (!xmlStrcmp(name, ctxt->name)) {
2644 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002645 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002646#ifdef DEBUG
2647 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2648#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002649 if (oldname != NULL)
2650 xmlFree(oldname);
2651 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002652
2653 /*
2654 * Capture end position and add node
2655 */
2656 if ( currentNode != NULL && ctxt->record_info ) {
2657 node_info.end_pos = ctxt->input->consumed +
2658 (CUR_PTR - ctxt->input->base);
2659 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002660 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002661 xmlParserAddNodeInfo(ctxt, &node_info);
2662 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002663 return;
2664 }
2665
2666 /*
2667 * Check for an Empty Element from DTD definition
2668 */
2669 if ((info != NULL) && (info->empty)) {
2670 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2671 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002672 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002673#ifdef DEBUG
2674 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2675#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002676 if (oldname != NULL)
2677 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002678 return;
2679 }
2680
2681 /*
2682 * Parse the content of the element:
2683 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002684 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002685 depth = ctxt->nameNr;
2686 while (IS_CHAR(CUR)) {
2687 htmlParseContent(ctxt);
2688 if (ctxt->nameNr < depth) break;
2689 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002690
2691 if (!IS_CHAR(CUR)) {
2692 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2693 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002694 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002695 ctxt->wellFormed = 0;
2696
2697 /*
2698 * end of parsing of this node.
2699 */
2700 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002701 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002702#ifdef DEBUG
2703 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2704#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002705 if (oldname != NULL)
2706 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002707 if (currentNode != NULL)
2708 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002709 return;
2710 }
2711
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002712 /*
2713 * Capture end position and add node
2714 */
2715 if ( currentNode != NULL && ctxt->record_info ) {
2716 node_info.end_pos = ctxt->input->consumed +
2717 (CUR_PTR - ctxt->input->base);
2718 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002719 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002720 xmlParserAddNodeInfo(ctxt, &node_info);
2721 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002722 if (currentNode != NULL)
2723 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002724}
2725
2726/**
2727 * htmlParseDocument :
2728 * @ctxt: an HTML parser context
2729 *
2730 * parse an HTML document (and build a tree if using the standard SAX
2731 * interface).
2732 *
2733 * Returns 0, -1 in case of error. the parser context is augmented
2734 * as a result of the parsing.
2735 */
2736
2737int
2738htmlParseDocument(htmlParserCtxtPtr ctxt) {
2739 htmlDefaultSAXHandlerInit();
2740 ctxt->html = 1;
2741
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002742 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002743 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00002744 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002745 */
2746 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2747 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2748
2749 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002750 * Wipe out everything which is before the first '<'
2751 */
Daniel Veillard35008381999-10-25 13:15:52 +00002752 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002753 if (CUR == 0) {
2754 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2755 ctxt->sax->error(ctxt->userData, "Document is empty\n");
2756 ctxt->wellFormed = 0;
2757 }
2758
Daniel Veillard35008381999-10-25 13:15:52 +00002759 /*
2760 * Parse possible comments before any content
2761 */
2762 while ((CUR == '<') && (NXT(1) == '!') &&
2763 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002764 if (ctxt->myDoc == NULL)
2765 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2766 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00002767 SKIP_BLANKS;
2768 }
2769
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002770
2771 /*
2772 * Then possibly doc type declaration(s) and more Misc
2773 * (doctypedecl Misc*)?
2774 */
2775 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002776 (UPP(2) == 'D') && (UPP(3) == 'O') &&
2777 (UPP(4) == 'C') && (UPP(5) == 'T') &&
2778 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2779 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002780 htmlParseDocTypeDecl(ctxt);
2781 }
2782 SKIP_BLANKS;
2783
2784 /*
2785 * Create the document if not done already.
2786 */
2787 if (ctxt->myDoc == NULL) {
2788 ctxt->myDoc = htmlNewDoc(NULL, NULL);
2789 }
2790
2791 /*
2792 * Time to start parsing the tree itself
2793 */
Daniel Veillard35008381999-10-25 13:15:52 +00002794 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002795
2796 /*
2797 * SAX: end of the document processing.
2798 */
2799 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2800 ctxt->sax->endDocument(ctxt->userData);
2801 if (! ctxt->wellFormed) return(-1);
2802 return(0);
2803}
2804
2805
Daniel Veillarddbfd6411999-12-28 16:35:14 +00002806/************************************************************************
2807 * *
2808 * Parser contexts handling *
2809 * *
2810 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002811
2812/**
2813 * xmlInitParserCtxt:
2814 * @ctxt: an HTML parser context
2815 *
2816 * Initialize a parser context
2817 */
2818
2819void
2820htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2821{
2822 htmlSAXHandler *sax;
2823
Daniel Veillard35008381999-10-25 13:15:52 +00002824 if (ctxt == NULL) return;
2825 memset(ctxt, 0, sizeof(htmlParserCtxt));
2826
Daniel Veillard6454aec1999-09-02 22:04:43 +00002827 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002828 if (sax == NULL) {
2829 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2830 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002831 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002832
2833 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002834 ctxt->inputTab = (htmlParserInputPtr *)
2835 xmlMalloc(5 * sizeof(htmlParserInputPtr));
2836 if (ctxt->inputTab == NULL) {
2837 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2838 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002839 ctxt->inputNr = 0;
2840 ctxt->inputMax = 5;
2841 ctxt->input = NULL;
2842 ctxt->version = NULL;
2843 ctxt->encoding = NULL;
2844 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00002845 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002846
2847 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002848 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002849 ctxt->nodeNr = 0;
2850 ctxt->nodeMax = 10;
2851 ctxt->node = NULL;
2852
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002853 /* Allocate the Name stack */
2854 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
2855 ctxt->nameNr = 0;
2856 ctxt->nameMax = 10;
2857 ctxt->name = NULL;
2858
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002859 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2860 else {
2861 ctxt->sax = sax;
2862 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2863 }
2864 ctxt->userData = ctxt;
2865 ctxt->myDoc = NULL;
2866 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002867 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002868 ctxt->html = 1;
2869 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00002870 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002871 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00002872 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002873 xmlInitNodeInfoSeq(&ctxt->node_seq);
2874}
2875
2876/**
2877 * htmlFreeParserCtxt:
2878 * @ctxt: an HTML parser context
2879 *
2880 * Free all the memory used by a parser context. However the parsed
2881 * document in ctxt->myDoc is not freed.
2882 */
2883
2884void
2885htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2886{
2887 htmlParserInputPtr input;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002888 xmlChar *oldname;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002889
2890 if (ctxt == NULL) return;
2891
2892 while ((input = inputPop(ctxt)) != NULL) {
2893 xmlFreeInputStream(input);
2894 }
2895
Daniel Veillard6454aec1999-09-02 22:04:43 +00002896 if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002897 while ((oldname = htmlnamePop(ctxt)) != NULL) {
2898 xmlFree(oldname);
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002899 }
2900 if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002901 if (ctxt->directory != NULL) xmlFree(ctxt->directory);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002902 if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
2903 if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002904 if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
Daniel Veillard6454aec1999-09-02 22:04:43 +00002905 xmlFree(ctxt->sax);
2906 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002907}
2908
2909/**
2910 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002911 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002912 * @encoding: a free form C string describing the HTML document encoding, or NULL
2913 *
2914 * Create a parser context for an HTML document.
2915 *
2916 * Returns the new parser context or NULL
2917 */
2918htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002919htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002920 htmlParserCtxtPtr ctxt;
2921 htmlParserInputPtr input;
2922 /* htmlCharEncoding enc; */
2923
Daniel Veillard6454aec1999-09-02 22:04:43 +00002924 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002925 if (ctxt == NULL) {
2926 perror("malloc");
2927 return(NULL);
2928 }
2929 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002930 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002931 if (input == NULL) {
2932 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00002933 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002934 return(NULL);
2935 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002936 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002937
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002938 input->line = 1;
2939 input->col = 1;
2940 input->base = cur;
2941 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002942
2943 inputPush(ctxt, input);
2944 return(ctxt);
2945}
2946
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002947/************************************************************************
2948 * *
2949 * Progressive parsing interfaces *
2950 * *
2951 ************************************************************************/
2952
2953/**
2954 * htmlParseLookupSequence:
2955 * @ctxt: an HTML parser context
2956 * @first: the first char to lookup
2957 * @next: the next char to lookup or zero
2958 * @third: the next char to lookup or zero
2959 *
2960 * Try to find if a sequence (first, next, third) or just (first next) or
2961 * (first) is available in the input stream.
2962 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
2963 * to avoid rescanning sequences of bytes, it DOES change the state of the
2964 * parser, do not use liberally.
2965 * This is basically similar to xmlParseLookupSequence()
2966 *
2967 * Returns the index to the current parsing point if the full sequence
2968 * is available, -1 otherwise.
2969 */
2970int
2971htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
2972 xmlChar next, xmlChar third) {
2973 int base, len;
2974 htmlParserInputPtr in;
2975 const xmlChar *buf;
2976
2977 in = ctxt->input;
2978 if (in == NULL) return(-1);
2979 base = in->cur - in->base;
2980 if (base < 0) return(-1);
2981 if (ctxt->checkIndex > base)
2982 base = ctxt->checkIndex;
2983 if (in->buf == NULL) {
2984 buf = in->base;
2985 len = in->length;
2986 } else {
2987 buf = in->buf->buffer->content;
2988 len = in->buf->buffer->use;
2989 }
2990 /* take into account the sequence length */
2991 if (third) len -= 2;
2992 else if (next) len --;
2993 for (;base < len;base++) {
2994 if (buf[base] == first) {
2995 if (third != 0) {
2996 if ((buf[base + 1] != next) ||
2997 (buf[base + 2] != third)) continue;
2998 } else if (next != 0) {
2999 if (buf[base + 1] != next) continue;
3000 }
3001 ctxt->checkIndex = 0;
3002#ifdef DEBUG_PUSH
3003 if (next == 0)
3004 fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3005 first, base);
3006 else if (third == 0)
3007 fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3008 first, next, base);
3009 else
3010 fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3011 first, next, third, base);
3012#endif
3013 return(base - (in->cur - in->base));
3014 }
3015 }
3016 ctxt->checkIndex = base;
3017#ifdef DEBUG_PUSH
3018 if (next == 0)
3019 fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3020 else if (third == 0)
3021 fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3022 else
3023 fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3024#endif
3025 return(-1);
3026}
3027
3028/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003029 * htmlParseTryOrFinish:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003030 * @ctxt: an HTML parser context
Daniel Veillard71b656e2000-01-05 14:46:17 +00003031 * @terminate: last chunk indicator
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003032 *
3033 * Try to progress on parsing
3034 *
3035 * Returns zero if no parsing was possible
3036 */
3037int
Daniel Veillard71b656e2000-01-05 14:46:17 +00003038htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003039 int ret = 0;
3040 htmlParserInputPtr in;
3041 int avail;
3042 xmlChar cur, next;
3043
3044#ifdef DEBUG_PUSH
3045 switch (ctxt->instate) {
3046 case XML_PARSER_EOF:
3047 fprintf(stderr, "HPP: try EOF\n"); break;
3048 case XML_PARSER_START:
3049 fprintf(stderr, "HPP: try START\n"); break;
3050 case XML_PARSER_MISC:
3051 fprintf(stderr, "HPP: try MISC\n");break;
3052 case XML_PARSER_COMMENT:
3053 fprintf(stderr, "HPP: try COMMENT\n");break;
3054 case XML_PARSER_PROLOG:
3055 fprintf(stderr, "HPP: try PROLOG\n");break;
3056 case XML_PARSER_START_TAG:
3057 fprintf(stderr, "HPP: try START_TAG\n");break;
3058 case XML_PARSER_CONTENT:
3059 fprintf(stderr, "HPP: try CONTENT\n");break;
3060 case XML_PARSER_CDATA_SECTION:
3061 fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3062 case XML_PARSER_END_TAG:
3063 fprintf(stderr, "HPP: try END_TAG\n");break;
3064 case XML_PARSER_ENTITY_DECL:
3065 fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3066 case XML_PARSER_ENTITY_VALUE:
3067 fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3068 case XML_PARSER_ATTRIBUTE_VALUE:
3069 fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3070 case XML_PARSER_DTD:
3071 fprintf(stderr, "HPP: try DTD\n");break;
3072 case XML_PARSER_EPILOG:
3073 fprintf(stderr, "HPP: try EPILOG\n");break;
3074 case XML_PARSER_PI:
3075 fprintf(stderr, "HPP: try PI\n");break;
3076 }
3077#endif
3078
3079 while (1) {
3080
3081 in = ctxt->input;
3082 if (in == NULL) break;
3083 if (in->buf == NULL)
3084 avail = in->length - (in->cur - in->base);
3085 else
3086 avail = in->buf->buffer->use - (in->cur - in->base);
3087 if (avail < 1)
3088 goto done;
3089 switch (ctxt->instate) {
3090 case XML_PARSER_EOF:
3091 /*
3092 * Document parsing is done !
3093 */
3094 goto done;
3095 case XML_PARSER_START:
3096 /*
3097 * Very first chars read from the document flow.
3098 */
3099 cur = in->cur[0];
3100 if (IS_BLANK(cur)) {
3101 SKIP_BLANKS;
3102 if (in->buf == NULL)
3103 avail = in->length - (in->cur - in->base);
3104 else
3105 avail = in->buf->buffer->use - (in->cur - in->base);
3106 }
3107 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3108 ctxt->sax->setDocumentLocator(ctxt->userData,
3109 &xmlDefaultSAXLocator);
3110 cur = in->cur[0];
3111 next = in->cur[1];
3112 if ((cur == '<') && (next == '!') &&
3113 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3114 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3115 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3116 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003117 if ((!terminate) &&
3118 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003119 goto done;
3120#ifdef DEBUG_PUSH
3121 fprintf(stderr, "HPP: Parsing internal subset\n");
3122#endif
3123 htmlParseDocTypeDecl(ctxt);
3124 ctxt->instate = XML_PARSER_PROLOG;
3125#ifdef DEBUG_PUSH
3126 fprintf(stderr, "HPP: entering PROLOG\n");
3127#endif
3128 } else {
3129 ctxt->myDoc = htmlNewDoc(NULL, NULL);
3130 ctxt->instate = XML_PARSER_MISC;
3131 }
3132#ifdef DEBUG_PUSH
3133 fprintf(stderr, "HPP: entering MISC\n");
3134#endif
3135 break;
3136 case XML_PARSER_MISC:
3137 SKIP_BLANKS;
3138 if (in->buf == NULL)
3139 avail = in->length - (in->cur - in->base);
3140 else
3141 avail = in->buf->buffer->use - (in->cur - in->base);
3142 if (avail < 2)
3143 goto done;
3144 cur = in->cur[0];
3145 next = in->cur[1];
3146 if ((cur == '<') && (next == '!') &&
3147 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003148 if ((!terminate) &&
3149 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003150 goto done;
3151#ifdef DEBUG_PUSH
3152 fprintf(stderr, "HPP: Parsing Comment\n");
3153#endif
3154 htmlParseComment(ctxt);
3155 ctxt->instate = XML_PARSER_MISC;
3156 } else if ((cur == '<') && (next == '!') &&
3157 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3158 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3159 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3160 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003161 if ((!terminate) &&
3162 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003163 goto done;
3164#ifdef DEBUG_PUSH
3165 fprintf(stderr, "HPP: Parsing internal subset\n");
3166#endif
3167 htmlParseDocTypeDecl(ctxt);
3168 ctxt->instate = XML_PARSER_PROLOG;
3169#ifdef DEBUG_PUSH
3170 fprintf(stderr, "HPP: entering PROLOG\n");
3171#endif
3172 } else if ((cur == '<') && (next == '!') &&
3173 (avail < 9)) {
3174 goto done;
3175 } else {
3176 ctxt->instate = XML_PARSER_START_TAG;
3177#ifdef DEBUG_PUSH
3178 fprintf(stderr, "HPP: entering START_TAG\n");
3179#endif
3180 }
3181 break;
3182 case XML_PARSER_PROLOG:
3183 SKIP_BLANKS;
3184 if (in->buf == NULL)
3185 avail = in->length - (in->cur - in->base);
3186 else
3187 avail = in->buf->buffer->use - (in->cur - in->base);
3188 if (avail < 2)
3189 goto done;
3190 cur = in->cur[0];
3191 next = in->cur[1];
3192 if ((cur == '<') && (next == '!') &&
3193 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003194 if ((!terminate) &&
3195 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003196 goto done;
3197#ifdef DEBUG_PUSH
3198 fprintf(stderr, "HPP: Parsing Comment\n");
3199#endif
3200 htmlParseComment(ctxt);
3201 ctxt->instate = XML_PARSER_PROLOG;
3202 } else if ((cur == '<') && (next == '!') &&
3203 (avail < 4)) {
3204 goto done;
3205 } else {
3206 ctxt->instate = XML_PARSER_START_TAG;
3207#ifdef DEBUG_PUSH
3208 fprintf(stderr, "HPP: entering START_TAG\n");
3209#endif
3210 }
3211 break;
3212 case XML_PARSER_EPILOG:
3213 SKIP_BLANKS;
3214 if (in->buf == NULL)
3215 avail = in->length - (in->cur - in->base);
3216 else
3217 avail = in->buf->buffer->use - (in->cur - in->base);
3218 if (avail < 2)
3219 goto done;
3220 cur = in->cur[0];
3221 next = in->cur[1];
3222 if ((cur == '<') && (next == '!') &&
3223 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003224 if ((!terminate) &&
3225 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003226 goto done;
3227#ifdef DEBUG_PUSH
3228 fprintf(stderr, "HPP: Parsing Comment\n");
3229#endif
3230 htmlParseComment(ctxt);
3231 ctxt->instate = XML_PARSER_EPILOG;
3232 } else if ((cur == '<') && (next == '!') &&
3233 (avail < 4)) {
3234 goto done;
3235 } else {
3236 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3237 ctxt->sax->error(ctxt->userData,
3238 "Extra content at the end of the document\n");
3239 ctxt->wellFormed = 0;
3240 ctxt->errNo = XML_ERR_DOCUMENT_END;
3241 ctxt->instate = XML_PARSER_EOF;
3242#ifdef DEBUG_PUSH
3243 fprintf(stderr, "HPP: entering EOF\n");
3244#endif
3245 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3246 ctxt->sax->endDocument(ctxt->userData);
3247 goto done;
3248 }
3249 break;
3250 case XML_PARSER_START_TAG: {
3251 xmlChar *name, *oldname;
3252 int depth = ctxt->nameNr;
3253 htmlElemDescPtr info;
3254
3255 if (avail < 2)
3256 goto done;
3257 cur = in->cur[0];
3258 if (cur != '<') {
3259 ctxt->instate = XML_PARSER_CONTENT;
3260#ifdef DEBUG_PUSH
3261 fprintf(stderr, "HPP: entering CONTENT\n");
3262#endif
3263 break;
3264 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00003265 if ((!terminate) &&
3266 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003267 goto done;
3268
3269 oldname = xmlStrdup(ctxt->name);
3270 htmlParseStartTag(ctxt);
3271 name = ctxt->name;
3272#ifdef DEBUG
3273 if (oldname == NULL)
3274 fprintf(stderr, "Start of element %s\n", name);
3275 else if (name == NULL)
3276 fprintf(stderr, "Start of element failed, was %s\n",
3277 oldname);
3278 else
3279 fprintf(stderr, "Start of element %s, was %s\n",
3280 name, oldname);
3281#endif
3282 if (((depth == ctxt->nameNr) &&
3283 (!xmlStrcmp(oldname, ctxt->name))) ||
3284 (name == NULL)) {
3285 if (CUR == '>')
3286 NEXT;
3287 if (oldname != NULL)
3288 xmlFree(oldname);
3289 break;
3290 }
3291 if (oldname != NULL)
3292 xmlFree(oldname);
3293
3294 /*
3295 * Lookup the info for that element.
3296 */
3297 info = htmlTagLookup(name);
3298 if (info == NULL) {
3299 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3300 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3301 name);
3302 ctxt->wellFormed = 0;
3303 } else if (info->depr) {
3304 /***************************
3305 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3306 ctxt->sax->warning(ctxt->userData,
3307 "Tag %s is deprecated\n",
3308 name);
3309 ***************************/
3310 }
3311
3312 /*
3313 * Check for an Empty Element labelled the XML/SGML way
3314 */
3315 if ((CUR == '/') && (NXT(1) == '>')) {
3316 SKIP(2);
3317 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3318 ctxt->sax->endElement(ctxt->userData, name);
3319 oldname = htmlnamePop(ctxt);
3320#ifdef DEBUG
3321 fprintf(stderr,"End of tag the XML way: popping out %s\n",
3322 oldname);
3323#endif
3324 if (oldname != NULL)
3325 xmlFree(oldname);
3326 ctxt->instate = XML_PARSER_CONTENT;
3327#ifdef DEBUG_PUSH
3328 fprintf(stderr, "HPP: entering CONTENT\n");
3329#endif
3330 break;
3331 }
3332
3333 if (CUR == '>') {
3334 NEXT;
3335 } else {
3336 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3337 ctxt->sax->error(ctxt->userData,
3338 "Couldn't find end of Start Tag %s\n",
3339 name);
3340 ctxt->wellFormed = 0;
3341
3342 /*
3343 * end of parsing of this node.
3344 */
3345 if (!xmlStrcmp(name, ctxt->name)) {
3346 nodePop(ctxt);
3347 oldname = htmlnamePop(ctxt);
3348#ifdef DEBUG
3349 fprintf(stderr,
3350 "End of start tag problem: popping out %s\n", oldname);
3351#endif
3352 if (oldname != NULL)
3353 xmlFree(oldname);
3354 }
3355
3356 ctxt->instate = XML_PARSER_CONTENT;
3357#ifdef DEBUG_PUSH
3358 fprintf(stderr, "HPP: entering CONTENT\n");
3359#endif
3360 break;
3361 }
3362
3363 /*
3364 * Check for an Empty Element from DTD definition
3365 */
3366 if ((info != NULL) && (info->empty)) {
3367 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3368 ctxt->sax->endElement(ctxt->userData, name);
3369 oldname = htmlnamePop(ctxt);
3370#ifdef DEBUG
3371 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3372#endif
3373 if (oldname != NULL)
3374 xmlFree(oldname);
3375 }
3376 ctxt->instate = XML_PARSER_CONTENT;
3377#ifdef DEBUG_PUSH
3378 fprintf(stderr, "HPP: entering CONTENT\n");
3379#endif
3380 break;
3381 }
3382 case XML_PARSER_CONTENT:
3383 /*
3384 * Handle preparsed entities and charRef
3385 */
3386 if (ctxt->token != 0) {
3387 xmlChar cur[2] = { 0 , 0 } ;
3388
3389 cur[0] = (xmlChar) ctxt->token;
3390 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3391 ctxt->sax->characters(ctxt->userData, cur, 1);
3392 ctxt->token = 0;
3393 ctxt->checkIndex = 0;
3394 }
3395 if (avail < 2)
3396 goto done;
3397 cur = in->cur[0];
3398 next = in->cur[1];
3399 if ((cur == '<') && (next == '!') &&
3400 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003401 if ((!terminate) &&
3402 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003403 goto done;
3404#ifdef DEBUG_PUSH
3405 fprintf(stderr, "HPP: Parsing Comment\n");
3406#endif
3407 htmlParseComment(ctxt);
3408 ctxt->instate = XML_PARSER_CONTENT;
3409 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
3410 goto done;
3411 } else if ((cur == '<') && (next == '/')) {
3412 ctxt->instate = XML_PARSER_END_TAG;
3413 ctxt->checkIndex = 0;
3414#ifdef DEBUG_PUSH
3415 fprintf(stderr, "HPP: entering END_TAG\n");
3416#endif
3417 break;
3418 } else if (cur == '<') {
3419 ctxt->instate = XML_PARSER_START_TAG;
3420 ctxt->checkIndex = 0;
3421#ifdef DEBUG_PUSH
3422 fprintf(stderr, "HPP: entering START_TAG\n");
3423#endif
3424 break;
3425 } else if (cur == '&') {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003426 if ((!terminate) &&
3427 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003428 goto done;
3429#ifdef DEBUG_PUSH
3430 fprintf(stderr, "HPP: Parsing Reference\n");
3431#endif
3432 /* TODO: check generation of subtrees if noent !!! */
3433 htmlParseReference(ctxt);
3434 } else {
3435 /* TODO Avoid the extra copy, handle directly !!!!!! */
3436 /*
3437 * Goal of the following test is :
3438 * - minimize calls to the SAX 'character' callback
3439 * when they are mergeable
3440 */
3441 if ((ctxt->inputNr == 1) &&
3442 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003443 if ((!terminate) &&
3444 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003445 goto done;
3446 }
3447 ctxt->checkIndex = 0;
3448#ifdef DEBUG_PUSH
3449 fprintf(stderr, "HPP: Parsing char data\n");
3450#endif
3451 htmlParseCharData(ctxt, 0);
3452 }
3453 break;
3454 case XML_PARSER_END_TAG:
3455 if (avail < 2)
3456 goto done;
Daniel Veillard71b656e2000-01-05 14:46:17 +00003457 if ((!terminate) &&
3458 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003459 goto done;
3460 htmlParseEndTag(ctxt);
3461 if (ctxt->nameNr == 0) {
3462 ctxt->instate = XML_PARSER_EPILOG;
3463 } else {
3464 ctxt->instate = XML_PARSER_CONTENT;
3465 }
3466 ctxt->checkIndex = 0;
3467#ifdef DEBUG_PUSH
3468 fprintf(stderr, "HPP: entering CONTENT\n");
3469#endif
3470 break;
3471 case XML_PARSER_CDATA_SECTION:
3472 fprintf(stderr, "HPP: internal error, state == CDATA\n");
3473 ctxt->instate = XML_PARSER_CONTENT;
3474 ctxt->checkIndex = 0;
3475#ifdef DEBUG_PUSH
3476 fprintf(stderr, "HPP: entering CONTENT\n");
3477#endif
3478 break;
3479 case XML_PARSER_DTD:
3480 fprintf(stderr, "HPP: internal error, state == DTD\n");
3481 ctxt->instate = XML_PARSER_CONTENT;
3482 ctxt->checkIndex = 0;
3483#ifdef DEBUG_PUSH
3484 fprintf(stderr, "HPP: entering CONTENT\n");
3485#endif
3486 break;
3487 case XML_PARSER_COMMENT:
3488 fprintf(stderr, "HPP: internal error, state == COMMENT\n");
3489 ctxt->instate = XML_PARSER_CONTENT;
3490 ctxt->checkIndex = 0;
3491#ifdef DEBUG_PUSH
3492 fprintf(stderr, "HPP: entering CONTENT\n");
3493#endif
3494 break;
3495 case XML_PARSER_PI:
3496 fprintf(stderr, "HPP: internal error, state == PI\n");
3497 ctxt->instate = XML_PARSER_CONTENT;
3498 ctxt->checkIndex = 0;
3499#ifdef DEBUG_PUSH
3500 fprintf(stderr, "HPP: entering CONTENT\n");
3501#endif
3502 break;
3503 case XML_PARSER_ENTITY_DECL:
3504 fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
3505 ctxt->instate = XML_PARSER_CONTENT;
3506 ctxt->checkIndex = 0;
3507#ifdef DEBUG_PUSH
3508 fprintf(stderr, "HPP: entering CONTENT\n");
3509#endif
3510 break;
3511 case XML_PARSER_ENTITY_VALUE:
3512 fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
3513 ctxt->instate = XML_PARSER_CONTENT;
3514 ctxt->checkIndex = 0;
3515#ifdef DEBUG_PUSH
3516 fprintf(stderr, "HPP: entering DTD\n");
3517#endif
3518 break;
3519 case XML_PARSER_ATTRIBUTE_VALUE:
3520 fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
3521 ctxt->instate = XML_PARSER_START_TAG;
3522 ctxt->checkIndex = 0;
3523#ifdef DEBUG_PUSH
3524 fprintf(stderr, "HPP: entering START_TAG\n");
3525#endif
3526 break;
3527 }
3528 }
3529done:
3530#ifdef DEBUG_PUSH
3531 fprintf(stderr, "HPP: done %d\n", ret);
3532#endif
3533 return(ret);
3534}
3535
3536/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003537 * htmlParseTry:
3538 * @ctxt: an HTML parser context
3539 *
3540 * Try to progress on parsing
3541 *
3542 * Returns zero if no parsing was possible
3543 */
3544int
3545htmlParseTry(htmlParserCtxtPtr ctxt) {
3546 return(htmlParseTryOrFinish(ctxt, 0));
3547}
3548
3549/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003550 * htmlParseChunk:
3551 * @ctxt: an XML parser context
3552 * @chunk: an char array
3553 * @size: the size in byte of the chunk
3554 * @terminate: last chunk indicator
3555 *
3556 * Parse a Chunk of memory
3557 *
3558 * Returns zero if no error, the xmlParserErrors otherwise.
3559 */
3560int
3561htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
3562 int terminate) {
3563 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3564 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
3565 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
3566 int cur = ctxt->input->cur - ctxt->input->base;
3567
3568 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3569 ctxt->input->base = ctxt->input->buf->buffer->content + base;
3570 ctxt->input->cur = ctxt->input->base + cur;
3571#ifdef DEBUG_PUSH
3572 fprintf(stderr, "HPP: pushed %d\n", size);
3573#endif
3574
Daniel Veillardd0f7f742000-02-02 17:42:48 +00003575 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
3576 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003577 } else if (ctxt->instate != XML_PARSER_EOF)
Daniel Veillard71b656e2000-01-05 14:46:17 +00003578 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003579 if (terminate) {
3580 if ((ctxt->instate != XML_PARSER_EOF) &&
3581 (ctxt->instate != XML_PARSER_EPILOG) &&
3582 (ctxt->instate != XML_PARSER_MISC)) {
3583 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3584 ctxt->sax->error(ctxt->userData,
3585 "Extra content at the end of the document\n");
3586 ctxt->wellFormed = 0;
3587 ctxt->errNo = XML_ERR_DOCUMENT_END;
3588 }
3589 if (ctxt->instate != XML_PARSER_EOF) {
3590 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3591 ctxt->sax->endDocument(ctxt->userData);
3592 }
3593 ctxt->instate = XML_PARSER_EOF;
3594 }
3595 return((xmlParserErrors) ctxt->errNo);
3596}
3597
3598/************************************************************************
3599 * *
3600 * User entry points *
3601 * *
3602 ************************************************************************/
3603
3604/**
3605 * htmlCreatePushParserCtxt :
3606 * @sax: a SAX handler
3607 * @user_data: The user data returned on SAX callbacks
3608 * @chunk: a pointer to an array of chars
3609 * @size: number of chars in the array
3610 * @filename: an optional file name or URI
3611 * @enc: an optional encoding
3612 *
3613 * Create a parser context for using the HTML parser in push mode
3614 * To allow content encoding detection, @size should be >= 4
3615 * The value of @filename is used for fetching external entities
3616 * and error/warning reports.
3617 *
3618 * Returns the new parser context or NULL
3619 */
3620htmlParserCtxtPtr
3621htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
3622 const char *chunk, int size, const char *filename,
3623 xmlCharEncoding enc) {
3624 htmlParserCtxtPtr ctxt;
3625 htmlParserInputPtr inputStream;
3626 xmlParserInputBufferPtr buf;
3627
3628 buf = xmlAllocParserInputBuffer(enc);
3629 if (buf == NULL) return(NULL);
3630
3631 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3632 if (ctxt == NULL) {
3633 xmlFree(buf);
3634 return(NULL);
3635 }
3636 memset(ctxt, 0, sizeof(htmlParserCtxt));
3637 htmlInitParserCtxt(ctxt);
3638 if (sax != NULL) {
3639 if (ctxt->sax != &htmlDefaultSAXHandler)
3640 xmlFree(ctxt->sax);
3641 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
3642 if (ctxt->sax == NULL) {
3643 xmlFree(buf);
3644 xmlFree(ctxt);
3645 return(NULL);
3646 }
3647 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
3648 if (user_data != NULL)
3649 ctxt->userData = user_data;
3650 }
3651 if (filename == NULL) {
3652 ctxt->directory = NULL;
3653 } else {
3654 ctxt->directory = xmlParserGetDirectory(filename);
3655 }
3656
3657 inputStream = htmlNewInputStream(ctxt);
3658 if (inputStream == NULL) {
3659 xmlFreeParserCtxt(ctxt);
3660 return(NULL);
3661 }
3662
3663 if (filename == NULL)
3664 inputStream->filename = NULL;
3665 else
3666 inputStream->filename = xmlMemStrdup(filename);
3667 inputStream->buf = buf;
3668 inputStream->base = inputStream->buf->buffer->content;
3669 inputStream->cur = inputStream->buf->buffer->content;
3670
3671 inputPush(ctxt, inputStream);
3672
3673 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3674 (ctxt->input->buf != NULL)) {
3675 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3676#ifdef DEBUG_PUSH
3677 fprintf(stderr, "HPP: pushed %d\n", size);
3678#endif
3679 }
3680
3681 return(ctxt);
3682}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003683
3684/**
3685 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003686 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003687 * @encoding: a free form C string describing the HTML document encoding, or NULL
3688 * @sax: the SAX handler block
3689 * @userData: if using SAX, this pointer will be provided on callbacks.
3690 *
3691 * parse an HTML in-memory document and build a tree.
3692 * It use the given SAX function block to handle the parsing callback.
3693 * If sax is NULL, fallback to the default DOM tree building routines.
3694 *
3695 * Returns the resulting document tree
3696 */
3697
3698htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003699htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003700 htmlDocPtr ret;
3701 htmlParserCtxtPtr ctxt;
3702
3703 if (cur == NULL) return(NULL);
3704
3705
3706 ctxt = htmlCreateDocParserCtxt(cur, encoding);
3707 if (ctxt == NULL) return(NULL);
3708 if (sax != NULL) {
3709 ctxt->sax = sax;
3710 ctxt->userData = userData;
3711 }
3712
3713 htmlParseDocument(ctxt);
3714 ret = ctxt->myDoc;
3715 if (sax != NULL) {
3716 ctxt->sax = NULL;
3717 ctxt->userData = NULL;
3718 }
3719 htmlFreeParserCtxt(ctxt);
3720
3721 return(ret);
3722}
3723
3724/**
3725 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003726 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003727 * @encoding: a free form C string describing the HTML document encoding, or NULL
3728 *
3729 * parse an HTML in-memory document and build a tree.
3730 *
3731 * Returns the resulting document tree
3732 */
3733
3734htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003735htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003736 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
3737}
3738
3739
3740/**
3741 * htmlCreateFileParserCtxt :
3742 * @filename: the filename
3743 * @encoding: a free form C string describing the HTML document encoding, or NULL
3744 *
3745 * Create a parser context for a file content.
3746 * Automatic support for ZLIB/Compress compressed document is provided
3747 * by default if found at compile-time.
3748 *
3749 * Returns the new parser context or NULL
3750 */
3751htmlParserCtxtPtr
3752htmlCreateFileParserCtxt(const char *filename, const char *encoding)
3753{
3754 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003755 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003756 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003757 /* htmlCharEncoding enc; */
3758
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003759 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
3760 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003761
Daniel Veillard6454aec1999-09-02 22:04:43 +00003762 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003763 if (ctxt == NULL) {
3764 perror("malloc");
3765 return(NULL);
3766 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003767 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003768 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003769 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003770 if (inputStream == NULL) {
3771 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003772 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003773 return(NULL);
3774 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003775 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003776
Daniel Veillard6454aec1999-09-02 22:04:43 +00003777 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003778 inputStream->line = 1;
3779 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003780 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00003781 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003782
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003783 inputStream->base = inputStream->buf->buffer->content;
3784 inputStream->cur = inputStream->buf->buffer->content;
3785 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003786
3787 inputPush(ctxt, inputStream);
3788 return(ctxt);
3789}
3790
3791/**
3792 * htmlSAXParseFile :
3793 * @filename: the filename
3794 * @encoding: a free form C string describing the HTML document encoding, or NULL
3795 * @sax: the SAX handler block
3796 * @userData: if using SAX, this pointer will be provided on callbacks.
3797 *
3798 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3799 * compressed document is provided by default if found at compile-time.
3800 * It use the given SAX function block to handle the parsing callback.
3801 * If sax is NULL, fallback to the default DOM tree building routines.
3802 *
3803 * Returns the resulting document tree
3804 */
3805
3806htmlDocPtr
3807htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
3808 void *userData) {
3809 htmlDocPtr ret;
3810 htmlParserCtxtPtr ctxt;
3811
3812 ctxt = htmlCreateFileParserCtxt(filename, encoding);
3813 if (ctxt == NULL) return(NULL);
3814 if (sax != NULL) {
3815 ctxt->sax = sax;
3816 ctxt->userData = userData;
3817 }
3818
3819 htmlParseDocument(ctxt);
3820
3821 ret = ctxt->myDoc;
3822 if (sax != NULL) {
3823 ctxt->sax = NULL;
3824 ctxt->userData = NULL;
3825 }
3826 htmlFreeParserCtxt(ctxt);
3827
3828 return(ret);
3829}
3830
3831/**
3832 * htmlParseFile :
3833 * @filename: the filename
3834 * @encoding: a free form C string describing the HTML document encoding, or NULL
3835 *
3836 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3837 * compressed document is provided by default if found at compile-time.
3838 *
3839 * Returns the resulting document tree
3840 */
3841
3842htmlDocPtr
3843htmlParseFile(const char *filename, const char *encoding) {
3844 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
3845}