blob: 0877f4cca9d9f63eabdeee5152a92b55aa38bc37 [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillard361d8452000-04-03 19:48:13 +000015#include "xmlversion.h"
16#ifdef LIBXML_HTML_ENABLED
17
Daniel Veillardbe70ff71999-07-05 16:50:46 +000018#include <stdio.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000019#include <string.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000020#ifdef HAVE_CTYPE_H
21#include <ctype.h>
22#endif
23#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000024#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000025#endif
26#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000027#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000028#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000029#ifdef HAVE_FCNTL_H
30#include <fcntl.h>
31#endif
32#ifdef HAVE_UNISTD_H
33#include <unistd.h>
34#endif
35#ifdef HAVE_ZLIB_H
36#include <zlib.h>
37#endif
38
Daniel Veillard361d8452000-04-03 19:48:13 +000039#include <libxml/xmlmemory.h>
40#include <libxml/tree.h>
41#include <libxml/HTMLparser.h>
42#include <libxml/entities.h>
43#include <libxml/encoding.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000044#include <libxml/parser.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000045#include <libxml/valid.h>
46#include <libxml/parserInternals.h>
47#include <libxml/xmlIO.h>
Daniel Veillard5e5c6231999-12-29 12:49:06 +000048#include "xml-error.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000049
/*
 * Parser size constants.
 * NOTE(review): HTML_MAX_NAMELEN and the two *_BUFFER_SIZE values are
 * presumably name-length and scratch-buffer limits; confirm at their
 * use sites.
 */
#define HTML_MAX_NAMELEN 1000
/* Passed as the length argument to xmlParserInputGrow() when refilling. */
#define INPUT_CHUNK 50
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100
Daniel Veillardbe70ff71999-07-05 16:50:46 +000054
Daniel Veillard82150d81999-07-07 07:32:15 +000055/* #define DEBUG */
Daniel Veillard5e5c6231999-12-29 12:49:06 +000056/* #define DEBUG_PUSH */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000057
58/************************************************************************
59 * *
60 * Parser stacks related functions and macros *
61 * *
62 ************************************************************************/
63
64/*
65 * Generic function for accessing stacks in the Parser Context
66 */
67
Daniel Veillarddbfd6411999-12-28 16:35:14 +000068#define PUSH_AND_POP(scope, type, name) \
69scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000070 if (ctxt->name##Nr >= ctxt->name##Max) { \
71 ctxt->name##Max *= 2; \
Daniel Veillard32bc74e2000-07-14 14:49:25 +000072 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000073 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74 if (ctxt->name##Tab == NULL) { \
75 fprintf(stderr, "realloc failed !\n"); \
Daniel Veillard0142b842000-01-14 14:45:24 +000076 return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000077 } \
78 } \
79 ctxt->name##Tab[ctxt->name##Nr] = value; \
80 ctxt->name = value; \
81 return(ctxt->name##Nr++); \
82} \
Daniel Veillarddbfd6411999-12-28 16:35:14 +000083scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000084 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000085 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000086 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000087 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000088 if (ctxt->name##Nr > 0) \
89 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
90 else \
91 ctxt->name = NULL; \
92 ret = ctxt->name##Tab[ctxt->name##Nr]; \
93 ctxt->name##Tab[ctxt->name##Nr] = 0; \
94 return(ret); \
95} \
96
Daniel Veillarddbfd6411999-12-28 16:35:14 +000097PUSH_AND_POP(extern, xmlNodePtr, node)
98PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000099
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * Caveats when using them:
 *   - SKIP_BLANKS, NEXT, CUR_CHAR and CUR_SCHAR already end with a ';'.
 *   - NEXT, NEXTL and COPY_BUF expand to several statements (or to a bare
 *     if/else), so they are not safe as the unbraced body of an if/else
 *     or a loop.
 *   - NXT and CUR_PTR are defined twice below with identical replacement
 *     text.
 */

/* Current byte converted with toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance both the character counter and the input pointer by val. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt);

/* The #if 0 branch keeps the older htmlNextChar() based NEXT, disabled. */
#if 0
#define CUR ((int) (*ctxt->input->cur))
#define NEXT htmlNextChar(ctxt);
#else
/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt);ctxt->nbChars++;

/* Current raw byte, or -1 while ctxt->token is set. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur


/*
 * NEXTL(l): update line/column bookkeeping for the current byte, clear
 * ctxt->token, then advance the input pointer by l bytes while counting
 * one character.
 */
#define NEXTL(l)							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Read the current char through htmlCurrentChar(), storing its length in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l);
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l);

/*
 * COPY_BUF(l,b,i,v): append char v to buffer b at index i. A char of
 * length 1 is stored directly, anything else goes through xmlCopyChar();
 * i is advanced in both cases.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v);
#endif
180
181/**
182 * htmlCurrentChar:
183 * @ctxt: the HTML parser context
184 * @len: pointer to the length of the char read
185 *
186 * The current char value, if using UTF-8 this may actaully span multiple
187 * bytes in the input buffer. Implement the end of line normalization:
188 * 2.11 End-of-Line Handling
189 * If the encoding is unspecified, in the case we find an ISO-Latin-1
190 * char, then the encoding converter is plugged in automatically.
191 *
192 * Returns the current char value and its lenght
193 */
194
195int
196htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
197 if (ctxt->instate == XML_PARSER_EOF)
198 return(0);
199
200 if (ctxt->token != 0) {
201 *len = 0;
202 return(ctxt->token);
203 }
204 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
205 /*
206 * We are supposed to handle UTF8, check it's valid
207 * From rfc2044: encoding of the Unicode values on UTF-8:
208 *
209 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
210 * 0000 0000-0000 007F 0xxxxxxx
211 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
212 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
213 *
214 * Check for the 0x110000 limit too
215 */
216 const unsigned char *cur = ctxt->input->cur;
217 unsigned char c;
218 unsigned int val;
219
220 c = *cur;
221 if (c & 0x80) {
222 if (cur[1] == 0)
223 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
224 if ((cur[1] & 0xc0) != 0x80)
225 goto encoding_error;
226 if ((c & 0xe0) == 0xe0) {
227
228 if (cur[2] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if ((cur[2] & 0xc0) != 0x80)
231 goto encoding_error;
232 if ((c & 0xf0) == 0xf0) {
233 if (cur[3] == 0)
234 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
235 if (((c & 0xf8) != 0xf0) ||
236 ((cur[3] & 0xc0) != 0x80))
237 goto encoding_error;
238 /* 4-byte code */
239 *len = 4;
240 val = (cur[0] & 0x7) << 18;
241 val |= (cur[1] & 0x3f) << 12;
242 val |= (cur[2] & 0x3f) << 6;
243 val |= cur[3] & 0x3f;
244 } else {
245 /* 3-byte code */
246 *len = 3;
247 val = (cur[0] & 0xf) << 12;
248 val |= (cur[1] & 0x3f) << 6;
249 val |= cur[2] & 0x3f;
250 }
251 } else {
252 /* 2-byte code */
253 *len = 2;
254 val = (cur[0] & 0x1f) << 6;
255 val |= cur[1] & 0x3f;
256 }
257 if (!IS_CHAR(val)) {
258 if ((ctxt->sax != NULL) &&
259 (ctxt->sax->error != NULL))
260 ctxt->sax->error(ctxt->userData,
261 "Char 0x%X out of allowed range\n", val);
262 ctxt->errNo = XML_ERR_INVALID_ENCODING;
263 ctxt->wellFormed = 0;
264 ctxt->disableSAX = 1;
265 }
266 return(val);
267 } else {
268 /* 1-byte code */
269 *len = 1;
270 return((int) *ctxt->input->cur);
271 }
272 }
273 /*
274 * Assume it's a fixed lenght encoding (1) with
275 * a compatibke encoding for the ASCII set, since
276 * XML constructs only use < 128 chars
277 */
278 *len = 1;
279 if ((int) *ctxt->input->cur < 0x80)
280 return((int) *ctxt->input->cur);
281
282 /*
283 * Humm this is bad, do an automatic flow conversion
284 */
285 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
286 ctxt->charset = XML_CHAR_ENCODING_UTF8;
287 return(xmlCurrentChar(ctxt, len));
288
289encoding_error:
290 /*
291 * If we detect an UTF8 error that probably mean that the
292 * input encoding didn't get properly advertized in the
293 * declaration header. Report the error and switch the encoding
294 * to ISO-Latin-1 (if you don't like this policy, just declare the
295 * encoding !)
296 */
297 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
298 ctxt->sax->error(ctxt->userData,
299 "Input is not proper UTF-8, indicate encoding !\n");
300 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
301 ctxt->input->cur[0], ctxt->input->cur[1],
302 ctxt->input->cur[2], ctxt->input->cur[3]);
303 }
304 ctxt->errNo = XML_ERR_INVALID_ENCODING;
305
306 ctxt->charset = XML_CHAR_ENCODING_8859_1;
307 *len = 1;
308 return((int) *ctxt->input->cur);
309}
310
Daniel Veillardcf461992000-03-14 18:30:20 +0000311/**
312 * htmlNextChar:
313 * @ctxt: the HTML parser context
314 *
315 * Skip to the next char input char.
316 */
317
318void
319htmlNextChar(htmlParserCtxtPtr ctxt) {
Daniel Veillard3f6f7f62000-06-30 17:58:25 +0000320 if (ctxt->instate == XML_PARSER_EOF)
321 return;
Daniel Veillardcf461992000-03-14 18:30:20 +0000322 if ((*ctxt->input->cur == 0) &&
323 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
324 xmlPopInput(ctxt);
325 } else {
326 if (*(ctxt->input->cur) == '\n') {
327 ctxt->input->line++; ctxt->input->col = 1;
328 } else ctxt->input->col++;
329 ctxt->input->cur++;
330 ctxt->nbChars++;
331 if (*ctxt->input->cur == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 }
334}
335
336/**
337 * htmlSkipBlankChars:
338 * @ctxt: the HTML parser context
339 *
340 * skip all blanks character found at that point in the input streams.
341 *
342 * Returns the number of space chars skipped
343 */
344
345int
346htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
347 int res = 0;
348
349 while (IS_BLANK(*(ctxt->input->cur))) {
350 if ((*ctxt->input->cur == 0) &&
351 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
352 xmlPopInput(ctxt);
353 } else {
354 if (*(ctxt->input->cur) == '\n') {
355 ctxt->input->line++; ctxt->input->col = 1;
356 } else ctxt->input->col++;
357 ctxt->input->cur++;
358 ctxt->nbChars++;
359 if (*ctxt->input->cur == 0)
360 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
361 }
362 res++;
363 }
364 return(res);
365}
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000366
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000367
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000368
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000369/************************************************************************
370 * *
371 * The list of HTML elements and their properties *
372 * *
373 ************************************************************************/
374
/*
 * Table of the known HTML 4.0 elements, searched linearly by name in
 * htmlTagLookup(). Columns, in order:
 *
 *  Name:      element name, in lower case
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *  Empty:     1 for the elements whose end tag is forbidden
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *  Description: short human readable description
 *
 * Name,Start Tag,End Tag,  Empty,  Depr.,    DTD, Description
 */
htmlElemDesc  html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, "anchor " },
{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
{ "acronym", 0, 0, 0, 0, 0, "" },
{ "address", 0, 0, 0, 0, 0, "information on author " },
{ "applet", 0, 0, 0, 1, 1, "java applet " },
{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
{ "b", 0, 0, 0, 0, 0, "bold text style" },
{ "base", 0, 2, 1, 0, 0, "document base uri " },
{ "basefont", 0, 2, 1, 1, 1, "base font size " },
{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
{ "big", 0, 0, 0, 0, 0, "large text style" },
{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
{ "body", 1, 1, 0, 0, 0, "document body " },
{ "br", 0, 2, 1, 0, 0, "forced line break " },
{ "button", 0, 0, 0, 0, 0, "push button " },
{ "caption", 0, 0, 0, 0, 0, "table caption " },
{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
{ "cite", 0, 0, 0, 0, 0, "citation" },
{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
{ "col", 0, 2, 1, 0, 0, "table column " },
{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
{ "dd", 0, 1, 0, 0, 0, "definition description " },
{ "del", 0, 0, 0, 0, 0, "deleted text " },
{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
{ "dir", 0, 0, 0, 1, 1, "directory list" },
{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
{ "dl", 0, 0, 0, 0, 0, "definition list " },
{ "dt", 0, 1, 0, 0, 0, "definition term " },
{ "em", 0, 0, 0, 0, 0, "emphasis" },
{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
{ "font", 0, 0, 0, 1, 1, "local change to font " },
{ "form", 0, 0, 0, 0, 0, "interactive form " },
{ "frame", 0, 2, 1, 0, 2, "subwindow " },
{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
{ "h1", 0, 0, 0, 0, 0, "heading " },
{ "h2", 0, 0, 0, 0, 0, "heading " },
{ "h3", 0, 0, 0, 0, 0, "heading " },
{ "h4", 0, 0, 0, 0, 0, "heading " },
{ "h5", 0, 0, 0, 0, 0, "heading " },
{ "h6", 0, 0, 0, 0, 0, "heading " },
{ "head", 1, 1, 0, 0, 0, "document head " },
{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
{ "html", 1, 1, 0, 0, 0, "document root element " },
{ "i", 0, 0, 0, 0, 0, "italic text style" },
{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
{ "img", 0, 2, 1, 0, 0, "embedded image " },
{ "input", 0, 2, 1, 0, 0, "form control " },
{ "ins", 0, 0, 0, 0, 0, "inserted text" },
{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
{ "label", 0, 0, 0, 0, 0, "form field label text " },
{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
{ "li", 0, 1, 0, 0, 0, "list item " },
{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
{ "map", 0, 0, 0, 0, 0, "client-side image map " },
{ "menu", 0, 0, 0, 1, 1, "menu list " },
{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
{ "ol", 0, 0, 0, 0, 0, "ordered list " },
{ "optgroup", 0, 0, 0, 0, 0, "option group " },
{ "option", 0, 1, 0, 0, 0, "selectable choice " },
{ "p", 0, 1, 0, 0, 0, "paragraph " },
{ "param", 0, 2, 1, 0, 0, "named property value " },
{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
{ "script", 0, 0, 0, 0, 0, "script statements " },
{ "select", 0, 0, 0, 0, 0, "option selector " },
{ "small", 0, 0, 0, 0, 0, "small text style" },
{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
{ "style", 0, 0, 0, 0, 0, "style info " },
{ "sub", 0, 0, 0, 0, 0, "subscript" },
{ "sup", 0, 0, 0, 0, 0, "superscript " },
{ "table", 0, 0, 0, 0, 0, "&#160;" },
{ "tbody", 1, 1, 0, 0, 0, "table body " },
{ "td", 0, 1, 0, 0, 0, "table data cell" },
{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
{ "th", 0, 1, 0, 0, 0, "table header cell" },
{ "thead", 0, 1, 0, 0, 0, "table header " },
{ "title", 0, 0, 0, 0, 0, "document title " },
{ "tr", 0, 1, 0, 0, 0, "table row " },
{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
{ "u", 0, 0, 0, 1, 1, "underlined text style" },
{ "ul", 0, 0, 0, 0, 0, "unordered list " },
{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
};
478
/*
 * start tags that imply the end of a current element
 * any tag of each line implies the end of the current element if the type of
 * that element is in the same line
 *
 * Each line is a NULL-terminated group; an additional NULL ends the table.
 */
char *htmlEquEnd[] = {
"dt", "dd", "li", "option", NULL,
"h1", "h2", "h3", "h4", "h5", "h6", NULL,
"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
NULL
};
/*
 * according to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
495
/*
 * start tags that imply the end of current element
 *
 * The table is a sequence of NULL-terminated groups. The first string of
 * a group is the tag being opened, the following strings are the open
 * elements it implicitly closes. An additional NULL ends the table.
 */
char *htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
 "dl", "ul", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
 "pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
 "xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div", "p", "head", NULL,
"noscript", "p", "head", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
 "listing", "xmp", "a", NULL,
"th", "th", "td", NULL,
"td", "th", "td", "p", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
 "pre", "listing", "xmp", "a", NULL,
NULL
};


/*
 * htmlStartCloseIndex[n] points at the first string of the n-th group of
 * htmlStartClose; unused slots are NULL. It is filled by
 * htmlInitAutoClose(), which records completion in
 * htmlStartCloseIndexinitialized.
 */
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *									*
 * 		functions to handle HTML specific data			*
 *									*
 ************************************************************************/

/**
 * htmlInitAutoClose:
 *
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 * The index is built only once: the first call fills it and sets
 * htmlStartCloseIndexinitialized, later calls return immediately.
 * At most 99 groups are indexed so that the index always keeps a NULL
 * terminator in its last slot.
 */
void
htmlInitAutoClose(void) {
    int slot, i = 0;

    if (htmlStartCloseIndexinitialized) return;

    for (slot = 0; slot < 100; slot++) htmlStartCloseIndex[slot] = NULL;
    slot = 0;
    while ((htmlStartClose[i] != NULL) && (slot < 100 - 1)) {
        htmlStartCloseIndex[slot++] = &htmlStartClose[i];
        /* skip the rest of the group and its terminating NULL */
        while (htmlStartClose[i] != NULL) i++;
        i++;
    }
    /* remember the index is ready so it is not rebuilt on every lookup */
    htmlStartCloseIndexinitialized = 1;
}
585
586/**
587 * htmlTagLookup:
588 * @tag: The tag name
589 *
590 * Lookup the HTML tag in the ElementTable
591 *
592 * Returns the related htmlElemDescPtr or NULL if not found.
593 */
594htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000595htmlTagLookup(const xmlChar *tag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000596 int i = 0;
597
598 for (i = 0; i < (sizeof(html40ElementTable) /
599 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000600 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000601 return(&html40ElementTable[i]);
602 }
603 return(NULL);
604}
605
606/**
607 * htmlCheckAutoClose:
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000608 * @newtag: The new tag name
609 * @oldtag: The old tag name
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000610 *
611 * Checks wether the new tag is one of the registered valid tags for closing old.
612 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
613 *
614 * Returns 0 if no, 1 if yes.
615 */
616int
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000617htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000618 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000619 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000620
621 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
622
623 /* inefficient, but not a big deal */
624 for (index = 0; index < 100;index++) {
625 close = htmlStartCloseIndex[index];
626 if (close == NULL) return(0);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000627 if (!xmlStrcmp(BAD_CAST *close, newtag)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000628 }
629
630 i = close - htmlStartClose;
631 i++;
632 while (htmlStartClose[i] != NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000633 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000634 return(1);
635 }
636 i++;
637 }
638 return(0);
639}
640
641/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000642 * htmlAutoCloseOnClose:
643 * @ctxt: an HTML parser context
644 * @newtag: The new tag name
645 *
646 * The HTmL DtD allows an ending tag to implicitely close other tags.
647 */
648void
649htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
650 htmlElemDescPtr info;
651 xmlChar *oldname;
652 int i;
653
654#ifdef DEBUG
655 fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
656 for (i = 0;i < ctxt->nameNr;i++)
657 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
658#endif
659
660 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
661 if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break;
662 }
663 if (i < 0) return;
664
665 while (xmlStrcmp(newtag, ctxt->name)) {
666 info = htmlTagLookup(ctxt->name);
667 if ((info == NULL) || (info->endTag == 1)) {
668#ifdef DEBUG
669 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
670#endif
671 } else {
672 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
673 ctxt->sax->error(ctxt->userData,
674 "Opening and ending tag mismatch: %s and %s\n",
675 newtag, ctxt->name);
676 ctxt->wellFormed = 0;
677 }
678 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
679 ctxt->sax->endElement(ctxt->userData, ctxt->name);
680 oldname = htmlnamePop(ctxt);
681 if (oldname != NULL) {
682#ifdef DEBUG
683 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
684#endif
685 xmlFree(oldname);
686 }
687 }
688}
689
690/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000691 * htmlAutoClose:
692 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000693 * @newtag: The new tag name or NULL
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000694 *
695 * The HTmL DtD allows a tag to implicitely close other tags.
696 * The list is kept in htmlStartClose array. This function is
697 * called when a new tag has been detected and generates the
698 * appropriates closes if possible/needed.
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000699 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillard365e13b2000-07-02 07:56:37 +0000700 * and we should check
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000701 */
702void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000703htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000704 xmlChar *oldname;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000705 while ((newtag != NULL) && (ctxt->name != NULL) &&
706 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000707#ifdef DEBUG
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000708 fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000709#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000710 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000711 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000712 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000713 if (oldname != NULL) {
714#ifdef DEBUG
715 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
716#endif
717 xmlFree(oldname);
718 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000719 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000720 if (newtag == NULL) {
721 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
722 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
723 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
724 }
725 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard365e13b2000-07-02 07:56:37 +0000726 ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
727 (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
728 (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) {
729#ifdef DEBUG
730 fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
731#endif
732 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
733 ctxt->sax->endElement(ctxt->userData, ctxt->name);
734 oldname = htmlnamePop(ctxt);
735 if (oldname != NULL) {
736#ifdef DEBUG
737 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
738#endif
739 xmlFree(oldname);
740 }
741 }
742
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000743}
744
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000745/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000746 * htmlAutoCloseTag:
747 * @doc: the HTML document
748 * @name: The tag name
749 * @elem: the HTML element
750 *
751 * The HTmL DtD allows a tag to implicitely close other tags.
752 * The list is kept in htmlStartClose array. This function checks
753 * if the element or one of it's children would autoclose the
754 * given tag.
755 *
756 * Returns 1 if autoclose, 0 otherwise
757 */
758int
759htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
760 htmlNodePtr child;
761
762 if (elem == NULL) return(1);
763 if (!xmlStrcmp(name, elem->name)) return(0);
764 if (htmlCheckAutoClose(elem->name, name)) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000765 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000766 while (child != NULL) {
767 if (htmlAutoCloseTag(doc, name, child)) return(1);
768 child = child->next;
769 }
770 return(0);
771}
772
773/**
774 * htmlIsAutoClosed:
775 * @doc: the HTML document
776 * @elem: the HTML element
777 *
778 * The HTmL DtD allows a tag to implicitely close other tags.
779 * The list is kept in htmlStartClose array. This function checks
780 * if a tag is autoclosed by one of it's child
781 *
782 * Returns 1 if autoclosed, 0 otherwise
783 */
784int
785htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
786 htmlNodePtr child;
787
788 if (elem == NULL) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000789 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000790 while (child != NULL) {
791 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
792 child = child->next;
793 }
794 return(0);
795}
796
797/**
Daniel Veillardbe803962000-06-28 23:40:59 +0000798 * htmlCheckImplied:
799 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000800 * @newtag: The new tag name
Daniel Veillardbe803962000-06-28 23:40:59 +0000801 *
802 * The HTmL DtD allows a tag to exists only implicitely
803 * called when a new tag has been detected and generates the
804 * appropriates implicit tags if missing
805 */
806void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000807htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
808 if (!xmlStrcmp(newtag, BAD_CAST"html"))
Daniel Veillardbe803962000-06-28 23:40:59 +0000809 return;
810 if (ctxt->nameNr <= 0) {
811#ifdef DEBUG
812 fprintf(stderr,"Implied element html: pushed html\n");
813#endif
814 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
815 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
816 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
817 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000818 if ((!xmlStrcmp(newtag, BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head")))
Daniel Veillardbe803962000-06-28 23:40:59 +0000819 return;
820 if (ctxt->nameNr <= 1) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000821 if ((!xmlStrcmp(newtag, BAD_CAST"script")) ||
822 (!xmlStrcmp(newtag, BAD_CAST"style")) ||
823 (!xmlStrcmp(newtag, BAD_CAST"meta")) ||
824 (!xmlStrcmp(newtag, BAD_CAST"link")) ||
825 (!xmlStrcmp(newtag, BAD_CAST"title")) ||
826 (!xmlStrcmp(newtag, BAD_CAST"base"))) {
Daniel Veillardbe803962000-06-28 23:40:59 +0000827 /*
828 * dropped OBJECT ... i you put it first BODY will be
829 * assumed !
830 */
831#ifdef DEBUG
832 fprintf(stderr,"Implied element head: pushed head\n");
833#endif
834 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
835 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
836 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
837 } else {
838#ifdef DEBUG
839 fprintf(stderr,"Implied element body: pushed body\n");
840#endif
841 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
842 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
843 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
844 }
845 }
846}
847
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000848/************************************************************************
849 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000850 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000851 * *
852 ************************************************************************/
853
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000854
855htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000856/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000857 * the 4 absolute ones,
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000858 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000859{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
860{ 38, "amp", "ampersand, U+0026 ISOnum" },
861{ 60, "lt", "less-than sign, U+003C ISOnum" },
862{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000863
864/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000865 * A bunch still in the 128-255 range
866 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000867 */
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000868{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000869{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
870{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
871{ 162, "cent", "cent sign, U+00A2 ISOnum" },
872{ 163, "pound","pound sign, U+00A3 ISOnum" },
873{ 164, "curren","currency sign, U+00A4 ISOnum" },
874{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
875{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
876{ 167, "sect", "section sign, U+00A7 ISOnum" },
877{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
878{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
879{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
880{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
881{ 172, "not", "not sign, U+00AC ISOnum" },
882{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
883{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
884{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
885{ 176, "deg", "degree sign, U+00B0 ISOnum" },
886{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
887{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
888{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
889{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
890{ 181, "micro","micro sign, U+00B5 ISOnum" },
891{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000892{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000893{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
894{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
895{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000896{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000897{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
898{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
899{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
900{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
901{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
902{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
903{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
904{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
905{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
906{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
907{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
908{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
909{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
910{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
911{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
912{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
913{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
914{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
915{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
916{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
917{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
918{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
919{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
920{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
921{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
922{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
923{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
924{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000925{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000926{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
927{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
928{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
929{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
930{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
931{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
932{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
933{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
934{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
935{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
936{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
937{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
938{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
939{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
940{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
941{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
942{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
943{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
944{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
945{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
946{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
947{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
948{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
949{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
950{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
951{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
952{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
953{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
954{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
955{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
956{ 247, "divide","division sign, U+00F7 ISOnum" },
957{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
958{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
959{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
960{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
961{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
962{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
963{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
964{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000965
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000966/*
967 * Anything below should really be kept as entities references
968 */
969{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000970
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000971{ 913, "Alpha","greek capital letter alpha, U+0391" },
972{ 914, "Beta", "greek capital letter beta, U+0392" },
973{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
974{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
975{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
976{ 918, "Zeta", "greek capital letter zeta, U+0396" },
977{ 919, "Eta", "greek capital letter eta, U+0397" },
978{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
979{ 921, "Iota", "greek capital letter iota, U+0399" },
980{ 922, "Kappa","greek capital letter kappa, U+039A" },
981{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
982{ 924, "Mu", "greek capital letter mu, U+039C" },
983{ 925, "Nu", "greek capital letter nu, U+039D" },
984{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
985{ 927, "Omicron","greek capital letter omicron, U+039F" },
986{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
987{ 929, "Rho", "greek capital letter rho, U+03A1" },
988{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
989{ 932, "Tau", "greek capital letter tau, U+03A4" },
990{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
991{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
992{ 935, "Chi", "greek capital letter chi, U+03A7" },
993{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
994{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000995
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000996{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
997{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
998{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
999{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1000{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1001{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1002{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1003{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1004{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1005{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1006{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1007{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1008{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1009{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1010{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1011{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1012{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1013{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1014{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1015{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1016{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1017{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1018{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1019{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1020{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1021{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1022{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1023{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001024
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001025{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1026{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1027{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1028{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1029{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1030{ 8260, "frasl","fraction slash, U+2044 NEW" },
1031
Daniel Veillardb05deb71999-08-10 19:04:08 +00001032{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001033{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1034{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1035{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1036{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1037{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1038{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1039{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1040{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1041{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1042{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1043{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1044{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1045{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1046{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1047{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1048
1049
1050{ 8704, "forall","for all, U+2200 ISOtech" },
1051{ 8706, "part", "partial differential, U+2202 ISOtech" },
1052{ 8707, "exist","there exists, U+2203 ISOtech" },
1053{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1054{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1055{ 8712, "isin", "element of, U+2208 ISOtech" },
1056{ 8713, "notin","not an element of, U+2209 ISOtech" },
1057{ 8715, "ni", "contains as member, U+220B ISOtech" },
1058{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1059{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1060{ 8722, "minus","minus sign, U+2212 ISOtech" },
1061{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1062{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1063{ 8733, "prop", "proportional to, U+221D ISOtech" },
1064{ 8734, "infin","infinity, U+221E ISOtech" },
1065{ 8736, "ang", "angle, U+2220 ISOamso" },
1066{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1067{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1068{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1069{ 8746, "cup", "union = cup, U+222A ISOtech" },
1070{ 8747, "int", "integral, U+222B ISOtech" },
1071{ 8756, "there4","therefore, U+2234 ISOtech" },
1072{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1073{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1074{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1075{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1076{ 8801, "equiv","identical to, U+2261 ISOtech" },
1077{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1078{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1079{ 8834, "sub", "subset of, U+2282 ISOtech" },
1080{ 8835, "sup", "superset of, U+2283 ISOtech" },
1081{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1082{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1083{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1084{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1085{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1086{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1087{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1088{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1089{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1090{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1091{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1092{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1093{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1094{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1095
1096{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1097{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1098{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1099{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1100
1101{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1102{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1103{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1104{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1105{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1106{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1107{ 732, "tilde","small tilde, U+02DC ISOdia" },
1108
1109{ 8194, "ensp", "en space, U+2002 ISOpub" },
1110{ 8195, "emsp", "em space, U+2003 ISOpub" },
1111{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1112{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1113{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1114{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1115{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1116{ 8211, "ndash","en dash, U+2013 ISOpub" },
1117{ 8212, "mdash","em dash, U+2014 ISOpub" },
1118{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1119{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1120{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1121{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1122{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1123{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1124{ 8224, "dagger","dagger, U+2020 ISOpub" },
1125{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1126{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1127{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001128{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001129{ 8364, "euro", "euro sign, U+20AC NEW" }
1130};
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001131
1132/************************************************************************
1133 * *
1134 * Commodity functions to handle entities *
1135 * *
1136 ************************************************************************/
1137
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size. The
 * argument must be a plain identifier, since it is token-pasted to form
 * the name of its size variable. On allocation failure the previous
 * block is released (instead of being leaked by overwriting the only
 * pointer to it) and the ENCLOSING function returns NULL, so the macro
 * may only be used inside functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                 buffer##_size * sizeof(xmlChar));	\
    if (buffer##_tmp == NULL) {						\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1149
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001150/**
1151 * htmlEntityLookup:
1152 * @name: the entity name
1153 *
1154 * Lookup the given entity in EntitiesTable
1155 *
1156 * TODO: the linear scan is really ugly, an hash table is really needed.
1157 *
1158 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1159 */
1160htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001161htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001162 int i;
1163
1164 for (i = 0;i < (sizeof(html40EntitiesTable)/
1165 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001166 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001167#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001168 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001169#endif
1170 return(&html40EntitiesTable[i]);
1171 }
1172 }
1173 return(NULL);
1174}
1175
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001176/**
1177 * UTF8ToHtml:
1178 * @out: a pointer to an array of bytes to store the result
1179 * @outlen: the length of @out
1180 * @in: a pointer to an array of UTF-8 chars
1181 * @inlen: the length of @in
1182 *
1183 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1184 * plus HTML entities block of chars out.
1185 *
1186 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1187 * The value of @inlen after return is the number of octets consumed
1188 * as the return value is positive, else unpredictiable.
1189 * The value of @outlen after return is the number of octets consumed.
1190 */
1191int
1192UTF8ToHtml(unsigned char* out, int *outlen,
1193 const unsigned char* in, int *inlen) {
1194 const unsigned char* processed = in;
1195 const unsigned char* outend;
1196 const unsigned char* outstart = out;
1197 const unsigned char* instart = in;
1198 const unsigned char* inend;
1199 unsigned int c, d;
1200 int trailing;
1201
1202 if (in == NULL) {
1203 /*
1204 * initialization nothing to do
1205 */
1206 *outlen = 0;
1207 *inlen = 0;
1208 return(0);
1209 }
1210 inend = in + (*inlen);
1211 outend = out + (*outlen);
1212 while (in < inend) {
1213 d = *in++;
1214 if (d < 0x80) { c= d; trailing= 0; }
1215 else if (d < 0xC0) {
1216 /* trailing byte in leading position */
1217 *outlen = out - outstart;
1218 *inlen = processed - instart;
1219 return(-2);
1220 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1221 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1222 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1223 else {
1224 /* no chance for this in Ascii */
1225 *outlen = out - outstart;
1226 *inlen = processed - instart;
1227 return(-2);
1228 }
1229
1230 if (inend - in < trailing) {
1231 break;
1232 }
1233
1234 for ( ; trailing; trailing--) {
1235 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1236 break;
1237 c <<= 6;
1238 c |= d & 0x3F;
1239 }
1240
1241 /* assertion: c is a single UTF-4 value */
1242 if (c < 0x80) {
1243 if (out >= outend)
1244 break;
1245 *out++ = c;
1246 } else {
1247 int i, j, len;
1248 /*
1249 * Try to lookup a predefined HTML entity for it
1250 */
1251
1252 for (i = 0;i < (sizeof(html40EntitiesTable)/
1253 sizeof(html40EntitiesTable[0]));i++) {
1254 if (html40EntitiesTable[i].value == c) {
1255#ifdef DEBUG
1256 fprintf(stderr,"Found entity %s\n", name);
1257#endif
1258 goto found_ent;
1259 }
1260 if (html40EntitiesTable[i].value > c)
1261 break;
1262 }
1263
1264 /* no chance for this in Ascii */
1265 *outlen = out - outstart;
1266 *inlen = processed - instart;
1267 return(-2);
1268found_ent:
1269 len = strlen(html40EntitiesTable[i].name);
1270 if (out + 2 + len >= outend)
1271 break;
1272 *out++ = '&';
1273 for (j = 0;j < len;j++)
1274 *out++ = html40EntitiesTable[i].name[j];
1275 *out++ = ';';
1276 }
1277 processed = in;
1278 }
1279 *outlen = out - outstart;
1280 *inlen = processed - instart;
1281 return(0);
1282}
1283
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001284
1285/**
1286 * htmlDecodeEntities:
1287 * @ctxt: the parser context
1288 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001289 * @end: an end marker xmlChar, 0 if none
1290 * @end2: an end marker xmlChar, 0 if none
1291 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001292 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001293 * Subtitute the HTML entities by their value
1294 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001295 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001296 *
1297 * Returns A newly allocated string with the substitution done. The caller
1298 * must deallocate it !
1299 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001300xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001301htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001302 xmlChar end, xmlChar end2, xmlChar end3) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001303 xmlChar *name = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001304 xmlChar *buffer = NULL;
1305 unsigned int buffer_size = 0;
1306 unsigned int nbchars = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001307 htmlEntityDescPtr ent;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001308 unsigned int max = (unsigned int) len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001309 int c,l;
1310
1311 if (ctxt->depth > 40) {
1312 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1313 ctxt->sax->error(ctxt->userData,
1314 "Detected entity reference loop\n");
1315 ctxt->wellFormed = 0;
1316 ctxt->disableSAX = 1;
1317 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1318 return(NULL);
1319 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001320
1321 /*
1322 * allocate a translation buffer.
1323 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001324 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001325 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001326 if (buffer == NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001327 perror("xmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001328 return(NULL);
1329 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001330
1331 /*
1332 * Ok loop until we reach one of the ending char or a size limit.
1333 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001334 c = CUR_CHAR(l);
1335 while ((nbchars < max) && (c != end) &&
1336 (c != end2) && (c != end3)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001337
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001338 if (c == 0) break;
1339 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1340 int val = htmlParseCharRef(ctxt);
1341 COPY_BUF(0,buffer,nbchars,val);
1342 NEXTL(l);
1343 } else if ((c == '&') && (ctxt->token != '&')) {
1344 ent = htmlParseEntityRef(ctxt, &name);
1345 if (name != NULL) {
1346 if (ent != NULL) {
1347 int val = ent->value;
1348 COPY_BUF(0,buffer,nbchars,val);
1349 NEXTL(l);
1350 } else {
1351 const xmlChar *cur = name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001352
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001353 buffer[nbchars++] = '&';
1354 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1355 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001356 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001357 while (*cur != 0) {
1358 buffer[nbchars++] = *cur++;
1359 }
1360 buffer[nbchars++] = ';';
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001361 }
1362 }
1363 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001364 COPY_BUF(l,buffer,nbchars,c);
1365 NEXTL(l);
1366 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001367 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001368 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001369 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001370 c = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001371 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001372 buffer[nbchars++] = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001373 return(buffer);
1374}
1375
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001376/************************************************************************
1377 * *
1378 * Commodity functions to handle streams *
1379 * *
1380 ************************************************************************/
1381
1382/**
1383 * htmlFreeInputStream:
1384 * @input: an htmlParserInputPtr
1385 *
1386 * Free up an input stream.
1387 */
1388void
1389htmlFreeInputStream(htmlParserInputPtr input) {
1390 if (input == NULL) return;
1391
1392 if (input->filename != NULL) xmlFree((char *) input->filename);
1393 if (input->directory != NULL) xmlFree((char *) input->directory);
1394 if ((input->free != NULL) && (input->base != NULL))
1395 input->free((xmlChar *) input->base);
1396 if (input->buf != NULL)
1397 xmlFreeParserInputBuffer(input->buf);
1398 memset(input, -1, sizeof(htmlParserInput));
1399 xmlFree(input);
1400}
1401
1402/**
1403 * htmlNewInputStream:
1404 * @ctxt: an HTML parser context
1405 *
1406 * Create a new input stream structure
1407 * Returns the new input stream or NULL
1408 */
1409htmlParserInputPtr
1410htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1411 htmlParserInputPtr input;
1412
1413 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1414 if (input == NULL) {
1415 ctxt->errNo = XML_ERR_NO_MEMORY;
1416 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1417 ctxt->sax->error(ctxt->userData,
1418 "malloc: couldn't allocate a new input stream\n");
1419 ctxt->errNo = XML_ERR_NO_MEMORY;
1420 return(NULL);
1421 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001422 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001423 input->filename = NULL;
1424 input->directory = NULL;
1425 input->base = NULL;
1426 input->cur = NULL;
1427 input->buf = NULL;
1428 input->line = 1;
1429 input->col = 1;
1430 input->buf = NULL;
1431 input->free = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001432 input->version = NULL;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001433 input->consumed = 0;
1434 input->length = 0;
1435 return(input);
1436}
1437
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001438
1439/************************************************************************
1440 * *
1441 * Commodity functions, cleanup needed ? *
1442 * *
1443 ************************************************************************/
1444
1445/**
1446 * areBlanks:
1447 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001448 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001449 * @len: the size of @str
1450 *
1451 * Is this a sequence of blank chars that one can ignore ?
1452 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001453 * Returns 1 if ignorable 0 otherwise.
1454 */
1455
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001456static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001457 int i;
1458 xmlNodePtr lastChild;
1459
1460 for (i = 0;i < len;i++)
1461 if (!(IS_BLANK(str[i]))) return(0);
1462
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001463 if (CUR == 0) return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001464 if (CUR != '<') return(0);
1465 if (ctxt->node == NULL) return(0);
1466 lastChild = xmlGetLastChild(ctxt->node);
1467 if (lastChild == NULL) {
1468 if (ctxt->node->content != NULL) return(0);
1469 } else if (xmlNodeIsText(lastChild))
1470 return(0);
1471 return(1);
1472}
1473
1474/**
1475 * htmlHandleEntity:
1476 * @ctxt: an HTML parser context
1477 * @entity: an XML entity pointer.
1478 *
1479 * Default handling of an HTML entity, call the parser with the
1480 * substitution string
1481 */
1482
1483void
1484htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1485 int len;
1486
1487 if (entity->content == NULL) {
1488 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1489 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1490 entity->name);
1491 ctxt->wellFormed = 0;
1492 return;
1493 }
1494 len = xmlStrlen(entity->content);
1495
1496 /*
1497 * Just handle the content as a set of chars.
1498 */
1499 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1500 ctxt->sax->characters(ctxt->userData, entity->content, len);
1501
1502}
1503
1504/**
1505 * htmlNewDoc:
1506 * @URI: URI for the dtd, or NULL
1507 * @ExternalID: the external ID of the DTD, or NULL
1508 *
1509 * Returns a new document
1510 */
1511htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001512htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001513 xmlDocPtr cur;
1514
1515 /*
1516 * Allocate a new document and fill the fields.
1517 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001518 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001519 if (cur == NULL) {
1520 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1521 return(NULL);
1522 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001523 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001524
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001525 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001526 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001527 cur->intSubset = NULL;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001528 if ((ExternalID == NULL) &&
1529 (URI == NULL))
1530 xmlCreateIntSubset(cur, BAD_CAST "HTML",
1531 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1532 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1533 else
1534 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe803962000-06-28 23:40:59 +00001535 cur->doc = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001536 cur->name = NULL;
Daniel Veillardcf461992000-03-14 18:30:20 +00001537 cur->children = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001538 cur->extSubset = NULL;
1539 cur->oldNs = NULL;
1540 cur->encoding = NULL;
1541 cur->standalone = 1;
1542 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001543 cur->ids = NULL;
1544 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001545#ifndef XML_WITHOUT_CORBA
1546 cur->_private = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001547#endif
1548 return(cur);
1549}
1550
1551
1552/************************************************************************
1553 * *
1554 * The parser itself *
1555 * Relates to http://www.w3.org/TR/html40 *
1556 * *
1557 ************************************************************************/
1558
1559/************************************************************************
1560 * *
1561 * The parser itself *
1562 * *
1563 ************************************************************************/
1564
1565/**
1566 * htmlParseHTMLName:
1567 * @ctxt: an HTML parser context
1568 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001569 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001570 * since HTML names are not case-sensitive.
1571 *
1572 * Returns the Tag Name parsed or NULL
1573 */
1574
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001575xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001576htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001577 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001578 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001579 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001580
1581 if (!IS_LETTER(CUR) && (CUR != '_') &&
1582 (CUR != ':')) return(NULL);
1583
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001584 while ((i < HTML_PARSER_BUFFER_SIZE) &&
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001585 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1586 (CUR == ':') || (CUR == '_'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001587 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001588 else loc[i] = CUR;
1589 i++;
1590
1591 NEXT;
1592 }
1593
1594 ret = xmlStrndup(loc, i);
1595
1596 return(ret);
1597}
1598
1599/**
1600 * htmlParseName:
1601 * @ctxt: an HTML parser context
1602 *
1603 * parse an HTML name, this routine is case sensistive.
1604 *
1605 * Returns the Name parsed or NULL
1606 */
1607
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001608xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001609htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001610 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001611 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001612
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001613 GROW;
1614 if (!IS_LETTER(CUR) && (CUR != '_')) {
1615 return(NULL);
1616 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001617
1618 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1619 (CUR == '.') || (CUR == '-') ||
1620 (CUR == '_') || (CUR == ':') ||
1621 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001622 (IS_EXTENDER(CUR))) {
1623 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001624 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001625 if (len >= HTML_MAX_NAMELEN) {
1626 fprintf(stderr,
1627 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1628 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1629 (CUR == '.') || (CUR == '-') ||
1630 (CUR == '_') || (CUR == ':') ||
1631 (IS_COMBINING(CUR)) ||
1632 (IS_EXTENDER(CUR)))
1633 NEXT;
1634 break;
1635 }
1636 }
1637 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001638}
1639
1640/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001641 * htmlParseHTMLAttribute:
1642 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001643 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001644 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001645 * parse an HTML attribute value till the stop (quote), if
1646 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001647 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001648 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001649 */
1650
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001651xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001652htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001653#if 0
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001654 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001655 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001656
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001657 GROW;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001658 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1659 if ((stop == 0) && (IS_BLANK(CUR))) break;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001660 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001661 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001662 if (len >= HTML_MAX_NAMELEN) {
1663 fprintf(stderr,
1664 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1665 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001666 (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001667 (CUR != '\'') && (CUR != '"'))
1668 NEXT;
1669 break;
1670 }
1671 }
1672 return(xmlStrndup(buf, len));
Daniel Veillard71b656e2000-01-05 14:46:17 +00001673#else
1674 xmlChar *buffer = NULL;
1675 int buffer_size = 0;
1676 xmlChar *out = NULL;
1677 xmlChar *name = NULL;
1678
1679 xmlChar *cur = NULL;
1680 htmlEntityDescPtr ent;
1681
1682 /*
1683 * allocate a translation buffer.
1684 */
1685 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1686 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1687 if (buffer == NULL) {
1688 perror("htmlParseHTMLAttribute: malloc failed");
1689 return(NULL);
1690 }
1691 out = buffer;
1692
1693 /*
1694 * Ok loop until we reach one of the ending chars
1695 */
1696 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1697 if ((stop == 0) && (IS_BLANK(CUR))) break;
1698 if (CUR == '&') {
1699 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001700 unsigned int c;
1701 int bits;
1702
1703 c = htmlParseCharRef(ctxt);
1704 if (c < 0x80)
1705 { *out++ = c; bits= -6; }
1706 else if (c < 0x800)
1707 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1708 else if (c < 0x10000)
1709 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1710 else
1711 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1712
1713 for ( ; bits >= 0; bits-= 6) {
1714 *out++ = ((c >> bits) & 0x3F) | 0x80;
1715 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001716 } else {
1717 ent = htmlParseEntityRef(ctxt, &name);
1718 if (name == NULL) {
1719 *out++ = '&';
1720 if (out - buffer > buffer_size - 100) {
1721 int index = out - buffer;
1722
1723 growBuffer(buffer);
1724 out = &buffer[index];
1725 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001726 } else if (ent == NULL) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001727 *out++ = '&';
1728 cur = name;
1729 while (*cur != 0) {
1730 if (out - buffer > buffer_size - 100) {
1731 int index = out - buffer;
1732
1733 growBuffer(buffer);
1734 out = &buffer[index];
1735 }
1736 *out++ = *cur++;
1737 }
1738 xmlFree(name);
1739 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001740 unsigned int c;
1741 int bits;
1742
Daniel Veillard71b656e2000-01-05 14:46:17 +00001743 if (out - buffer > buffer_size - 100) {
1744 int index = out - buffer;
1745
1746 growBuffer(buffer);
1747 out = &buffer[index];
1748 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001749 c = (xmlChar)ent->value;
1750 if (c < 0x80)
1751 { *out++ = c; bits= -6; }
1752 else if (c < 0x800)
1753 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1754 else if (c < 0x10000)
1755 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1756 else
1757 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1758
1759 for ( ; bits >= 0; bits-= 6) {
1760 *out++ = ((c >> bits) & 0x3F) | 0x80;
1761 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001762 xmlFree(name);
1763 }
1764 }
1765 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001766 unsigned int c;
1767 int bits;
1768
Daniel Veillard71b656e2000-01-05 14:46:17 +00001769 if (out - buffer > buffer_size - 100) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001770 int index = out - buffer;
1771
1772 growBuffer(buffer);
1773 out = &buffer[index];
1774 }
1775 c = CUR;
1776 if (c < 0x80)
1777 { *out++ = c; bits= -6; }
1778 else if (c < 0x800)
1779 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1780 else if (c < 0x10000)
1781 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1782 else
1783 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1784
1785 for ( ; bits >= 0; bits-= 6) {
1786 *out++ = ((c >> bits) & 0x3F) | 0x80;
Daniel Veillard71b656e2000-01-05 14:46:17 +00001787 }
1788 NEXT;
1789 }
1790 }
1791 *out++ = 0;
1792 return(buffer);
1793#endif
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001794}
1795
1796/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001797 * htmlParseNmtoken:
1798 * @ctxt: an HTML parser context
1799 *
1800 * parse an HTML Nmtoken.
1801 *
1802 * Returns the Nmtoken parsed or NULL
1803 */
1804
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001805xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001806htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001807 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001808 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001809
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001810 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001811 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1812 (CUR == '.') || (CUR == '-') ||
1813 (CUR == '_') || (CUR == ':') ||
1814 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001815 (IS_EXTENDER(CUR))) {
1816 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001817 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001818 if (len >= HTML_MAX_NAMELEN) {
1819 fprintf(stderr,
1820 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1821 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1822 (CUR == '.') || (CUR == '-') ||
1823 (CUR == '_') || (CUR == ':') ||
1824 (IS_COMBINING(CUR)) ||
1825 (IS_EXTENDER(CUR)))
1826 NEXT;
1827 break;
1828 }
1829 }
1830 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001831}
1832
1833/**
1834 * htmlParseEntityRef:
1835 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001836 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001837 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001838 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001839 *
1840 * [68] EntityRef ::= '&' Name ';'
1841 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001842 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1843 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001844 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001845htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001846htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1847 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001848 htmlEntityDescPtr ent = NULL;
1849 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001850
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001851 if (CUR == '&') {
1852 NEXT;
1853 name = htmlParseName(ctxt);
1854 if (name == NULL) {
1855 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1856 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1857 ctxt->wellFormed = 0;
1858 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001859 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001860 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001861 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001862
1863 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001864 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001865 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001866 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00001867 if (ent != NULL) /* OK that's ugly !!! */
1868 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001869 } else {
1870 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1871 ctxt->sax->error(ctxt->userData,
1872 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00001873 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001874 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001875 }
1876 }
1877 return(ent);
1878}
1879
1880/**
1881 * htmlParseAttValue:
1882 * @ctxt: an HTML parser context
1883 *
1884 * parse a value for an attribute
1885 * Note: the parser won't do substitution of entities here, this
1886 * will be handled later in xmlStringGetNodeList, unless it was
1887 * asked for ctxt->replaceEntities != 0
1888 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001889 * Returns the AttValue parsed or NULL.
1890 */
1891
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001892xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001893htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001894 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001895
1896 if (CUR == '"') {
1897 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001898 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001899 if (CUR != '"') {
1900 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1901 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1902 ctxt->wellFormed = 0;
1903 } else
1904 NEXT;
1905 } else if (CUR == '\'') {
1906 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001907 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001908 if (CUR != '\'') {
1909 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1910 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1911 ctxt->wellFormed = 0;
1912 } else
1913 NEXT;
1914 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001915 /*
1916 * That's an HTMLism, the attribute value may not be quoted
1917 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001918 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001919 if (ret == NULL) {
1920 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1921 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1922 ctxt->wellFormed = 0;
1923 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001924 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001925 return(ret);
1926}
1927
1928/**
1929 * htmlParseSystemLiteral:
1930 * @ctxt: an HTML parser context
1931 *
1932 * parse an HTML Literal
1933 *
1934 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1935 *
1936 * Returns the SystemLiteral parsed or NULL
1937 */
1938
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001939xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001940htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001941 const xmlChar *q;
1942 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001943
1944 if (CUR == '"') {
1945 NEXT;
1946 q = CUR_PTR;
1947 while ((IS_CHAR(CUR)) && (CUR != '"'))
1948 NEXT;
1949 if (!IS_CHAR(CUR)) {
1950 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1951 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1952 ctxt->wellFormed = 0;
1953 } else {
1954 ret = xmlStrndup(q, CUR_PTR - q);
1955 NEXT;
1956 }
1957 } else if (CUR == '\'') {
1958 NEXT;
1959 q = CUR_PTR;
1960 while ((IS_CHAR(CUR)) && (CUR != '\''))
1961 NEXT;
1962 if (!IS_CHAR(CUR)) {
1963 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1964 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1965 ctxt->wellFormed = 0;
1966 } else {
1967 ret = xmlStrndup(q, CUR_PTR - q);
1968 NEXT;
1969 }
1970 } else {
1971 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardcf461992000-03-14 18:30:20 +00001972 ctxt->sax->error(ctxt->userData,
1973 "SystemLiteral \" or ' expected\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001974 ctxt->wellFormed = 0;
1975 }
1976
1977 return(ret);
1978}
1979
1980/**
1981 * htmlParsePubidLiteral:
1982 * @ctxt: an HTML parser context
1983 *
1984 * parse an HTML public literal
1985 *
1986 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1987 *
1988 * Returns the PubidLiteral parsed or NULL.
1989 */
1990
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001991xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001992htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001993 const xmlChar *q;
1994 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001995 /*
1996 * Name ::= (Letter | '_') (NameChar)*
1997 */
1998 if (CUR == '"') {
1999 NEXT;
2000 q = CUR_PTR;
2001 while (IS_PUBIDCHAR(CUR)) NEXT;
2002 if (CUR != '"') {
2003 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2004 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2005 ctxt->wellFormed = 0;
2006 } else {
2007 ret = xmlStrndup(q, CUR_PTR - q);
2008 NEXT;
2009 }
2010 } else if (CUR == '\'') {
2011 NEXT;
2012 q = CUR_PTR;
2013 while ((IS_LETTER(CUR)) && (CUR != '\''))
2014 NEXT;
2015 if (!IS_LETTER(CUR)) {
2016 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2017 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2018 ctxt->wellFormed = 0;
2019 } else {
2020 ret = xmlStrndup(q, CUR_PTR - q);
2021 NEXT;
2022 }
2023 } else {
2024 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2025 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2026 ctxt->wellFormed = 0;
2027 }
2028
2029 return(ret);
2030}
2031
2032/**
2033 * htmlParseCharData:
2034 * @ctxt: an HTML parser context
2035 * @cdata: int indicating whether we are within a CDATA section
2036 *
2037 * parse a CharData section.
2038 * if we are within a CDATA section ']]>' marks an end of section.
2039 *
2040 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2041 */
2042
2043void
2044htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002045 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2046 int nbchar = 0;
2047 int cur, l;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002048
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002049 SHRINK;
2050 cur = CUR_CHAR(l);
2051 while (((cur != '<') || (ctxt->token == '<')) &&
2052 ((cur != '&') || (ctxt->token == '&')) &&
2053 (IS_CHAR(cur))) {
2054 COPY_BUF(l,buf,nbchar,cur);
2055 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2056 /*
2057 * Ok the segment is to be consumed as chars.
2058 */
2059 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2060 if (areBlanks(ctxt, buf, nbchar)) {
2061 if (ctxt->sax->ignorableWhitespace != NULL)
2062 ctxt->sax->ignorableWhitespace(ctxt->userData,
2063 buf, nbchar);
2064 } else {
2065 if (ctxt->sax->characters != NULL)
2066 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2067 }
2068 }
2069 nbchar = 0;
2070 }
2071 NEXTL(l);
2072 cur = CUR_CHAR(l);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002073 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002074 if (nbchar != 0) {
2075 /*
2076 * Ok the segment is to be consumed as chars.
2077 */
2078 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2079 if (areBlanks(ctxt, buf, nbchar)) {
2080 if (ctxt->sax->ignorableWhitespace != NULL)
2081 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2082 } else {
2083 if (ctxt->sax->characters != NULL)
2084 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002085 }
2086 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002087 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002088}
2089
2090/**
2091 * htmlParseExternalID:
2092 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002093 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002094 * @strict: indicate whether we should restrict parsing to only
2095 * production [75], see NOTE below
2096 *
2097 * Parse an External ID or a Public ID
2098 *
2099 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2100 * 'PUBLIC' S PubidLiteral S SystemLiteral
2101 *
2102 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2103 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2104 *
2105 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2106 *
2107 * Returns the function returns SystemLiteral and in the second
2108 * case publicID receives PubidLiteral, is strict is off
2109 * it is possible to return NULL and have publicID set.
2110 */
2111
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002112xmlChar *
2113htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2114 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002115
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002116 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2117 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2118 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002119 SKIP(6);
2120 if (!IS_BLANK(CUR)) {
2121 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2122 ctxt->sax->error(ctxt->userData,
2123 "Space required after 'SYSTEM'\n");
2124 ctxt->wellFormed = 0;
2125 }
2126 SKIP_BLANKS;
2127 URI = htmlParseSystemLiteral(ctxt);
2128 if (URI == NULL) {
2129 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2130 ctxt->sax->error(ctxt->userData,
2131 "htmlParseExternalID: SYSTEM, no URI\n");
2132 ctxt->wellFormed = 0;
2133 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002134 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2135 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2136 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002137 SKIP(6);
2138 if (!IS_BLANK(CUR)) {
2139 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2140 ctxt->sax->error(ctxt->userData,
2141 "Space required after 'PUBLIC'\n");
2142 ctxt->wellFormed = 0;
2143 }
2144 SKIP_BLANKS;
2145 *publicID = htmlParsePubidLiteral(ctxt);
2146 if (*publicID == NULL) {
2147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2148 ctxt->sax->error(ctxt->userData,
2149 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2150 ctxt->wellFormed = 0;
2151 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002152 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002153 if ((CUR == '"') || (CUR == '\'')) {
2154 URI = htmlParseSystemLiteral(ctxt);
2155 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002156 }
2157 return(URI);
2158}
2159
2160/**
2161 * htmlParseComment:
2162 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002163 *
2164 * Parse an XML (SGML) comment <!-- .... -->
2165 *
2166 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2167 */
2168void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002169htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002170 xmlChar *buf = NULL;
2171 int len = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002172 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002173 register xmlChar s, r, q;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002174
2175 /*
2176 * Check that there is a comment right here.
2177 */
2178 if ((CUR != '<') || (NXT(1) != '!') ||
2179 (NXT(2) != '-') || (NXT(3) != '-')) return;
2180
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002181 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2182 if (buf == NULL) {
2183 fprintf(stderr, "malloc of %d byte failed\n", size);
2184 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002185 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002186 q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
2187 SKIP(4);
2188 s = CUR;
2189
2190 while (IS_CHAR(s) &&
2191 ((s != '>') || (r != '-') || (q != '-'))) {
2192 if (len + 1 >= size) {
2193 size *= 2;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002194 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002195 if (buf == NULL) {
2196 fprintf(stderr, "realloc of %d byte failed\n", size);
2197 return;
2198 }
2199 }
2200 buf[len++] = s;
2201 NEXT;
2202 q = r;
2203 r = s;
2204 s = CUR;
2205 }
2206 buf[len - 2] = 0;
2207 if (!IS_CHAR(s)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002208 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002209 ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002210 ctxt->wellFormed = 0;
2211 } else {
2212 NEXT;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002213 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
2214 ctxt->sax->comment(ctxt->userData, buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002215 }
2216 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002217 xmlFree(buf);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002218}
2219
2220/**
2221 * htmlParseCharRef:
2222 * @ctxt: an HTML parser context
2223 *
2224 * parse Reference declarations
2225 *
2226 * [66] CharRef ::= '&#' [0-9]+ ';' |
2227 * '&#x' [0-9a-fA-F]+ ';'
2228 *
2229 * Returns the value parsed (as an int)
2230 */
2231int
2232htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2233 int val = 0;
2234
2235 if ((CUR == '&') && (NXT(1) == '#') &&
2236 (NXT(2) == 'x')) {
2237 SKIP(3);
2238 while (CUR != ';') {
2239 if ((CUR >= '0') && (CUR <= '9'))
2240 val = val * 16 + (CUR - '0');
2241 else if ((CUR >= 'a') && (CUR <= 'f'))
2242 val = val * 16 + (CUR - 'a') + 10;
2243 else if ((CUR >= 'A') && (CUR <= 'F'))
2244 val = val * 16 + (CUR - 'A') + 10;
2245 else {
2246 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2247 ctxt->sax->error(ctxt->userData,
2248 "htmlParseCharRef: invalid hexadecimal value\n");
2249 ctxt->wellFormed = 0;
2250 val = 0;
2251 break;
2252 }
2253 NEXT;
2254 }
2255 if (CUR == ';')
2256 NEXT;
2257 } else if ((CUR == '&') && (NXT(1) == '#')) {
2258 SKIP(2);
2259 while (CUR != ';') {
2260 if ((CUR >= '0') && (CUR <= '9'))
2261 val = val * 10 + (CUR - '0');
2262 else {
2263 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2264 ctxt->sax->error(ctxt->userData,
2265 "htmlParseCharRef: invalid decimal value\n");
2266 ctxt->wellFormed = 0;
2267 val = 0;
2268 break;
2269 }
2270 NEXT;
2271 }
2272 if (CUR == ';')
2273 NEXT;
2274 } else {
2275 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2276 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2277 ctxt->wellFormed = 0;
2278 }
2279 /*
2280 * Check the value IS_CHAR ...
2281 */
2282 if (IS_CHAR(val)) {
2283 return(val);
2284 } else {
2285 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002286 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002287 val);
2288 ctxt->wellFormed = 0;
2289 }
2290 return(0);
2291}
2292
2293
2294/**
2295 * htmlParseDocTypeDecl :
2296 * @ctxt: an HTML parser context
2297 *
2298 * parse a DOCTYPE declaration
2299 *
2300 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2301 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2302 */
2303
2304void
2305htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002306 xmlChar *name;
2307 xmlChar *ExternalID = NULL;
2308 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002309
2310 /*
2311 * We know that '<!DOCTYPE' has been detected.
2312 */
2313 SKIP(9);
2314
2315 SKIP_BLANKS;
2316
2317 /*
2318 * Parse the DOCTYPE name.
2319 */
2320 name = htmlParseName(ctxt);
2321 if (name == NULL) {
2322 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2323 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2324 ctxt->wellFormed = 0;
2325 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002326 /*
2327 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2328 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002329
2330 SKIP_BLANKS;
2331
2332 /*
2333 * Check for SystemID and ExternalID
2334 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002335 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002336 SKIP_BLANKS;
2337
2338 /*
2339 * We should be at the end of the DOCTYPE declaration.
2340 */
2341 if (CUR != '>') {
2342 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2343 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2344 ctxt->wellFormed = 0;
2345 /* We shouldn't try to resynchronize ... */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002346 }
2347 NEXT;
2348
2349 /*
Daniel Veillardd83eb822000-06-30 18:39:56 +00002350 * Create or update the document accordingly to the DOCTYPE
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002351 */
Daniel Veillardd83eb822000-06-30 18:39:56 +00002352 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2353 (!ctxt->disableSAX))
2354 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002355
2356 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002357 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002358 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002359 if (URI != NULL) xmlFree(URI);
2360 if (ExternalID != NULL) xmlFree(ExternalID);
2361 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002362}
2363
2364/**
2365 * htmlParseAttribute:
2366 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002367 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002368 *
2369 * parse an attribute
2370 *
2371 * [41] Attribute ::= Name Eq AttValue
2372 *
2373 * [25] Eq ::= S? '=' S?
2374 *
2375 * With namespace:
2376 *
2377 * [NS 11] Attribute ::= QName Eq AttValue
2378 *
2379 * Also the case QName == xmlns:??? is handled independently as a namespace
2380 * definition.
2381 *
2382 * Returns the attribute name, and the value in *value.
2383 */
2384
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002385xmlChar *
2386htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002387 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002388
2389 *value = NULL;
2390 name = htmlParseName(ctxt);
2391 if (name == NULL) {
2392 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2393 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2394 ctxt->wellFormed = 0;
2395 return(NULL);
2396 }
2397
2398 /*
2399 * read the value
2400 */
2401 SKIP_BLANKS;
2402 if (CUR == '=') {
2403 NEXT;
2404 SKIP_BLANKS;
2405 val = htmlParseAttValue(ctxt);
Daniel Veillardbe803962000-06-28 23:40:59 +00002406 /******
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002407 } else {
Daniel Veillardbe803962000-06-28 23:40:59 +00002408 * TODO : some attribute must have values, some may not
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002409 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002410 ctxt->sax->warning(ctxt->userData,
Daniel Veillardbe803962000-06-28 23:40:59 +00002411 "No value for attribute %s\n", name); */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002412 }
2413
2414 *value = val;
2415 return(name);
2416}
2417
2418/**
Daniel Veillard365e13b2000-07-02 07:56:37 +00002419 * htmlCheckEncoding:
2420 * @ctxt: an HTML parser context
2421 * @attvalue: the attribute value
2422 *
2423 * Checks an http-equiv attribute from a Meta tag to detect
2424 * the encoding
2425 * If a new encoding is detected the parser is switched to decode
2426 * it and pass UTF8
2427 */
2428void
2429htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2430 const xmlChar *encoding;
2431
2432 if ((ctxt == NULL) || (attvalue == NULL))
2433 return;
2434
Daniel Veillard365e13b2000-07-02 07:56:37 +00002435 encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
2436 if (encoding == NULL)
2437 encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
2438 if (encoding == NULL)
2439 encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
2440 if (encoding != NULL) {
2441 encoding += 8;
2442 } else {
2443 encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
2444 if (encoding == NULL)
2445 encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
2446 if (encoding == NULL)
2447 encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
2448 if (encoding != NULL)
2449 encoding += 9;
2450 }
2451 if (encoding != NULL) {
2452 xmlCharEncoding enc;
2453 xmlCharEncodingHandlerPtr handler;
2454
2455 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2456
2457 if (ctxt->input->encoding != NULL)
2458 xmlFree((xmlChar *) ctxt->input->encoding);
2459 ctxt->input->encoding = xmlStrdup(encoding);
2460
2461 enc = xmlParseCharEncoding((const char *) encoding);
2462 /*
2463 * registered set of known encodings
2464 */
2465 if (enc != XML_CHAR_ENCODING_ERROR) {
2466 xmlSwitchEncoding(ctxt, enc);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002467 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002468 } else {
2469 /*
2470 * fallback for unknown encodings
2471 */
2472 handler = xmlFindCharEncodingHandler((const char *) encoding);
2473 if (handler != NULL) {
2474 xmlSwitchToEncoding(ctxt, handler);
2475 } else {
2476 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2477 }
2478 }
2479 }
2480}
2481
2482/**
2483 * htmlCheckMeta:
2484 * @ctxt: an HTML parser context
2485 * @atts: the attributes values
2486 *
2487 * Checks an attributes from a Meta tag
2488 */
2489void
2490htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2491 int i;
2492 const xmlChar *att, *value;
2493 int http = 0;
2494 const xmlChar *content = NULL;
2495
2496 if ((ctxt == NULL) || (atts == NULL))
2497 return;
2498
2499 i = 0;
2500 att = atts[i++];
2501 while (att != NULL) {
2502 value = atts[i++];
2503 if ((value != NULL) &&
2504 ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
2505 (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
2506 (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
2507 ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
2508 (!xmlStrcmp(value, BAD_CAST"content-type")) ||
2509 (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
2510 http = 1;
2511 else if ((value != NULL) &&
2512 ((!xmlStrcmp(att, BAD_CAST"content")) ||
2513 (!xmlStrcmp(att, BAD_CAST"Content")) ||
2514 (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
2515 content = value;
2516 att = atts[i++];
2517 }
2518 if ((http) && (content != NULL))
2519 htmlCheckEncoding(ctxt, content);
2520
2521}
2522
2523/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002524 * htmlParseStartTag:
2525 * @ctxt: an HTML parser context
2526 *
2527 * parse a start of tag either for rule element or
2528 * EmptyElement. In both case we don't parse the tag closing chars.
2529 *
2530 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2531 *
2532 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2533 *
2534 * With namespace:
2535 *
2536 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2537 *
2538 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2539 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002540 */
2541
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002542void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002543htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002544 xmlChar *name;
2545 xmlChar *attname;
2546 xmlChar *attvalue;
2547 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002548 int nbatts = 0;
2549 int maxatts = 0;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002550 int meta = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002551 int i;
2552
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002553 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002554 NEXT;
2555
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002556 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002557 name = htmlParseHTMLName(ctxt);
2558 if (name == NULL) {
2559 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2560 ctxt->sax->error(ctxt->userData,
2561 "htmlParseStartTag: invalid element name\n");
2562 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002563 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002564 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002565 if (!xmlStrcmp(name, BAD_CAST"meta"))
2566 meta = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002567
2568 /*
2569 * Check for auto-closure of HTML elements.
2570 */
2571 htmlAutoClose(ctxt, name);
2572
2573 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00002574 * Check for implied HTML elements.
2575 */
2576 htmlCheckImplied(ctxt, name);
2577
2578 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002579 * Now parse the attributes, it ends up with the ending
2580 *
2581 * (S Attribute)* S?
2582 */
2583 SKIP_BLANKS;
2584 while ((IS_CHAR(CUR)) &&
2585 (CUR != '>') &&
2586 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002587 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002588
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002589 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002590 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002591 if (attname != NULL) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00002592
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002593 /*
2594 * Well formedness requires at most one declaration of an attribute
2595 */
2596 for (i = 0; i < nbatts;i += 2) {
2597 if (!xmlStrcmp(atts[i], attname)) {
2598 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002599 ctxt->sax->error(ctxt->userData,
2600 "Attribute %s redefined\n",
2601 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002602 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002603 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002604 if (attvalue != NULL)
2605 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002606 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002607 }
2608 }
2609
2610 /*
2611 * Add the pair to atts
2612 */
2613 if (atts == NULL) {
2614 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002615 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002616 if (atts == NULL) {
2617 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002618 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002619 if (name != NULL) xmlFree(name);
2620 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002621 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002622 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002623 maxatts *= 2;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002624 atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002625 if (atts == NULL) {
2626 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002627 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002628 if (name != NULL) xmlFree(name);
2629 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002630 }
2631 }
2632 atts[nbatts++] = attname;
2633 atts[nbatts++] = attvalue;
2634 atts[nbatts] = NULL;
2635 atts[nbatts + 1] = NULL;
2636 }
2637
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002638failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002639 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002640 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002641 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2642 ctxt->sax->error(ctxt->userData,
2643 "htmlParseStartTag: problem parsing attributes\n");
2644 ctxt->wellFormed = 0;
2645 break;
2646 }
2647 }
2648
2649 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00002650 * Handle specific association to the META tag
2651 */
2652 if (meta)
2653 htmlCheckMeta(ctxt, atts);
2654
2655 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002656 * SAX: Start of Element !
2657 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002658 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002659#ifdef DEBUG
2660 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2661#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002662 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2663 ctxt->sax->startElement(ctxt->userData, name, atts);
2664
2665 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002666 for (i = 0;i < nbatts;i++) {
2667 if (atts[i] != NULL)
2668 xmlFree((xmlChar *) atts[i]);
2669 }
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00002670 xmlFree((void *) atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002671 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002672 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002673}
2674
2675/**
2676 * htmlParseEndTag:
2677 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002678 *
2679 * parse an end of tag
2680 *
2681 * [42] ETag ::= '</' Name S? '>'
2682 *
2683 * With namespace
2684 *
2685 * [NS 9] ETag ::= '</' QName S? '>'
2686 */
2687
2688void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002689htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002690 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002691 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002692 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002693
2694 if ((CUR != '<') || (NXT(1) != '/')) {
2695 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2696 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2697 ctxt->wellFormed = 0;
2698 return;
2699 }
2700 SKIP(2);
2701
2702 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002703 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002704
2705 /*
2706 * We should definitely be at the ending "S? '>'" part
2707 */
2708 SKIP_BLANKS;
2709 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2710 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2711 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2712 ctxt->wellFormed = 0;
2713 } else
2714 NEXT;
2715
2716 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002717 * If the name read is not one of the element in the parsing stack
2718 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002719 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002720 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2721 if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002722 }
2723 if (i < 0) {
2724 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002725 ctxt->sax->error(ctxt->userData,
2726 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002727 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002728 ctxt->wellFormed = 0;
2729 return;
2730 }
2731
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002732
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002733 /*
2734 * Check for auto-closure of HTML elements.
2735 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002736
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002737 htmlAutoCloseOnClose(ctxt, name);
2738
2739 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002740 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002741 * With the exception that the autoclose may have popped stuff out
2742 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002743 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002744 if (xmlStrcmp(name, ctxt->name)) {
2745#ifdef DEBUG
2746 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2747#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002748 if ((ctxt->name != NULL) &&
2749 (xmlStrcmp(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002750 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2751 ctxt->sax->error(ctxt->userData,
2752 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002753 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002754 ctxt->wellFormed = 0;
2755 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002756 }
2757
2758 /*
2759 * SAX: End of Tag
2760 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002761 oldname = ctxt->name;
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002762 if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002763 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2764 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002765 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002766 if (oldname != NULL) {
2767#ifdef DEBUG
2768 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2769#endif
2770 xmlFree(oldname);
2771#ifdef DEBUG
2772 } else {
2773 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2774#endif
2775 }
2776 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002777
2778 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00002779 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002780
2781 return;
2782}
2783
2784
2785/**
2786 * htmlParseReference:
2787 * @ctxt: an HTML parser context
2788 *
2789 * parse and handle entity references in content,
2790 * this will end-up in a call to character() since this is either a
2791 * CharRef, or a predefined entity.
2792 */
2793void
2794htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002795 htmlEntityDescPtr ent;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002796 xmlChar out[6];
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002797 xmlChar *name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002798 if (CUR != '&') return;
2799
2800 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002801 unsigned int c;
2802 int bits, i = 0;
2803
2804 c = htmlParseCharRef(ctxt);
2805 if (c < 0x80) { out[i++]= c; bits= -6; }
2806 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2807 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2808 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
2809
2810 for ( ; bits >= 0; bits-= 6) {
2811 out[i++]= ((c >> bits) & 0x3F) | 0x80;
2812 }
2813 out[i] = 0;
2814
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002815 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002816 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002817 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002818 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002819 if (name == NULL) {
2820 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
2821 return;
2822 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002823 if ((ent == NULL) || (ent->value <= 0)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002824 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002825 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002826 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00002827 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002828 }
2829 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002830 unsigned int c;
2831 int bits, i = 0;
2832
2833 c = ent->value;
2834 if (c < 0x80)
2835 { out[i++]= c; bits= -6; }
2836 else if (c < 0x800)
2837 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2838 else if (c < 0x10000)
2839 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2840 else
2841 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
2842
2843 for ( ; bits >= 0; bits-= 6) {
2844 out[i++]= ((c >> bits) & 0x3F) | 0x80;
2845 }
2846 out[i] = 0;
2847
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002848 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002849 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002850 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002851 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002852 }
2853}
2854
2855/**
2856 * htmlParseContent:
2857 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002858 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002859 *
2860 * Parse a content: comment, sub-element, reference or text.
2861 *
2862 */
2863
2864void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002865htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002866 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002867 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002868
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002869 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002870 depth = ctxt->nameNr;
2871 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002872 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002873
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002874 GROW;
2875 /*
2876 * Our tag or one of it's parent or children is ending.
2877 */
2878 if ((CUR == '<') && (NXT(1) == '/')) {
2879 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002880 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002881 return;
2882 }
2883
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002884 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002885 * Has this node been popped out during parsing of
2886 * the next element
2887 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002888 if ((xmlStrcmp(currentNode, ctxt->name)) &&
2889 (depth >= ctxt->nameNr)) {
2890 if (currentNode != NULL) xmlFree(currentNode);
2891 return;
2892 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002893
2894 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002895 * First case : a comment
2896 */
2897 if ((CUR == '<') && (NXT(1) == '!') &&
2898 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002899 htmlParseComment(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002900 }
2901
2902 /*
2903 * Second case : a sub-element.
2904 */
2905 else if (CUR == '<') {
2906 htmlParseElement(ctxt);
2907 }
2908
2909 /*
2910 * Third case : a reference. If if has not been resolved,
2911 * parsing returns it's Name, create the node
2912 */
2913 else if (CUR == '&') {
2914 htmlParseReference(ctxt);
2915 }
2916
2917 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00002918 * Fourth : end of the resource
2919 */
2920 else if (CUR == 0) {
2921 htmlAutoClose(ctxt, NULL);
2922 }
2923
2924 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002925 * Last case, text. Note that References are handled directly.
2926 */
2927 else {
2928 htmlParseCharData(ctxt, 0);
2929 }
2930
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002931 if (cons == ctxt->nbChars) {
Daniel Veillard35008381999-10-25 13:15:52 +00002932 if (ctxt->node != NULL) {
2933 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2934 ctxt->sax->error(ctxt->userData,
2935 "detected an error in element content\n");
2936 ctxt->wellFormed = 0;
2937 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002938 break;
2939 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002940
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002941 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002942 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002943 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002944}
2945
2946/**
2947 * htmlParseElement:
2948 * @ctxt: an HTML parser context
2949 *
2950 * parse an HTML element, this is highly recursive
2951 *
2952 * [39] element ::= EmptyElemTag | STag content ETag
2953 *
2954 * [41] Attribute ::= Name Eq AttValue
2955 */
2956
2957void
2958htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002959 const xmlChar *openTag = CUR_PTR;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002960 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00002961 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002962 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002963 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002964 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002965 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002966
2967 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00002968 if (ctxt->record_info) {
2969 node_info.begin_pos = ctxt->input->consumed +
2970 (CUR_PTR - ctxt->input->base);
2971 node_info.begin_line = ctxt->input->line;
2972 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002973
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002974 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002975 htmlParseStartTag(ctxt);
2976 name = ctxt->name;
2977#ifdef DEBUG
2978 if (oldname == NULL)
2979 fprintf(stderr, "Start of element %s\n", name);
2980 else if (name == NULL)
2981 fprintf(stderr, "Start of element failed, was %s\n", oldname);
2982 else
2983 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2984#endif
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002985 if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002986 (name == NULL)) {
2987 if (CUR == '>')
2988 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002989 if (oldname != NULL)
2990 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002991 return;
2992 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002993 if (oldname != NULL)
2994 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002995
2996 /*
2997 * Lookup the info for that element.
2998 */
2999 info = htmlTagLookup(name);
3000 if (info == NULL) {
3001 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3002 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3003 name);
3004 ctxt->wellFormed = 0;
3005 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003006/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003007 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3008 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3009 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003010 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003011 }
3012
3013 /*
3014 * Check for an Empty Element labelled the XML/SGML way
3015 */
3016 if ((CUR == '/') && (NXT(1) == '>')) {
3017 SKIP(2);
3018 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3019 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003020 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003021#ifdef DEBUG
3022 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
3023#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003024 if (oldname != NULL)
3025 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003026 return;
3027 }
3028
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003029 if (CUR == '>') {
3030 NEXT;
3031 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003032 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3033 ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
3034 openTag);
3035 ctxt->wellFormed = 0;
3036
3037 /*
3038 * end of parsing of this node.
3039 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003040 if (!xmlStrcmp(name, ctxt->name)) {
3041 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003042 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003043#ifdef DEBUG
3044 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
3045#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003046 if (oldname != NULL)
3047 xmlFree(oldname);
3048 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003049
3050 /*
3051 * Capture end position and add node
3052 */
3053 if ( currentNode != NULL && ctxt->record_info ) {
3054 node_info.end_pos = ctxt->input->consumed +
3055 (CUR_PTR - ctxt->input->base);
3056 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003057 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003058 xmlParserAddNodeInfo(ctxt, &node_info);
3059 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003060 return;
3061 }
3062
3063 /*
3064 * Check for an Empty Element from DTD definition
3065 */
3066 if ((info != NULL) && (info->empty)) {
3067 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3068 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003069 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003070#ifdef DEBUG
3071 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3072#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003073 if (oldname != NULL)
3074 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003075 return;
3076 }
3077
3078 /*
3079 * Parse the content of the element:
3080 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003081 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003082 depth = ctxt->nameNr;
3083 while (IS_CHAR(CUR)) {
3084 htmlParseContent(ctxt);
3085 if (ctxt->nameNr < depth) break;
3086 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003087
3088 if (!IS_CHAR(CUR)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003089 /************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003090 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3091 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003092 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003093 ctxt->wellFormed = 0;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003094 *************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003095
3096 /*
3097 * end of parsing of this node.
3098 */
3099 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003100 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003101#ifdef DEBUG
3102 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
3103#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003104 if (oldname != NULL)
3105 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003106 if (currentNode != NULL)
3107 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003108 return;
3109 }
3110
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003111 /*
3112 * Capture end position and add node
3113 */
3114 if ( currentNode != NULL && ctxt->record_info ) {
3115 node_info.end_pos = ctxt->input->consumed +
3116 (CUR_PTR - ctxt->input->base);
3117 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003118 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003119 xmlParserAddNodeInfo(ctxt, &node_info);
3120 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003121 if (currentNode != NULL)
3122 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003123}
3124
3125/**
3126 * htmlParseDocument :
3127 * @ctxt: an HTML parser context
3128 *
3129 * parse an HTML document (and build a tree if using the standard SAX
3130 * interface).
3131 *
3132 * Returns 0, -1 in case of error. the parser context is augmented
3133 * as a result of the parsing.
3134 */
3135
3136int
3137htmlParseDocument(htmlParserCtxtPtr ctxt) {
3138 htmlDefaultSAXHandlerInit();
3139 ctxt->html = 1;
3140
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003141 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003142 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00003143 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003144 */
3145 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3146 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3147
3148 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003149 * Wipe out everything which is before the first '<'
3150 */
Daniel Veillard35008381999-10-25 13:15:52 +00003151 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003152 if (CUR == 0) {
3153 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3154 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3155 ctxt->wellFormed = 0;
3156 }
3157
Daniel Veillardbe803962000-06-28 23:40:59 +00003158 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3159 ctxt->sax->startDocument(ctxt->userData);
3160
3161
Daniel Veillard35008381999-10-25 13:15:52 +00003162 /*
3163 * Parse possible comments before any content
3164 */
3165 while ((CUR == '<') && (NXT(1) == '!') &&
3166 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003167 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003168 SKIP_BLANKS;
3169 }
3170
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003171
3172 /*
3173 * Then possibly doc type declaration(s) and more Misc
3174 * (doctypedecl Misc*)?
3175 */
3176 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003177 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3178 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3179 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3180 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003181 htmlParseDocTypeDecl(ctxt);
3182 }
3183 SKIP_BLANKS;
3184
3185 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003186 * Time to start parsing the tree itself
3187 */
Daniel Veillard35008381999-10-25 13:15:52 +00003188 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003189
3190 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003191 * autoclose
3192 */
3193 if (CUR == 0)
3194 htmlAutoClose(ctxt, NULL);
3195
3196
3197 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003198 * SAX: end of the document processing.
3199 */
3200 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3201 ctxt->sax->endDocument(ctxt->userData);
3202 if (! ctxt->wellFormed) return(-1);
3203 return(0);
3204}
3205
3206
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003207/************************************************************************
3208 * *
3209 * Parser contexts handling *
3210 * *
3211 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003212
3213/**
3214 * xmlInitParserCtxt:
3215 * @ctxt: an HTML parser context
3216 *
3217 * Initialize a parser context
3218 */
3219
3220void
3221htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3222{
3223 htmlSAXHandler *sax;
3224
Daniel Veillard35008381999-10-25 13:15:52 +00003225 if (ctxt == NULL) return;
3226 memset(ctxt, 0, sizeof(htmlParserCtxt));
3227
Daniel Veillard6454aec1999-09-02 22:04:43 +00003228 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003229 if (sax == NULL) {
3230 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3231 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003232 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003233
3234 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003235 ctxt->inputTab = (htmlParserInputPtr *)
3236 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3237 if (ctxt->inputTab == NULL) {
3238 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3239 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003240 ctxt->inputNr = 0;
3241 ctxt->inputMax = 5;
3242 ctxt->input = NULL;
3243 ctxt->version = NULL;
3244 ctxt->encoding = NULL;
3245 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003246 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003247
3248 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00003249 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003250 ctxt->nodeNr = 0;
3251 ctxt->nodeMax = 10;
3252 ctxt->node = NULL;
3253
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003254 /* Allocate the Name stack */
3255 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3256 ctxt->nameNr = 0;
3257 ctxt->nameMax = 10;
3258 ctxt->name = NULL;
3259
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003260 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3261 else {
3262 ctxt->sax = sax;
3263 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3264 }
3265 ctxt->userData = ctxt;
3266 ctxt->myDoc = NULL;
3267 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003268 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003269 ctxt->html = 1;
3270 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00003271 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003272 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003273 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003274 xmlInitNodeInfoSeq(&ctxt->node_seq);
3275}
3276
3277/**
3278 * htmlFreeParserCtxt:
3279 * @ctxt: an HTML parser context
3280 *
3281 * Free all the memory used by a parser context. However the parsed
3282 * document in ctxt->myDoc is not freed.
3283 */
3284
3285void
3286htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3287{
Daniel Veillard365e13b2000-07-02 07:56:37 +00003288 xmlFreeParserCtxt(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003289}
3290
3291/**
3292 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003293 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003294 * @encoding: a free form C string describing the HTML document encoding, or NULL
3295 *
3296 * Create a parser context for an HTML document.
3297 *
3298 * Returns the new parser context or NULL
3299 */
3300htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003301htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003302 htmlParserCtxtPtr ctxt;
3303 htmlParserInputPtr input;
3304 /* htmlCharEncoding enc; */
3305
Daniel Veillard6454aec1999-09-02 22:04:43 +00003306 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003307 if (ctxt == NULL) {
3308 perror("malloc");
3309 return(NULL);
3310 }
3311 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003312 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003313 if (input == NULL) {
3314 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003315 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003316 return(NULL);
3317 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003318 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003319
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003320 input->line = 1;
3321 input->col = 1;
3322 input->base = cur;
3323 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003324
3325 inputPush(ctxt, input);
3326 return(ctxt);
3327}
3328
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003329/************************************************************************
3330 * *
3331 * Progressive parsing interfaces *
3332 * *
3333 ************************************************************************/
3334
3335/**
3336 * htmlParseLookupSequence:
3337 * @ctxt: an HTML parser context
3338 * @first: the first char to lookup
3339 * @next: the next char to lookup or zero
3340 * @third: the next char to lookup or zero
3341 *
3342 * Try to find if a sequence (first, next, third) or just (first next) or
3343 * (first) is available in the input stream.
3344 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3345 * to avoid rescanning sequences of bytes, it DOES change the state of the
3346 * parser, do not use liberally.
3347 * This is basically similar to xmlParseLookupSequence()
3348 *
3349 * Returns the index to the current parsing point if the full sequence
3350 * is available, -1 otherwise.
3351 */
3352int
3353htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3354 xmlChar next, xmlChar third) {
3355 int base, len;
3356 htmlParserInputPtr in;
3357 const xmlChar *buf;
3358
3359 in = ctxt->input;
3360 if (in == NULL) return(-1);
3361 base = in->cur - in->base;
3362 if (base < 0) return(-1);
3363 if (ctxt->checkIndex > base)
3364 base = ctxt->checkIndex;
3365 if (in->buf == NULL) {
3366 buf = in->base;
3367 len = in->length;
3368 } else {
3369 buf = in->buf->buffer->content;
3370 len = in->buf->buffer->use;
3371 }
3372 /* take into account the sequence length */
3373 if (third) len -= 2;
3374 else if (next) len --;
3375 for (;base < len;base++) {
3376 if (buf[base] == first) {
3377 if (third != 0) {
3378 if ((buf[base + 1] != next) ||
3379 (buf[base + 2] != third)) continue;
3380 } else if (next != 0) {
3381 if (buf[base + 1] != next) continue;
3382 }
3383 ctxt->checkIndex = 0;
3384#ifdef DEBUG_PUSH
3385 if (next == 0)
3386 fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3387 first, base);
3388 else if (third == 0)
3389 fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3390 first, next, base);
3391 else
3392 fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3393 first, next, third, base);
3394#endif
3395 return(base - (in->cur - in->base));
3396 }
3397 }
3398 ctxt->checkIndex = base;
3399#ifdef DEBUG_PUSH
3400 if (next == 0)
3401 fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3402 else if (third == 0)
3403 fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3404 else
3405 fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3406#endif
3407 return(-1);
3408}
3409
3410/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003411 * htmlParseTryOrFinish:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003412 * @ctxt: an HTML parser context
Daniel Veillard71b656e2000-01-05 14:46:17 +00003413 * @terminate: last chunk indicator
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003414 *
3415 * Try to progress on parsing
3416 *
3417 * Returns zero if no parsing was possible
3418 */
3419int
Daniel Veillard71b656e2000-01-05 14:46:17 +00003420htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003421 int ret = 0;
3422 htmlParserInputPtr in;
Daniel Veillard365e13b2000-07-02 07:56:37 +00003423 int avail = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003424 xmlChar cur, next;
3425
3426#ifdef DEBUG_PUSH
3427 switch (ctxt->instate) {
3428 case XML_PARSER_EOF:
3429 fprintf(stderr, "HPP: try EOF\n"); break;
3430 case XML_PARSER_START:
3431 fprintf(stderr, "HPP: try START\n"); break;
3432 case XML_PARSER_MISC:
3433 fprintf(stderr, "HPP: try MISC\n");break;
3434 case XML_PARSER_COMMENT:
3435 fprintf(stderr, "HPP: try COMMENT\n");break;
3436 case XML_PARSER_PROLOG:
3437 fprintf(stderr, "HPP: try PROLOG\n");break;
3438 case XML_PARSER_START_TAG:
3439 fprintf(stderr, "HPP: try START_TAG\n");break;
3440 case XML_PARSER_CONTENT:
3441 fprintf(stderr, "HPP: try CONTENT\n");break;
3442 case XML_PARSER_CDATA_SECTION:
3443 fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3444 case XML_PARSER_END_TAG:
3445 fprintf(stderr, "HPP: try END_TAG\n");break;
3446 case XML_PARSER_ENTITY_DECL:
3447 fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3448 case XML_PARSER_ENTITY_VALUE:
3449 fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3450 case XML_PARSER_ATTRIBUTE_VALUE:
3451 fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3452 case XML_PARSER_DTD:
3453 fprintf(stderr, "HPP: try DTD\n");break;
3454 case XML_PARSER_EPILOG:
3455 fprintf(stderr, "HPP: try EPILOG\n");break;
3456 case XML_PARSER_PI:
3457 fprintf(stderr, "HPP: try PI\n");break;
3458 }
3459#endif
3460
3461 while (1) {
3462
3463 in = ctxt->input;
3464 if (in == NULL) break;
3465 if (in->buf == NULL)
3466 avail = in->length - (in->cur - in->base);
3467 else
3468 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard365e13b2000-07-02 07:56:37 +00003469 if ((avail == 0) && (terminate)) {
3470 htmlAutoClose(ctxt, NULL);
3471 if (ctxt->nameNr == 0)
3472 ctxt->instate = XML_PARSER_EOF;
3473 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003474 if (avail < 1)
3475 goto done;
3476 switch (ctxt->instate) {
3477 case XML_PARSER_EOF:
3478 /*
3479 * Document parsing is done !
3480 */
3481 goto done;
3482 case XML_PARSER_START:
3483 /*
3484 * Very first chars read from the document flow.
3485 */
3486 cur = in->cur[0];
3487 if (IS_BLANK(cur)) {
3488 SKIP_BLANKS;
3489 if (in->buf == NULL)
3490 avail = in->length - (in->cur - in->base);
3491 else
3492 avail = in->buf->buffer->use - (in->cur - in->base);
3493 }
3494 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3495 ctxt->sax->setDocumentLocator(ctxt->userData,
3496 &xmlDefaultSAXLocator);
Daniel Veillardd83eb822000-06-30 18:39:56 +00003497 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3498 (!ctxt->disableSAX))
3499 ctxt->sax->startDocument(ctxt->userData);
3500
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003501 cur = in->cur[0];
3502 next = in->cur[1];
3503 if ((cur == '<') && (next == '!') &&
3504 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3505 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3506 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3507 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003508 if ((!terminate) &&
3509 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003510 goto done;
3511#ifdef DEBUG_PUSH
3512 fprintf(stderr, "HPP: Parsing internal subset\n");
3513#endif
3514 htmlParseDocTypeDecl(ctxt);
3515 ctxt->instate = XML_PARSER_PROLOG;
3516#ifdef DEBUG_PUSH
3517 fprintf(stderr, "HPP: entering PROLOG\n");
3518#endif
3519 } else {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003520 ctxt->instate = XML_PARSER_MISC;
3521 }
3522#ifdef DEBUG_PUSH
3523 fprintf(stderr, "HPP: entering MISC\n");
3524#endif
3525 break;
3526 case XML_PARSER_MISC:
3527 SKIP_BLANKS;
3528 if (in->buf == NULL)
3529 avail = in->length - (in->cur - in->base);
3530 else
3531 avail = in->buf->buffer->use - (in->cur - in->base);
3532 if (avail < 2)
3533 goto done;
3534 cur = in->cur[0];
3535 next = in->cur[1];
3536 if ((cur == '<') && (next == '!') &&
3537 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003538 if ((!terminate) &&
3539 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003540 goto done;
3541#ifdef DEBUG_PUSH
3542 fprintf(stderr, "HPP: Parsing Comment\n");
3543#endif
3544 htmlParseComment(ctxt);
3545 ctxt->instate = XML_PARSER_MISC;
3546 } else if ((cur == '<') && (next == '!') &&
3547 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3548 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3549 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3550 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003551 if ((!terminate) &&
3552 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003553 goto done;
3554#ifdef DEBUG_PUSH
3555 fprintf(stderr, "HPP: Parsing internal subset\n");
3556#endif
3557 htmlParseDocTypeDecl(ctxt);
3558 ctxt->instate = XML_PARSER_PROLOG;
3559#ifdef DEBUG_PUSH
3560 fprintf(stderr, "HPP: entering PROLOG\n");
3561#endif
3562 } else if ((cur == '<') && (next == '!') &&
3563 (avail < 9)) {
3564 goto done;
3565 } else {
3566 ctxt->instate = XML_PARSER_START_TAG;
3567#ifdef DEBUG_PUSH
3568 fprintf(stderr, "HPP: entering START_TAG\n");
3569#endif
3570 }
3571 break;
3572 case XML_PARSER_PROLOG:
3573 SKIP_BLANKS;
3574 if (in->buf == NULL)
3575 avail = in->length - (in->cur - in->base);
3576 else
3577 avail = in->buf->buffer->use - (in->cur - in->base);
3578 if (avail < 2)
3579 goto done;
3580 cur = in->cur[0];
3581 next = in->cur[1];
3582 if ((cur == '<') && (next == '!') &&
3583 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003584 if ((!terminate) &&
3585 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003586 goto done;
3587#ifdef DEBUG_PUSH
3588 fprintf(stderr, "HPP: Parsing Comment\n");
3589#endif
3590 htmlParseComment(ctxt);
3591 ctxt->instate = XML_PARSER_PROLOG;
3592 } else if ((cur == '<') && (next == '!') &&
3593 (avail < 4)) {
3594 goto done;
3595 } else {
3596 ctxt->instate = XML_PARSER_START_TAG;
3597#ifdef DEBUG_PUSH
3598 fprintf(stderr, "HPP: entering START_TAG\n");
3599#endif
3600 }
3601 break;
3602 case XML_PARSER_EPILOG:
3603 SKIP_BLANKS;
3604 if (in->buf == NULL)
3605 avail = in->length - (in->cur - in->base);
3606 else
3607 avail = in->buf->buffer->use - (in->cur - in->base);
3608 if (avail < 2)
3609 goto done;
3610 cur = in->cur[0];
3611 next = in->cur[1];
3612 if ((cur == '<') && (next == '!') &&
3613 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003614 if ((!terminate) &&
3615 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003616 goto done;
3617#ifdef DEBUG_PUSH
3618 fprintf(stderr, "HPP: Parsing Comment\n");
3619#endif
3620 htmlParseComment(ctxt);
3621 ctxt->instate = XML_PARSER_EPILOG;
3622 } else if ((cur == '<') && (next == '!') &&
3623 (avail < 4)) {
3624 goto done;
3625 } else {
3626 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3627 ctxt->sax->error(ctxt->userData,
3628 "Extra content at the end of the document\n");
3629 ctxt->wellFormed = 0;
3630 ctxt->errNo = XML_ERR_DOCUMENT_END;
3631 ctxt->instate = XML_PARSER_EOF;
3632#ifdef DEBUG_PUSH
3633 fprintf(stderr, "HPP: entering EOF\n");
3634#endif
3635 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3636 ctxt->sax->endDocument(ctxt->userData);
3637 goto done;
3638 }
3639 break;
3640 case XML_PARSER_START_TAG: {
3641 xmlChar *name, *oldname;
3642 int depth = ctxt->nameNr;
3643 htmlElemDescPtr info;
3644
3645 if (avail < 2)
3646 goto done;
3647 cur = in->cur[0];
3648 if (cur != '<') {
3649 ctxt->instate = XML_PARSER_CONTENT;
3650#ifdef DEBUG_PUSH
3651 fprintf(stderr, "HPP: entering CONTENT\n");
3652#endif
3653 break;
3654 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00003655 if ((!terminate) &&
3656 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003657 goto done;
3658
3659 oldname = xmlStrdup(ctxt->name);
3660 htmlParseStartTag(ctxt);
3661 name = ctxt->name;
3662#ifdef DEBUG
3663 if (oldname == NULL)
3664 fprintf(stderr, "Start of element %s\n", name);
3665 else if (name == NULL)
3666 fprintf(stderr, "Start of element failed, was %s\n",
3667 oldname);
3668 else
3669 fprintf(stderr, "Start of element %s, was %s\n",
3670 name, oldname);
3671#endif
3672 if (((depth == ctxt->nameNr) &&
3673 (!xmlStrcmp(oldname, ctxt->name))) ||
3674 (name == NULL)) {
3675 if (CUR == '>')
3676 NEXT;
3677 if (oldname != NULL)
3678 xmlFree(oldname);
3679 break;
3680 }
3681 if (oldname != NULL)
3682 xmlFree(oldname);
3683
3684 /*
3685 * Lookup the info for that element.
3686 */
3687 info = htmlTagLookup(name);
3688 if (info == NULL) {
3689 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3690 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3691 name);
3692 ctxt->wellFormed = 0;
3693 } else if (info->depr) {
3694 /***************************
3695 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3696 ctxt->sax->warning(ctxt->userData,
3697 "Tag %s is deprecated\n",
3698 name);
3699 ***************************/
3700 }
3701
3702 /*
3703 * Check for an Empty Element labelled the XML/SGML way
3704 */
3705 if ((CUR == '/') && (NXT(1) == '>')) {
3706 SKIP(2);
3707 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3708 ctxt->sax->endElement(ctxt->userData, name);
3709 oldname = htmlnamePop(ctxt);
3710#ifdef DEBUG
3711 fprintf(stderr,"End of tag the XML way: popping out %s\n",
3712 oldname);
3713#endif
3714 if (oldname != NULL)
3715 xmlFree(oldname);
3716 ctxt->instate = XML_PARSER_CONTENT;
3717#ifdef DEBUG_PUSH
3718 fprintf(stderr, "HPP: entering CONTENT\n");
3719#endif
3720 break;
3721 }
3722
3723 if (CUR == '>') {
3724 NEXT;
3725 } else {
3726 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3727 ctxt->sax->error(ctxt->userData,
3728 "Couldn't find end of Start Tag %s\n",
3729 name);
3730 ctxt->wellFormed = 0;
3731
3732 /*
3733 * end of parsing of this node.
3734 */
3735 if (!xmlStrcmp(name, ctxt->name)) {
3736 nodePop(ctxt);
3737 oldname = htmlnamePop(ctxt);
3738#ifdef DEBUG
3739 fprintf(stderr,
3740 "End of start tag problem: popping out %s\n", oldname);
3741#endif
3742 if (oldname != NULL)
3743 xmlFree(oldname);
3744 }
3745
3746 ctxt->instate = XML_PARSER_CONTENT;
3747#ifdef DEBUG_PUSH
3748 fprintf(stderr, "HPP: entering CONTENT\n");
3749#endif
3750 break;
3751 }
3752
3753 /*
3754 * Check for an Empty Element from DTD definition
3755 */
3756 if ((info != NULL) && (info->empty)) {
3757 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3758 ctxt->sax->endElement(ctxt->userData, name);
3759 oldname = htmlnamePop(ctxt);
3760#ifdef DEBUG
3761 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3762#endif
3763 if (oldname != NULL)
3764 xmlFree(oldname);
3765 }
3766 ctxt->instate = XML_PARSER_CONTENT;
3767#ifdef DEBUG_PUSH
3768 fprintf(stderr, "HPP: entering CONTENT\n");
3769#endif
3770 break;
3771 }
3772 case XML_PARSER_CONTENT:
3773 /*
3774 * Handle preparsed entities and charRef
3775 */
3776 if (ctxt->token != 0) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00003777 xmlChar chr[2] = { 0 , 0 } ;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003778
Daniel Veillard365e13b2000-07-02 07:56:37 +00003779 chr[0] = (xmlChar) ctxt->token;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003780 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard365e13b2000-07-02 07:56:37 +00003781 ctxt->sax->characters(ctxt->userData, chr, 1);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003782 ctxt->token = 0;
3783 ctxt->checkIndex = 0;
3784 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00003785 if ((avail == 1) && (terminate)) {
3786 cur = in->cur[0];
3787 if ((cur != '<') && (cur != '&')) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003788 if (ctxt->sax != NULL) {
3789 if (IS_BLANK(cur)) {
3790 if (ctxt->sax->ignorableWhitespace != NULL)
3791 ctxt->sax->ignorableWhitespace(
3792 ctxt->userData, &cur, 1);
3793 } else {
3794 if (ctxt->sax->characters != NULL)
3795 ctxt->sax->characters(
3796 ctxt->userData, &cur, 1);
3797 }
3798 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00003799 ctxt->token = 0;
3800 ctxt->checkIndex = 0;
3801 NEXT;
3802 }
3803 break;
3804 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003805 if (avail < 2)
3806 goto done;
3807 cur = in->cur[0];
3808 next = in->cur[1];
3809 if ((cur == '<') && (next == '!') &&
3810 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003811 if ((!terminate) &&
3812 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003813 goto done;
3814#ifdef DEBUG_PUSH
3815 fprintf(stderr, "HPP: Parsing Comment\n");
3816#endif
3817 htmlParseComment(ctxt);
3818 ctxt->instate = XML_PARSER_CONTENT;
3819 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
3820 goto done;
3821 } else if ((cur == '<') && (next == '/')) {
3822 ctxt->instate = XML_PARSER_END_TAG;
3823 ctxt->checkIndex = 0;
3824#ifdef DEBUG_PUSH
3825 fprintf(stderr, "HPP: entering END_TAG\n");
3826#endif
3827 break;
3828 } else if (cur == '<') {
3829 ctxt->instate = XML_PARSER_START_TAG;
3830 ctxt->checkIndex = 0;
3831#ifdef DEBUG_PUSH
3832 fprintf(stderr, "HPP: entering START_TAG\n");
3833#endif
3834 break;
3835 } else if (cur == '&') {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003836 if ((!terminate) &&
3837 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003838 goto done;
3839#ifdef DEBUG_PUSH
3840 fprintf(stderr, "HPP: Parsing Reference\n");
3841#endif
3842 /* TODO: check generation of subtrees if noent !!! */
3843 htmlParseReference(ctxt);
3844 } else {
3845 /* TODO Avoid the extra copy, handle directly !!!!!! */
3846 /*
3847 * Goal of the following test is :
3848 * - minimize calls to the SAX 'character' callback
3849 * when they are mergeable
3850 */
3851 if ((ctxt->inputNr == 1) &&
3852 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003853 if ((!terminate) &&
3854 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003855 goto done;
3856 }
3857 ctxt->checkIndex = 0;
3858#ifdef DEBUG_PUSH
3859 fprintf(stderr, "HPP: Parsing char data\n");
3860#endif
3861 htmlParseCharData(ctxt, 0);
3862 }
3863 break;
3864 case XML_PARSER_END_TAG:
3865 if (avail < 2)
3866 goto done;
Daniel Veillard71b656e2000-01-05 14:46:17 +00003867 if ((!terminate) &&
3868 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003869 goto done;
3870 htmlParseEndTag(ctxt);
3871 if (ctxt->nameNr == 0) {
3872 ctxt->instate = XML_PARSER_EPILOG;
3873 } else {
3874 ctxt->instate = XML_PARSER_CONTENT;
3875 }
3876 ctxt->checkIndex = 0;
3877#ifdef DEBUG_PUSH
3878 fprintf(stderr, "HPP: entering CONTENT\n");
3879#endif
3880 break;
3881 case XML_PARSER_CDATA_SECTION:
3882 fprintf(stderr, "HPP: internal error, state == CDATA\n");
3883 ctxt->instate = XML_PARSER_CONTENT;
3884 ctxt->checkIndex = 0;
3885#ifdef DEBUG_PUSH
3886 fprintf(stderr, "HPP: entering CONTENT\n");
3887#endif
3888 break;
3889 case XML_PARSER_DTD:
3890 fprintf(stderr, "HPP: internal error, state == DTD\n");
3891 ctxt->instate = XML_PARSER_CONTENT;
3892 ctxt->checkIndex = 0;
3893#ifdef DEBUG_PUSH
3894 fprintf(stderr, "HPP: entering CONTENT\n");
3895#endif
3896 break;
3897 case XML_PARSER_COMMENT:
3898 fprintf(stderr, "HPP: internal error, state == COMMENT\n");
3899 ctxt->instate = XML_PARSER_CONTENT;
3900 ctxt->checkIndex = 0;
3901#ifdef DEBUG_PUSH
3902 fprintf(stderr, "HPP: entering CONTENT\n");
3903#endif
3904 break;
3905 case XML_PARSER_PI:
3906 fprintf(stderr, "HPP: internal error, state == PI\n");
3907 ctxt->instate = XML_PARSER_CONTENT;
3908 ctxt->checkIndex = 0;
3909#ifdef DEBUG_PUSH
3910 fprintf(stderr, "HPP: entering CONTENT\n");
3911#endif
3912 break;
3913 case XML_PARSER_ENTITY_DECL:
3914 fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
3915 ctxt->instate = XML_PARSER_CONTENT;
3916 ctxt->checkIndex = 0;
3917#ifdef DEBUG_PUSH
3918 fprintf(stderr, "HPP: entering CONTENT\n");
3919#endif
3920 break;
3921 case XML_PARSER_ENTITY_VALUE:
3922 fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
3923 ctxt->instate = XML_PARSER_CONTENT;
3924 ctxt->checkIndex = 0;
3925#ifdef DEBUG_PUSH
3926 fprintf(stderr, "HPP: entering DTD\n");
3927#endif
3928 break;
3929 case XML_PARSER_ATTRIBUTE_VALUE:
3930 fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
3931 ctxt->instate = XML_PARSER_START_TAG;
3932 ctxt->checkIndex = 0;
3933#ifdef DEBUG_PUSH
3934 fprintf(stderr, "HPP: entering START_TAG\n");
3935#endif
3936 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003937 case XML_PARSER_SYSTEM_LITERAL:
3938 fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
3939 ctxt->instate = XML_PARSER_CONTENT;
3940 ctxt->checkIndex = 0;
3941#ifdef DEBUG_PUSH
3942 fprintf(stderr, "HPP: entering CONTENT\n");
3943#endif
3944 break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003945 }
3946 }
3947done:
Daniel Veillard365e13b2000-07-02 07:56:37 +00003948 if ((avail == 0) && (terminate)) {
3949 htmlAutoClose(ctxt, NULL);
3950 if (ctxt->nameNr == 0)
3951 ctxt->instate = XML_PARSER_EOF;
3952 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003953#ifdef DEBUG_PUSH
3954 fprintf(stderr, "HPP: done %d\n", ret);
3955#endif
3956 return(ret);
3957}
3958
3959/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003960 * htmlParseTry:
3961 * @ctxt: an HTML parser context
3962 *
3963 * Try to progress on parsing
3964 *
3965 * Returns zero if no parsing was possible
3966 */
3967int
3968htmlParseTry(htmlParserCtxtPtr ctxt) {
3969 return(htmlParseTryOrFinish(ctxt, 0));
3970}
3971
3972/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003973 * htmlParseChunk:
3974 * @ctxt: an XML parser context
3975 * @chunk: an char array
3976 * @size: the size in byte of the chunk
3977 * @terminate: last chunk indicator
3978 *
3979 * Parse a Chunk of memory
3980 *
3981 * Returns zero if no error, the xmlParserErrors otherwise.
3982 */
3983int
3984htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
3985 int terminate) {
3986 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3987 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
3988 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
3989 int cur = ctxt->input->cur - ctxt->input->base;
3990
3991 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3992 ctxt->input->base = ctxt->input->buf->buffer->content + base;
3993 ctxt->input->cur = ctxt->input->base + cur;
3994#ifdef DEBUG_PUSH
3995 fprintf(stderr, "HPP: pushed %d\n", size);
3996#endif
3997
Daniel Veillardd0f7f742000-02-02 17:42:48 +00003998 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
3999 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004000 } else if (ctxt->instate != XML_PARSER_EOF)
Daniel Veillard71b656e2000-01-05 14:46:17 +00004001 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004002 if (terminate) {
4003 if ((ctxt->instate != XML_PARSER_EOF) &&
4004 (ctxt->instate != XML_PARSER_EPILOG) &&
4005 (ctxt->instate != XML_PARSER_MISC)) {
4006 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4007 ctxt->sax->error(ctxt->userData,
4008 "Extra content at the end of the document\n");
4009 ctxt->wellFormed = 0;
4010 ctxt->errNo = XML_ERR_DOCUMENT_END;
4011 }
4012 if (ctxt->instate != XML_PARSER_EOF) {
4013 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4014 ctxt->sax->endDocument(ctxt->userData);
4015 }
4016 ctxt->instate = XML_PARSER_EOF;
4017 }
4018 return((xmlParserErrors) ctxt->errNo);
4019}
4020
4021/************************************************************************
4022 * *
4023 * User entry points *
4024 * *
4025 ************************************************************************/
4026
4027/**
4028 * htmlCreatePushParserCtxt :
4029 * @sax: a SAX handler
4030 * @user_data: The user data returned on SAX callbacks
4031 * @chunk: a pointer to an array of chars
4032 * @size: number of chars in the array
4033 * @filename: an optional file name or URI
4034 * @enc: an optional encoding
4035 *
4036 * Create a parser context for using the HTML parser in push mode
4037 * To allow content encoding detection, @size should be >= 4
4038 * The value of @filename is used for fetching external entities
4039 * and error/warning reports.
4040 *
4041 * Returns the new parser context or NULL
4042 */
4043htmlParserCtxtPtr
4044htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4045 const char *chunk, int size, const char *filename,
4046 xmlCharEncoding enc) {
4047 htmlParserCtxtPtr ctxt;
4048 htmlParserInputPtr inputStream;
4049 xmlParserInputBufferPtr buf;
4050
4051 buf = xmlAllocParserInputBuffer(enc);
4052 if (buf == NULL) return(NULL);
4053
4054 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4055 if (ctxt == NULL) {
4056 xmlFree(buf);
4057 return(NULL);
4058 }
4059 memset(ctxt, 0, sizeof(htmlParserCtxt));
4060 htmlInitParserCtxt(ctxt);
4061 if (sax != NULL) {
4062 if (ctxt->sax != &htmlDefaultSAXHandler)
4063 xmlFree(ctxt->sax);
4064 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4065 if (ctxt->sax == NULL) {
4066 xmlFree(buf);
4067 xmlFree(ctxt);
4068 return(NULL);
4069 }
4070 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4071 if (user_data != NULL)
4072 ctxt->userData = user_data;
4073 }
4074 if (filename == NULL) {
4075 ctxt->directory = NULL;
4076 } else {
4077 ctxt->directory = xmlParserGetDirectory(filename);
4078 }
4079
4080 inputStream = htmlNewInputStream(ctxt);
4081 if (inputStream == NULL) {
4082 xmlFreeParserCtxt(ctxt);
4083 return(NULL);
4084 }
4085
4086 if (filename == NULL)
4087 inputStream->filename = NULL;
4088 else
4089 inputStream->filename = xmlMemStrdup(filename);
4090 inputStream->buf = buf;
4091 inputStream->base = inputStream->buf->buffer->content;
4092 inputStream->cur = inputStream->buf->buffer->content;
4093
4094 inputPush(ctxt, inputStream);
4095
4096 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4097 (ctxt->input->buf != NULL)) {
4098 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4099#ifdef DEBUG_PUSH
4100 fprintf(stderr, "HPP: pushed %d\n", size);
4101#endif
4102 }
4103
4104 return(ctxt);
4105}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004106
4107/**
4108 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004109 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004110 * @encoding: a free form C string describing the HTML document encoding, or NULL
4111 * @sax: the SAX handler block
4112 * @userData: if using SAX, this pointer will be provided on callbacks.
4113 *
4114 * parse an HTML in-memory document and build a tree.
4115 * It use the given SAX function block to handle the parsing callback.
4116 * If sax is NULL, fallback to the default DOM tree building routines.
4117 *
4118 * Returns the resulting document tree
4119 */
4120
4121htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004122htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004123 htmlDocPtr ret;
4124 htmlParserCtxtPtr ctxt;
4125
4126 if (cur == NULL) return(NULL);
4127
4128
4129 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4130 if (ctxt == NULL) return(NULL);
4131 if (sax != NULL) {
4132 ctxt->sax = sax;
4133 ctxt->userData = userData;
4134 }
4135
4136 htmlParseDocument(ctxt);
4137 ret = ctxt->myDoc;
4138 if (sax != NULL) {
4139 ctxt->sax = NULL;
4140 ctxt->userData = NULL;
4141 }
4142 htmlFreeParserCtxt(ctxt);
4143
4144 return(ret);
4145}
4146
4147/**
4148 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004149 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004150 * @encoding: a free form C string describing the HTML document encoding, or NULL
4151 *
4152 * parse an HTML in-memory document and build a tree.
4153 *
4154 * Returns the resulting document tree
4155 */
4156
4157htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004158htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004159 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4160}
4161
4162
4163/**
4164 * htmlCreateFileParserCtxt :
4165 * @filename: the filename
4166 * @encoding: a free form C string describing the HTML document encoding, or NULL
4167 *
4168 * Create a parser context for a file content.
4169 * Automatic support for ZLIB/Compress compressed document is provided
4170 * by default if found at compile-time.
4171 *
4172 * Returns the new parser context or NULL
4173 */
4174htmlParserCtxtPtr
4175htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4176{
4177 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004178 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004179 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004180 /* htmlCharEncoding enc; */
4181
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004182 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4183 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004184
Daniel Veillard6454aec1999-09-02 22:04:43 +00004185 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004186 if (ctxt == NULL) {
4187 perror("malloc");
4188 return(NULL);
4189 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004190 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004191 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00004192 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004193 if (inputStream == NULL) {
4194 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00004195 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004196 return(NULL);
4197 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004198 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004199
Daniel Veillard6454aec1999-09-02 22:04:43 +00004200 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004201 inputStream->line = 1;
4202 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004203 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00004204 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004205
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004206 inputStream->base = inputStream->buf->buffer->content;
4207 inputStream->cur = inputStream->buf->buffer->content;
4208 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004209
4210 inputPush(ctxt, inputStream);
4211 return(ctxt);
4212}
4213
4214/**
4215 * htmlSAXParseFile :
4216 * @filename: the filename
4217 * @encoding: a free form C string describing the HTML document encoding, or NULL
4218 * @sax: the SAX handler block
4219 * @userData: if using SAX, this pointer will be provided on callbacks.
4220 *
4221 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4222 * compressed document is provided by default if found at compile-time.
4223 * It use the given SAX function block to handle the parsing callback.
4224 * If sax is NULL, fallback to the default DOM tree building routines.
4225 *
4226 * Returns the resulting document tree
4227 */
4228
4229htmlDocPtr
4230htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4231 void *userData) {
4232 htmlDocPtr ret;
4233 htmlParserCtxtPtr ctxt;
4234
4235 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4236 if (ctxt == NULL) return(NULL);
4237 if (sax != NULL) {
4238 ctxt->sax = sax;
4239 ctxt->userData = userData;
4240 }
4241
4242 htmlParseDocument(ctxt);
4243
4244 ret = ctxt->myDoc;
4245 if (sax != NULL) {
4246 ctxt->sax = NULL;
4247 ctxt->userData = NULL;
4248 }
4249 htmlFreeParserCtxt(ctxt);
4250
4251 return(ret);
4252}
4253
4254/**
4255 * htmlParseFile :
4256 * @filename: the filename
4257 * @encoding: a free form C string describing the HTML document encoding, or NULL
4258 *
4259 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4260 * compressed document is provided by default if found at compile-time.
4261 *
4262 * Returns the resulting document tree
4263 */
4264
4265htmlDocPtr
4266htmlParseFile(const char *filename, const char *encoding) {
4267 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4268}
Daniel Veillard361d8452000-04-03 19:48:13 +00004269
4270#endif /* LIBXML_HTML_ENABLED */