blob: b0b43bd63d5cb4c42d4bccbe31e3101ffc937ede [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillard361d8452000-04-03 19:48:13 +000015#include "xmlversion.h"
16#ifdef LIBXML_HTML_ENABLED
17
Daniel Veillardbe70ff71999-07-05 16:50:46 +000018#include <stdio.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000019#include <string.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000020#ifdef HAVE_CTYPE_H
21#include <ctype.h>
22#endif
23#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000024#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000025#endif
26#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000027#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000028#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000029#ifdef HAVE_FCNTL_H
30#include <fcntl.h>
31#endif
32#ifdef HAVE_UNISTD_H
33#include <unistd.h>
34#endif
35#ifdef HAVE_ZLIB_H
36#include <zlib.h>
37#endif
38
Daniel Veillard361d8452000-04-03 19:48:13 +000039#include <libxml/xmlmemory.h>
40#include <libxml/tree.h>
41#include <libxml/HTMLparser.h>
42#include <libxml/entities.h>
43#include <libxml/encoding.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000044#include <libxml/parser.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000045#include <libxml/valid.h>
46#include <libxml/parserInternals.h>
47#include <libxml/xmlIO.h>
Daniel Veillard5e5c6231999-12-29 12:49:06 +000048#include "xml-error.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000049
50#define HTML_MAX_NAMELEN 1000
51#define INPUT_CHUNK 50
Daniel Veillard32bc74e2000-07-14 14:49:25 +000052#define HTML_PARSER_BIG_BUFFER_SIZE 1000
Daniel Veillard5e5c6231999-12-29 12:49:06 +000053#define HTML_PARSER_BUFFER_SIZE 100
Daniel Veillardbe70ff71999-07-05 16:50:46 +000054
Daniel Veillard82150d81999-07-07 07:32:15 +000055/* #define DEBUG */
Daniel Veillard5e5c6231999-12-29 12:49:06 +000056/* #define DEBUG_PUSH */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000057
58/************************************************************************
59 * *
60 * Parser stacks related functions and macros *
61 * *
62 ************************************************************************/
63
64/*
65 * Generic function for accessing stacks in the Parser Context
66 */
67
Daniel Veillarddbfd6411999-12-28 16:35:14 +000068#define PUSH_AND_POP(scope, type, name) \
69scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000070 if (ctxt->name##Nr >= ctxt->name##Max) { \
71 ctxt->name##Max *= 2; \
Daniel Veillard32bc74e2000-07-14 14:49:25 +000072 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000073 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74 if (ctxt->name##Tab == NULL) { \
75 fprintf(stderr, "realloc failed !\n"); \
Daniel Veillard0142b842000-01-14 14:45:24 +000076 return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000077 } \
78 } \
79 ctxt->name##Tab[ctxt->name##Nr] = value; \
80 ctxt->name = value; \
81 return(ctxt->name##Nr++); \
82} \
Daniel Veillarddbfd6411999-12-28 16:35:14 +000083scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000084 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000085 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000086 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000087 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000088 if (ctxt->name##Nr > 0) \
89 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
90 else \
91 ctxt->name = NULL; \
92 ret = ctxt->name##Tab[ctxt->name##Nr]; \
93 ctxt->name##Tab[ctxt->name##Nr] = 0; \
94 return(ret); \
95} \
96
Daniel Veillarddbfd6411999-12-28 16:35:14 +000097PUSH_AND_POP(extern, xmlNodePtr, node)
98PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000099
/*
 * Macros for accessing the content.  They are meant for the parser only and
 * must not be exported.  All of them expect a variable named `ctxt' to be in
 * scope at the point of use.
 *
 * Byte-level macros -- they look at raw bytes of the input buffer and should
 * only be used to compare against ASCII values:
 *
 *   CUR_PTR     the current input pointer (ctxt->input->cur).
 *   CUR         the byte under the input pointer, as an int.
 *   CURRENT     same expansion as CUR.
 *   RAW         -1 while ctxt->token is set, otherwise the current byte.
 *   NXT(n)      the byte n positions after the input pointer.
 *   UPPER       the current byte passed through toupper().
 *   UPP(n)      the byte n positions ahead, passed through toupper().
 *   SKIP(n)     adds n to both ctxt->nbChars and the input pointer.
 *
 * Buffer management:
 *
 *   SHRINK      calls xmlParserInputShrink() on the current input.
 *   GROW        calls xmlParserInputGrow() with INPUT_CHUNK.
 *
 * Character-level macros:
 *
 *   SKIP_BLANKS     calls htmlSkipBlankChars(ctxt).
 *   NEXT            calls xmlNextChar(ctxt), then increments ctxt->nbChars.
 *   NEXTL(l)        updates line/col for the current byte, clears
 *                   ctxt->token, advances the input pointer by l and
 *                   increments ctxt->nbChars.
 *   CUR_CHAR(l)     calls htmlCurrentChar(ctxt, &l).
 *   CUR_SCHAR(s,l)  calls xmlStringCurrentChar(ctxt, s, &l).
 *   COPY_BUF(l,b,i,v)  stores v at b[i] as a single xmlChar when l is 1,
 *                   otherwise appends it with xmlCopyChar(); i is advanced.
 *
 * NOTE(review): SKIP_BLANKS, NEXT, CUR_CHAR and CUR_SCHAR carry their own
 * trailing `;', and NEXT, NEXTL and COPY_BUF expand to more than one
 * statement.  Used as the unbraced body of an if/while, only the first
 * statement is governed by the condition.  The expansions are kept as they
 * are because call sites depend on them.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt);

#if 0
#define CUR ((int) (*ctxt->input->cur))
#define NEXT htmlNextChar(ctxt);
#else
/* Imported from the XML parser */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt);ctxt->nbChars++;

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;

/************
    Disabled in NEXTL:
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l);
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l);

#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v);
#endif
180
181/**
182 * htmlCurrentChar:
183 * @ctxt: the HTML parser context
184 * @len: pointer to the length of the char read
185 *
186 * The current char value, if using UTF-8 this may actaully span multiple
187 * bytes in the input buffer. Implement the end of line normalization:
188 * 2.11 End-of-Line Handling
189 * If the encoding is unspecified, in the case we find an ISO-Latin-1
190 * char, then the encoding converter is plugged in automatically.
191 *
192 * Returns the current char value and its lenght
193 */
194
195int
196htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
197 if (ctxt->instate == XML_PARSER_EOF)
198 return(0);
199
200 if (ctxt->token != 0) {
201 *len = 0;
202 return(ctxt->token);
203 }
204 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
205 /*
206 * We are supposed to handle UTF8, check it's valid
207 * From rfc2044: encoding of the Unicode values on UTF-8:
208 *
209 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
210 * 0000 0000-0000 007F 0xxxxxxx
211 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
212 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
213 *
214 * Check for the 0x110000 limit too
215 */
216 const unsigned char *cur = ctxt->input->cur;
217 unsigned char c;
218 unsigned int val;
219
220 c = *cur;
221 if (c & 0x80) {
222 if (cur[1] == 0)
223 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
224 if ((cur[1] & 0xc0) != 0x80)
225 goto encoding_error;
226 if ((c & 0xe0) == 0xe0) {
227
228 if (cur[2] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if ((cur[2] & 0xc0) != 0x80)
231 goto encoding_error;
232 if ((c & 0xf0) == 0xf0) {
233 if (cur[3] == 0)
234 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
235 if (((c & 0xf8) != 0xf0) ||
236 ((cur[3] & 0xc0) != 0x80))
237 goto encoding_error;
238 /* 4-byte code */
239 *len = 4;
240 val = (cur[0] & 0x7) << 18;
241 val |= (cur[1] & 0x3f) << 12;
242 val |= (cur[2] & 0x3f) << 6;
243 val |= cur[3] & 0x3f;
244 } else {
245 /* 3-byte code */
246 *len = 3;
247 val = (cur[0] & 0xf) << 12;
248 val |= (cur[1] & 0x3f) << 6;
249 val |= cur[2] & 0x3f;
250 }
251 } else {
252 /* 2-byte code */
253 *len = 2;
254 val = (cur[0] & 0x1f) << 6;
255 val |= cur[1] & 0x3f;
256 }
257 if (!IS_CHAR(val)) {
258 if ((ctxt->sax != NULL) &&
259 (ctxt->sax->error != NULL))
260 ctxt->sax->error(ctxt->userData,
261 "Char 0x%X out of allowed range\n", val);
262 ctxt->errNo = XML_ERR_INVALID_ENCODING;
263 ctxt->wellFormed = 0;
264 ctxt->disableSAX = 1;
265 }
266 return(val);
267 } else {
268 /* 1-byte code */
269 *len = 1;
270 return((int) *ctxt->input->cur);
271 }
272 }
273 /*
274 * Assume it's a fixed lenght encoding (1) with
275 * a compatibke encoding for the ASCII set, since
276 * XML constructs only use < 128 chars
277 */
278 *len = 1;
279 if ((int) *ctxt->input->cur < 0x80)
280 return((int) *ctxt->input->cur);
281
282 /*
283 * Humm this is bad, do an automatic flow conversion
284 */
285 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
286 ctxt->charset = XML_CHAR_ENCODING_UTF8;
287 return(xmlCurrentChar(ctxt, len));
288
289encoding_error:
290 /*
291 * If we detect an UTF8 error that probably mean that the
292 * input encoding didn't get properly advertized in the
293 * declaration header. Report the error and switch the encoding
294 * to ISO-Latin-1 (if you don't like this policy, just declare the
295 * encoding !)
296 */
297 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
298 ctxt->sax->error(ctxt->userData,
299 "Input is not proper UTF-8, indicate encoding !\n");
300 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
301 ctxt->input->cur[0], ctxt->input->cur[1],
302 ctxt->input->cur[2], ctxt->input->cur[3]);
303 }
304 ctxt->errNo = XML_ERR_INVALID_ENCODING;
305
306 ctxt->charset = XML_CHAR_ENCODING_8859_1;
307 *len = 1;
308 return((int) *ctxt->input->cur);
309}
310
Daniel Veillardcf461992000-03-14 18:30:20 +0000311/**
312 * htmlNextChar:
313 * @ctxt: the HTML parser context
314 *
315 * Skip to the next char input char.
316 */
317
318void
319htmlNextChar(htmlParserCtxtPtr ctxt) {
Daniel Veillard3f6f7f62000-06-30 17:58:25 +0000320 if (ctxt->instate == XML_PARSER_EOF)
321 return;
Daniel Veillardcf461992000-03-14 18:30:20 +0000322 if ((*ctxt->input->cur == 0) &&
323 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
324 xmlPopInput(ctxt);
325 } else {
326 if (*(ctxt->input->cur) == '\n') {
327 ctxt->input->line++; ctxt->input->col = 1;
328 } else ctxt->input->col++;
329 ctxt->input->cur++;
330 ctxt->nbChars++;
331 if (*ctxt->input->cur == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 }
334}
335
336/**
337 * htmlSkipBlankChars:
338 * @ctxt: the HTML parser context
339 *
340 * skip all blanks character found at that point in the input streams.
341 *
342 * Returns the number of space chars skipped
343 */
344
345int
346htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
347 int res = 0;
348
349 while (IS_BLANK(*(ctxt->input->cur))) {
350 if ((*ctxt->input->cur == 0) &&
351 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
352 xmlPopInput(ctxt);
353 } else {
354 if (*(ctxt->input->cur) == '\n') {
355 ctxt->input->line++; ctxt->input->col = 1;
356 } else ctxt->input->col++;
357 ctxt->input->cur++;
358 ctxt->nbChars++;
359 if (*ctxt->input->cur == 0)
360 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
361 }
362 res++;
363 }
364 return(res);
365}
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000366
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000367
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000368
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000369/************************************************************************
370 * *
371 * The list of HTML elements and their properties *
372 * *
373 ************************************************************************/
374
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000375/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000376 * Start Tag: 1 means the start tag can be ommited
377 * End Tag: 1 means the end tag can be ommited
378 * 2 means it's forbidden (empty elements)
379 * Depr: this element is deprecated
380 * DTD: 1 means that this element is valid only in the Loose DTD
381 * 2 means that this element is valid only in the Frameset DTD
382 *
383 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000384 */
385htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000386{ "a", 0, 0, 0, 0, 0, "anchor " },
387{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
388{ "acronym", 0, 0, 0, 0, 0, "" },
389{ "address", 0, 0, 0, 0, 0, "information on author " },
390{ "applet", 0, 0, 0, 1, 1, "java applet " },
391{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
392{ "b", 0, 0, 0, 0, 0, "bold text style" },
393{ "base", 0, 2, 1, 0, 0, "document base uri " },
394{ "basefont", 0, 2, 1, 1, 1, "base font size " },
395{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
396{ "big", 0, 0, 0, 0, 0, "large text style" },
397{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
398{ "body", 1, 1, 0, 0, 0, "document body " },
399{ "br", 0, 2, 1, 0, 0, "forced line break " },
400{ "button", 0, 0, 0, 0, 0, "push button " },
401{ "caption", 0, 0, 0, 0, 0, "table caption " },
402{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
403{ "cite", 0, 0, 0, 0, 0, "citation" },
404{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
405{ "col", 0, 2, 1, 0, 0, "table column " },
406{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
407{ "dd", 0, 1, 0, 0, 0, "definition description " },
408{ "del", 0, 0, 0, 0, 0, "deleted text " },
409{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
410{ "dir", 0, 0, 0, 1, 1, "directory list" },
411{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
412{ "dl", 0, 0, 0, 0, 0, "definition list " },
413{ "dt", 0, 1, 0, 0, 0, "definition term " },
414{ "em", 0, 0, 0, 0, 0, "emphasis" },
415{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
416{ "font", 0, 0, 0, 1, 1, "local change to font " },
417{ "form", 0, 0, 0, 0, 0, "interactive form " },
418{ "frame", 0, 2, 1, 0, 2, "subwindow " },
419{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
420{ "h1", 0, 0, 0, 0, 0, "heading " },
421{ "h2", 0, 0, 0, 0, 0, "heading " },
422{ "h3", 0, 0, 0, 0, 0, "heading " },
423{ "h4", 0, 0, 0, 0, 0, "heading " },
424{ "h5", 0, 0, 0, 0, 0, "heading " },
425{ "h6", 0, 0, 0, 0, 0, "heading " },
426{ "head", 1, 1, 0, 0, 0, "document head " },
427{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
428{ "html", 1, 1, 0, 0, 0, "document root element " },
429{ "i", 0, 0, 0, 0, 0, "italic text style" },
430{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
431{ "img", 0, 2, 1, 0, 0, "embedded image " },
432{ "input", 0, 2, 1, 0, 0, "form control " },
433{ "ins", 0, 0, 0, 0, 0, "inserted text" },
434{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
435{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
436{ "label", 0, 0, 0, 0, 0, "form field label text " },
437{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
438{ "li", 0, 1, 0, 0, 0, "list item " },
439{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
440{ "map", 0, 0, 0, 0, 0, "client-side image map " },
441{ "menu", 0, 0, 0, 1, 1, "menu list " },
442{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
443{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
444{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
445{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
446{ "ol", 0, 0, 0, 0, 0, "ordered list " },
447{ "optgroup", 0, 0, 0, 0, 0, "option group " },
448{ "option", 0, 1, 0, 0, 0, "selectable choice " },
449{ "p", 0, 1, 0, 0, 0, "paragraph " },
450{ "param", 0, 2, 1, 0, 0, "named property value " },
451{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
452{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
453{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
454{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
455{ "script", 0, 0, 0, 0, 0, "script statements " },
456{ "select", 0, 0, 0, 0, 0, "option selector " },
457{ "small", 0, 0, 0, 0, 0, "small text style" },
458{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
459{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
460{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
461{ "style", 0, 0, 0, 0, 0, "style info " },
462{ "sub", 0, 0, 0, 0, 0, "subscript" },
463{ "sup", 0, 0, 0, 0, 0, "superscript " },
464{ "table", 0, 0, 0, 0, 0, "&#160;" },
465{ "tbody", 1, 1, 0, 0, 0, "table body " },
466{ "td", 0, 1, 0, 0, 0, "table data cell" },
467{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
468{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
469{ "th", 0, 1, 0, 0, 0, "table header cell" },
470{ "thead", 0, 1, 0, 0, 0, "table header " },
471{ "title", 0, 0, 0, 0, 0, "document title " },
472{ "tr", 0, 1, 0, 0, 0, "table row " },
473{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
474{ "u", 0, 0, 0, 1, 1, "underlined text style" },
475{ "ul", 0, 0, 0, 0, 0, "unordered list " },
476{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000477};
478
/*
 * Start tags that imply the end of a current element.
 * The table is a sequence of NULL-terminated groups, closed by an extra
 * NULL: any tag of a group implies the end of the current element if the
 * type of that element belongs to the same group.
 *
 * According to the HTML DTD, HR should be added to the heading group, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
char *htmlEquEnd[] = {
    /* list-like items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of table */
    NULL
};
495
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups, closed by an extra
 * NULL.  In each group the first string is the name of the new start tag,
 * the following ones are the names of the open elements it closes.
 * htmlInitAutoClose() records the start of every group in an index that
 * holds at most 99 groups.
 */
char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", NULL,
    "td",         "th", "td", "p", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
554
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = { "html", "head", "body", NULL };
568
Daniel Veillardbe803962000-06-28 23:40:59 +0000569
Daniel Veillardb96e6431999-08-29 21:02:19 +0000570static char** htmlStartCloseIndex[100];
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000571static int htmlStartCloseIndexinitialized = 0;
572
573/************************************************************************
574 * *
575 * functions to handle HTML specific data *
576 * *
577 ************************************************************************/
578
579/**
580 * htmlInitAutoClose:
581 *
582 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
583 *
584 */
585void
586htmlInitAutoClose(void) {
587 int index, i = 0;
588
589 if (htmlStartCloseIndexinitialized) return;
590
591 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
592 index = 0;
593 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
594 htmlStartCloseIndex[index++] = &htmlStartClose[i];
595 while (htmlStartClose[i] != NULL) i++;
596 i++;
597 }
598}
599
600/**
601 * htmlTagLookup:
602 * @tag: The tag name
603 *
604 * Lookup the HTML tag in the ElementTable
605 *
606 * Returns the related htmlElemDescPtr or NULL if not found.
607 */
608htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000609htmlTagLookup(const xmlChar *tag) {
Daniel Veillard47f3f312000-08-27 22:40:15 +0000610 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000611
612 for (i = 0; i < (sizeof(html40ElementTable) /
613 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000614 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000615 return(&html40ElementTable[i]);
616 }
617 return(NULL);
618}
619
620/**
621 * htmlCheckAutoClose:
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000622 * @newtag: The new tag name
623 * @oldtag: The old tag name
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000624 *
625 * Checks wether the new tag is one of the registered valid tags for closing old.
626 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
627 *
628 * Returns 0 if no, 1 if yes.
629 */
630int
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000631htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000632 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000633 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000634
635 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
636
637 /* inefficient, but not a big deal */
638 for (index = 0; index < 100;index++) {
639 close = htmlStartCloseIndex[index];
640 if (close == NULL) return(0);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000641 if (!xmlStrcmp(BAD_CAST *close, newtag)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000642 }
643
644 i = close - htmlStartClose;
645 i++;
646 while (htmlStartClose[i] != NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000647 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000648 return(1);
649 }
650 i++;
651 }
652 return(0);
653}
654
655/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000656 * htmlAutoCloseOnClose:
657 * @ctxt: an HTML parser context
658 * @newtag: The new tag name
659 *
660 * The HTmL DtD allows an ending tag to implicitely close other tags.
661 */
662void
663htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
664 htmlElemDescPtr info;
665 xmlChar *oldname;
666 int i;
667
668#ifdef DEBUG
669 fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
670 for (i = 0;i < ctxt->nameNr;i++)
671 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
672#endif
673
674 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
675 if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break;
676 }
677 if (i < 0) return;
678
679 while (xmlStrcmp(newtag, ctxt->name)) {
680 info = htmlTagLookup(ctxt->name);
681 if ((info == NULL) || (info->endTag == 1)) {
682#ifdef DEBUG
683 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
684#endif
685 } else {
686 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
687 ctxt->sax->error(ctxt->userData,
688 "Opening and ending tag mismatch: %s and %s\n",
689 newtag, ctxt->name);
690 ctxt->wellFormed = 0;
691 }
692 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
693 ctxt->sax->endElement(ctxt->userData, ctxt->name);
694 oldname = htmlnamePop(ctxt);
695 if (oldname != NULL) {
696#ifdef DEBUG
697 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
698#endif
699 xmlFree(oldname);
700 }
701 }
702}
703
704/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000705 * htmlAutoClose:
706 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000707 * @newtag: The new tag name or NULL
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000708 *
709 * The HTmL DtD allows a tag to implicitely close other tags.
710 * The list is kept in htmlStartClose array. This function is
711 * called when a new tag has been detected and generates the
712 * appropriates closes if possible/needed.
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000713 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillard365e13b2000-07-02 07:56:37 +0000714 * and we should check
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000715 */
716void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000717htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000718 xmlChar *oldname;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000719 while ((newtag != NULL) && (ctxt->name != NULL) &&
720 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000721#ifdef DEBUG
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000722 fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000723#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000724 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000725 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000726 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000727 if (oldname != NULL) {
728#ifdef DEBUG
729 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
730#endif
731 xmlFree(oldname);
732 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000733 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000734 if (newtag == NULL) {
735 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
736 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
737 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
738 }
739 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard365e13b2000-07-02 07:56:37 +0000740 ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
741 (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
742 (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) {
743#ifdef DEBUG
744 fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
745#endif
746 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
747 ctxt->sax->endElement(ctxt->userData, ctxt->name);
748 oldname = htmlnamePop(ctxt);
749 if (oldname != NULL) {
750#ifdef DEBUG
751 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
752#endif
753 xmlFree(oldname);
754 }
755 }
756
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000757}
758
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000759/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000760 * htmlAutoCloseTag:
761 * @doc: the HTML document
762 * @name: The tag name
763 * @elem: the HTML element
764 *
765 * The HTmL DtD allows a tag to implicitely close other tags.
766 * The list is kept in htmlStartClose array. This function checks
767 * if the element or one of it's children would autoclose the
768 * given tag.
769 *
770 * Returns 1 if autoclose, 0 otherwise
771 */
772int
773htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
774 htmlNodePtr child;
775
776 if (elem == NULL) return(1);
777 if (!xmlStrcmp(name, elem->name)) return(0);
778 if (htmlCheckAutoClose(elem->name, name)) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000779 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000780 while (child != NULL) {
781 if (htmlAutoCloseTag(doc, name, child)) return(1);
782 child = child->next;
783 }
784 return(0);
785}
786
787/**
788 * htmlIsAutoClosed:
789 * @doc: the HTML document
790 * @elem: the HTML element
791 *
792 * The HTmL DtD allows a tag to implicitely close other tags.
793 * The list is kept in htmlStartClose array. This function checks
794 * if a tag is autoclosed by one of it's child
795 *
796 * Returns 1 if autoclosed, 0 otherwise
797 */
798int
799htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
800 htmlNodePtr child;
801
802 if (elem == NULL) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000803 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000804 while (child != NULL) {
805 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
806 child = child->next;
807 }
808 return(0);
809}
810
811/**
Daniel Veillardbe803962000-06-28 23:40:59 +0000812 * htmlCheckImplied:
813 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000814 * @newtag: The new tag name
Daniel Veillardbe803962000-06-28 23:40:59 +0000815 *
816 * The HTmL DtD allows a tag to exists only implicitely
817 * called when a new tag has been detected and generates the
818 * appropriates implicit tags if missing
819 */
820void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000821htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
822 if (!xmlStrcmp(newtag, BAD_CAST"html"))
Daniel Veillardbe803962000-06-28 23:40:59 +0000823 return;
824 if (ctxt->nameNr <= 0) {
825#ifdef DEBUG
826 fprintf(stderr,"Implied element html: pushed html\n");
827#endif
828 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
829 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
830 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
831 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000832 if ((!xmlStrcmp(newtag, BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head")))
Daniel Veillardbe803962000-06-28 23:40:59 +0000833 return;
834 if (ctxt->nameNr <= 1) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000835 if ((!xmlStrcmp(newtag, BAD_CAST"script")) ||
836 (!xmlStrcmp(newtag, BAD_CAST"style")) ||
837 (!xmlStrcmp(newtag, BAD_CAST"meta")) ||
838 (!xmlStrcmp(newtag, BAD_CAST"link")) ||
839 (!xmlStrcmp(newtag, BAD_CAST"title")) ||
840 (!xmlStrcmp(newtag, BAD_CAST"base"))) {
Daniel Veillardbe803962000-06-28 23:40:59 +0000841 /*
842 * dropped OBJECT ... i you put it first BODY will be
843 * assumed !
844 */
845#ifdef DEBUG
846 fprintf(stderr,"Implied element head: pushed head\n");
847#endif
848 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
849 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
850 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
851 } else {
852#ifdef DEBUG
853 fprintf(stderr,"Implied element body: pushed body\n");
854#endif
855 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
856 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
857 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
858 }
859 }
860}
861
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000862/**
863 * htmlCheckParagraph
864 * @ctxt: an HTML parser context
865 *
866 * Check whether a p element need to be implied before inserting
867 * characters in the current element.
868 *
869 * Returns 1 if a paragraph has been inserted, 0 if not and -1
870 * in case of error.
871 */
872
873int
874htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
875 const xmlChar *tag;
876 int i;
877
878 if (ctxt == NULL)
879 return(-1);
880 tag = ctxt->name;
881 if (tag == NULL) {
882 htmlAutoClose(ctxt, BAD_CAST"p");
883 htmlCheckImplied(ctxt, BAD_CAST"p");
884 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
885 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
886 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
887 return(1);
888 }
889 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
890 if (!xmlStrcmp(tag, BAD_CAST htmlNoContentElements[i])) {
891#ifdef DEBUG
892 fprintf(stderr,"Implied element paragraph\n");
893#endif
894 htmlAutoClose(ctxt, BAD_CAST"p");
895 htmlCheckImplied(ctxt, BAD_CAST"p");
896 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
897 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
898 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
899 return(1);
900 }
901 }
902 return(0);
903}
904
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000905/************************************************************************
906 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000907 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000908 * *
909 ************************************************************************/
910
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000911
912htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000913/*
Daniel Veillard47f3f312000-08-27 22:40:15 +0000914 * the 4 absolute ones, plus apostrophe.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000915 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000916{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
917{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard47f3f312000-08-27 22:40:15 +0000918{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000919{ 60, "lt", "less-than sign, U+003C ISOnum" },
920{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000921
922/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000923 * A bunch still in the 128-255 range
924 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000925 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000926{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
927{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
928{ 162, "cent", "cent sign, U+00A2 ISOnum" },
929{ 163, "pound","pound sign, U+00A3 ISOnum" },
930{ 164, "curren","currency sign, U+00A4 ISOnum" },
931{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
932{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
933{ 167, "sect", "section sign, U+00A7 ISOnum" },
934{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
935{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
936{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
937{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
938{ 172, "not", "not sign, U+00AC ISOnum" },
939{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
940{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
941{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
942{ 176, "deg", "degree sign, U+00B0 ISOnum" },
943{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
944{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
945{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
946{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
947{ 181, "micro","micro sign, U+00B5 ISOnum" },
948{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000949{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000950{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
951{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
952{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000953{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000954{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
955{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
956{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
957{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
958{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
959{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
960{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
961{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
962{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
963{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
964{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
965{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
966{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
967{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
968{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
969{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
970{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
971{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
972{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
973{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
974{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
975{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
976{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
977{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
978{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
979{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
980{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
981{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000982{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000983{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
984{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
985{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
986{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
987{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
988{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
989{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
990{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
991{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
992{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
993{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
994{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
995{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
996{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
997{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
998{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
999{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1000{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1001{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1002{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1003{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1004{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1005{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1006{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1007{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1008{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1009{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1010{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1011{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1012{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1013{ 247, "divide","division sign, U+00F7 ISOnum" },
1014{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1015{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1016{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1017{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1018{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1019{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1020{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1021{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001022
Daniel Veillard47f3f312000-08-27 22:40:15 +00001023{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1024{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1025{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1026{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1027{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1028
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001029/*
1030 * Anything below should really be kept as entities references
1031 */
1032{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001033
Daniel Veillard47f3f312000-08-27 22:40:15 +00001034{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1035{ 732, "tilde","small tilde, U+02DC ISOdia" },
1036
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001037{ 913, "Alpha","greek capital letter alpha, U+0391" },
1038{ 914, "Beta", "greek capital letter beta, U+0392" },
1039{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1040{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1041{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1042{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1043{ 919, "Eta", "greek capital letter eta, U+0397" },
1044{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1045{ 921, "Iota", "greek capital letter iota, U+0399" },
1046{ 922, "Kappa","greek capital letter kappa, U+039A" },
1047{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1048{ 924, "Mu", "greek capital letter mu, U+039C" },
1049{ 925, "Nu", "greek capital letter nu, U+039D" },
1050{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1051{ 927, "Omicron","greek capital letter omicron, U+039F" },
1052{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1053{ 929, "Rho", "greek capital letter rho, U+03A1" },
1054{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1055{ 932, "Tau", "greek capital letter tau, U+03A4" },
1056{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1057{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1058{ 935, "Chi", "greek capital letter chi, U+03A7" },
1059{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1060{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001061
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001062{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1063{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1064{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1065{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1066{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1067{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1068{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1069{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1070{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1071{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1072{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1073{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1074{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1075{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1076{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1077{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1078{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1079{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1080{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1081{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1082{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1083{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1084{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1085{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1086{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1087{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1088{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1089{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001090
Daniel Veillard47f3f312000-08-27 22:40:15 +00001091{ 8194, "ensp", "en space, U+2002 ISOpub" },
1092{ 8195, "emsp", "em space, U+2003 ISOpub" },
1093{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1094{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1095{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1096{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1097{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1098{ 8211, "ndash","en dash, U+2013 ISOpub" },
1099{ 8212, "mdash","em dash, U+2014 ISOpub" },
1100{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1101{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1102{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1103{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1104{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1105{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1106{ 8224, "dagger","dagger, U+2020 ISOpub" },
1107{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1108
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001109{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1110{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001111
1112{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1113
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001114{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1115{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001116
1117{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1118{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1119
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001120{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1121{ 8260, "frasl","fraction slash, U+2044 NEW" },
1122
Daniel Veillard47f3f312000-08-27 22:40:15 +00001123{ 8364, "euro", "euro sign, U+20AC NEW" },
1124
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001125{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001126{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001127{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1128{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1129{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1130{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1131{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1132{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1133{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1134{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1135{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1136{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1137{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1138{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1139{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1140{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1141
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001142{ 8704, "forall","for all, U+2200 ISOtech" },
1143{ 8706, "part", "partial differential, U+2202 ISOtech" },
1144{ 8707, "exist","there exists, U+2203 ISOtech" },
1145{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1146{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1147{ 8712, "isin", "element of, U+2208 ISOtech" },
1148{ 8713, "notin","not an element of, U+2209 ISOtech" },
1149{ 8715, "ni", "contains as member, U+220B ISOtech" },
1150{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1151{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1152{ 8722, "minus","minus sign, U+2212 ISOtech" },
1153{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1154{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1155{ 8733, "prop", "proportional to, U+221D ISOtech" },
1156{ 8734, "infin","infinity, U+221E ISOtech" },
1157{ 8736, "ang", "angle, U+2220 ISOamso" },
1158{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1159{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1160{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1161{ 8746, "cup", "union = cup, U+222A ISOtech" },
1162{ 8747, "int", "integral, U+222B ISOtech" },
1163{ 8756, "there4","therefore, U+2234 ISOtech" },
1164{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1165{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1166{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1167{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1168{ 8801, "equiv","identical to, U+2261 ISOtech" },
1169{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1170{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1171{ 8834, "sub", "subset of, U+2282 ISOtech" },
1172{ 8835, "sup", "superset of, U+2283 ISOtech" },
1173{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1174{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1175{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1176{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1177{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1178{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1179{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1180{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1181{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1182{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1183{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1184{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1185{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1186{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1187
1188{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1189{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1190{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1191{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1192
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001193};
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001194
1195/************************************************************************
1196 * *
1197 * Commodity functions to handle entities *
1198 * *
1199 ************************************************************************/
1200
/*
 * growBuffer:
 * @buffer:  name of an xmlChar * variable; a companion variable named
 *           <buffer>_size must hold its current size
 *
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * @buffer is reallocated to hold that many xmlChar.
 *
 * On allocation failure the error is reported with perror(), the old
 * block is released (xmlRealloc left it allocated, so simply
 * overwriting @buffer would leak it) and the ENCLOSING function
 * returns NULL. The macro can therefore only be used in functions
 * returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *)                                        \
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));            \
    if (growBuffer_tmp == NULL) {                                       \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        buffer = NULL;                                                  \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
1212
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001213/**
1214 * htmlEntityLookup:
1215 * @name: the entity name
1216 *
1217 * Lookup the given entity in EntitiesTable
1218 *
1219 * TODO: the linear scan is really ugly, an hash table is really needed.
1220 *
1221 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1222 */
1223htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001224htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001225 int i;
1226
1227 for (i = 0;i < (sizeof(html40EntitiesTable)/
1228 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001229 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001230#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001231 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001232#endif
1233 return(&html40EntitiesTable[i]);
1234 }
1235 }
1236 return(NULL);
1237}
1238
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001239/**
Daniel Veillard47f3f312000-08-27 22:40:15 +00001240 * htmlEntityValueLookup:
1241 * @value: the entity's unicode value
1242 *
1243 * Lookup the given entity in EntitiesTable
1244 *
1245 * TODO: the linear scan is really ugly, an hash table is really needed.
1246 *
1247 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1248 */
1249htmlEntityDescPtr
1250htmlEntityValueLookup(int value) {
1251 int i;
1252#ifdef DEBUG
1253 int lv = 0;
1254#endif
1255
1256 for (i = 0;i < (sizeof(html40EntitiesTable)/
1257 sizeof(html40EntitiesTable[0]));i++) {
1258 if (html40EntitiesTable[i].value >= value) {
1259 if (html40EntitiesTable[i].value > value)
1260 break;
1261#ifdef DEBUG
1262 fprintf(stderr,"Found entity %s\n", html40EntitiesTable[i].name);
1263#endif
1264 return(&html40EntitiesTable[i]);
1265 }
1266#ifdef DEBUG
1267 if (lv > html40EntitiesTable[i].value) {
1268 fprintf(stderr, "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1269 lv, html40EntitiesTable[i].value);
1270 }
1271 lv = html40EntitiesTable[i].value;
1272#endif
1273 }
1274 return(NULL);
1275}
1276
1277/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001278 * UTF8ToHtml:
1279 * @out: a pointer to an array of bytes to store the result
1280 * @outlen: the length of @out
1281 * @in: a pointer to an array of UTF-8 chars
1282 * @inlen: the length of @in
1283 *
1284 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1285 * plus HTML entities block of chars out.
1286 *
1287 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1288 * The value of @inlen after return is the number of octets consumed
1289 * as the return value is positive, else unpredictiable.
1290 * The value of @outlen after return is the number of octets consumed.
1291 */
1292int
1293UTF8ToHtml(unsigned char* out, int *outlen,
1294 const unsigned char* in, int *inlen) {
1295 const unsigned char* processed = in;
1296 const unsigned char* outend;
1297 const unsigned char* outstart = out;
1298 const unsigned char* instart = in;
1299 const unsigned char* inend;
1300 unsigned int c, d;
1301 int trailing;
1302
1303 if (in == NULL) {
1304 /*
1305 * initialization nothing to do
1306 */
1307 *outlen = 0;
1308 *inlen = 0;
1309 return(0);
1310 }
1311 inend = in + (*inlen);
1312 outend = out + (*outlen);
1313 while (in < inend) {
1314 d = *in++;
1315 if (d < 0x80) { c= d; trailing= 0; }
1316 else if (d < 0xC0) {
1317 /* trailing byte in leading position */
1318 *outlen = out - outstart;
1319 *inlen = processed - instart;
1320 return(-2);
1321 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1322 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1323 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1324 else {
1325 /* no chance for this in Ascii */
1326 *outlen = out - outstart;
1327 *inlen = processed - instart;
1328 return(-2);
1329 }
1330
1331 if (inend - in < trailing) {
1332 break;
1333 }
1334
1335 for ( ; trailing; trailing--) {
1336 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1337 break;
1338 c <<= 6;
1339 c |= d & 0x3F;
1340 }
1341
1342 /* assertion: c is a single UTF-4 value */
1343 if (c < 0x80) {
Daniel Veillarde010c172000-08-28 10:04:51 +00001344 if (out + 1 >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001345 break;
1346 *out++ = c;
1347 } else {
Daniel Veillard47f3f312000-08-27 22:40:15 +00001348 int len;
1349 htmlEntityDescPtr ent;
1350
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001351 /*
1352 * Try to lookup a predefined HTML entity for it
1353 */
1354
Daniel Veillard47f3f312000-08-27 22:40:15 +00001355 ent = htmlEntityValueLookup(c);
1356 if (ent == NULL) {
1357 /* no chance for this in Ascii */
1358 *outlen = out - outstart;
1359 *inlen = processed - instart;
1360 return(-2);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001361 }
Daniel Veillard47f3f312000-08-27 22:40:15 +00001362 len = strlen(ent->name);
Daniel Veillarde010c172000-08-28 10:04:51 +00001363 if (out + 2 + len >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001364 break;
1365 *out++ = '&';
Daniel Veillard47f3f312000-08-27 22:40:15 +00001366 memcpy(out, ent->name, len);
1367 out += len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001368 *out++ = ';';
1369 }
1370 processed = in;
1371 }
1372 *outlen = out - outstart;
1373 *inlen = processed - instart;
1374 return(0);
1375}
1376
Daniel Veillarde010c172000-08-28 10:04:51 +00001377/**
1378 * htmlEncodeEntities:
1379 * @out: a pointer to an array of bytes to store the result
1380 * @outlen: the length of @out
1381 * @in: a pointer to an array of UTF-8 chars
1382 * @inlen: the length of @in
1383 * @quoteChar: the quote character to escape (' or ") or zero.
1384 *
1385 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1386 * plus HTML entities block of chars out.
1387 *
1388 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1389 * The value of @inlen after return is the number of octets consumed
1390 * as the return value is positive, else unpredictiable.
1391 * The value of @outlen after return is the number of octets consumed.
1392 */
1393int
1394htmlEncodeEntities(unsigned char* out, int *outlen,
1395 const unsigned char* in, int *inlen, int quoteChar) {
1396 const unsigned char* processed = in;
1397 const unsigned char* outend = out + (*outlen);
1398 const unsigned char* outstart = out;
1399 const unsigned char* instart = in;
1400 const unsigned char* inend = in + (*inlen);
1401 unsigned int c, d;
1402 int trailing;
1403
1404 while (in < inend) {
1405 d = *in++;
1406 if (d < 0x80) { c= d; trailing= 0; }
1407 else if (d < 0xC0) {
1408 /* trailing byte in leading position */
1409 *outlen = out - outstart;
1410 *inlen = processed - instart;
1411 return(-2);
1412 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1413 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1414 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1415 else {
1416 /* no chance for this in Ascii */
1417 *outlen = out - outstart;
1418 *inlen = processed - instart;
1419 return(-2);
1420 }
1421
1422 if (inend - in < trailing)
1423 break;
1424
1425 while (trailing--) {
1426 if (((d= *in++) & 0xC0) != 0x80) {
1427 *outlen = out - outstart;
1428 *inlen = processed - instart;
1429 return(-2);
1430 }
1431 c <<= 6;
1432 c |= d & 0x3F;
1433 }
1434
1435 /* assertion: c is a single UTF-4 value */
1436 if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1437 if (out >= outend)
1438 break;
1439 *out++ = c;
1440 } else {
1441 htmlEntityDescPtr ent;
1442 const char *cp;
1443 char nbuf[16];
1444 int len;
1445
1446 /*
1447 * Try to lookup a predefined HTML entity for it
1448 */
1449 ent = htmlEntityValueLookup(c);
1450 if (ent == NULL) {
1451 sprintf(nbuf, "#%u", c);
1452 cp = nbuf;
1453 }
1454 else
1455 cp = ent->name;
1456 len = strlen(cp);
1457 if (out + 2 + len > outend)
1458 break;
1459 *out++ = '&';
1460 memcpy(out, cp, len);
1461 out += len;
1462 *out++ = ';';
1463 }
1464 processed = in;
1465 }
1466 *outlen = out - outstart;
1467 *inlen = processed - instart;
1468 return(0);
1469}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001470
1471/**
1472 * htmlDecodeEntities:
1473 * @ctxt: the parser context
1474 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001475 * @end: an end marker xmlChar, 0 if none
1476 * @end2: an end marker xmlChar, 0 if none
1477 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001478 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001479 * Subtitute the HTML entities by their value
1480 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001481 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001482 *
1483 * Returns A newly allocated string with the substitution done. The caller
1484 * must deallocate it !
1485 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001486xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001487htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001488 xmlChar end, xmlChar end2, xmlChar end3) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001489 xmlChar *name = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001490 xmlChar *buffer = NULL;
1491 unsigned int buffer_size = 0;
1492 unsigned int nbchars = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001493 htmlEntityDescPtr ent;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001494 unsigned int max = (unsigned int) len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001495 int c,l;
1496
1497 if (ctxt->depth > 40) {
1498 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1499 ctxt->sax->error(ctxt->userData,
1500 "Detected entity reference loop\n");
1501 ctxt->wellFormed = 0;
1502 ctxt->disableSAX = 1;
1503 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1504 return(NULL);
1505 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001506
1507 /*
1508 * allocate a translation buffer.
1509 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001510 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001511 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001512 if (buffer == NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001513 perror("xmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001514 return(NULL);
1515 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001516
1517 /*
1518 * Ok loop until we reach one of the ending char or a size limit.
1519 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001520 c = CUR_CHAR(l);
1521 while ((nbchars < max) && (c != end) &&
1522 (c != end2) && (c != end3)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001523
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001524 if (c == 0) break;
1525 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1526 int val = htmlParseCharRef(ctxt);
1527 COPY_BUF(0,buffer,nbchars,val);
1528 NEXTL(l);
1529 } else if ((c == '&') && (ctxt->token != '&')) {
1530 ent = htmlParseEntityRef(ctxt, &name);
1531 if (name != NULL) {
1532 if (ent != NULL) {
1533 int val = ent->value;
1534 COPY_BUF(0,buffer,nbchars,val);
1535 NEXTL(l);
1536 } else {
1537 const xmlChar *cur = name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001538
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001539 buffer[nbchars++] = '&';
1540 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1541 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001542 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001543 while (*cur != 0) {
1544 buffer[nbchars++] = *cur++;
1545 }
1546 buffer[nbchars++] = ';';
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001547 }
1548 }
1549 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001550 COPY_BUF(l,buffer,nbchars,c);
1551 NEXTL(l);
1552 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001553 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001554 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001555 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001556 c = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001557 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001558 buffer[nbchars++] = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001559 return(buffer);
1560}
1561
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001562/************************************************************************
1563 * *
1564 * Commodity functions to handle streams *
1565 * *
1566 ************************************************************************/
1567
1568/**
1569 * htmlFreeInputStream:
1570 * @input: an htmlParserInputPtr
1571 *
1572 * Free up an input stream.
1573 */
1574void
1575htmlFreeInputStream(htmlParserInputPtr input) {
1576 if (input == NULL) return;
1577
1578 if (input->filename != NULL) xmlFree((char *) input->filename);
1579 if (input->directory != NULL) xmlFree((char *) input->directory);
1580 if ((input->free != NULL) && (input->base != NULL))
1581 input->free((xmlChar *) input->base);
1582 if (input->buf != NULL)
1583 xmlFreeParserInputBuffer(input->buf);
1584 memset(input, -1, sizeof(htmlParserInput));
1585 xmlFree(input);
1586}
1587
1588/**
1589 * htmlNewInputStream:
1590 * @ctxt: an HTML parser context
1591 *
1592 * Create a new input stream structure
1593 * Returns the new input stream or NULL
1594 */
1595htmlParserInputPtr
1596htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1597 htmlParserInputPtr input;
1598
1599 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1600 if (input == NULL) {
1601 ctxt->errNo = XML_ERR_NO_MEMORY;
1602 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1603 ctxt->sax->error(ctxt->userData,
1604 "malloc: couldn't allocate a new input stream\n");
1605 ctxt->errNo = XML_ERR_NO_MEMORY;
1606 return(NULL);
1607 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001608 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001609 input->filename = NULL;
1610 input->directory = NULL;
1611 input->base = NULL;
1612 input->cur = NULL;
1613 input->buf = NULL;
1614 input->line = 1;
1615 input->col = 1;
1616 input->buf = NULL;
1617 input->free = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001618 input->version = NULL;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001619 input->consumed = 0;
1620 input->length = 0;
1621 return(input);
1622}
1623
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001624
1625/************************************************************************
1626 * *
1627 * Commodity functions, cleanup needed ? *
1628 * *
1629 ************************************************************************/
1630
1631/**
1632 * areBlanks:
1633 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001634 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001635 * @len: the size of @str
1636 *
1637 * Is this a sequence of blank chars that one can ignore ?
1638 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001639 * Returns 1 if ignorable 0 otherwise.
1640 */
1641
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001642static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001643 int i;
1644 xmlNodePtr lastChild;
1645
1646 for (i = 0;i < len;i++)
1647 if (!(IS_BLANK(str[i]))) return(0);
1648
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001649 if (CUR == 0) return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001650 if (CUR != '<') return(0);
Daniel Veillarde010c172000-08-28 10:04:51 +00001651 if (ctxt->name == NULL)
1652 return(1);
Daniel Veillard4948eb42000-08-29 09:41:15 +00001653 if (!xmlStrcmp(ctxt->name, BAD_CAST"html"))
1654 return(1);
Daniel Veillarde010c172000-08-28 10:04:51 +00001655 if (!xmlStrcmp(ctxt->name, BAD_CAST"head"))
1656 return(1);
1657 if (!xmlStrcmp(ctxt->name, BAD_CAST"body"))
1658 return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001659 if (ctxt->node == NULL) return(0);
1660 lastChild = xmlGetLastChild(ctxt->node);
1661 if (lastChild == NULL) {
1662 if (ctxt->node->content != NULL) return(0);
1663 } else if (xmlNodeIsText(lastChild))
1664 return(0);
1665 return(1);
1666}
1667
1668/**
1669 * htmlHandleEntity:
1670 * @ctxt: an HTML parser context
1671 * @entity: an XML entity pointer.
1672 *
1673 * Default handling of an HTML entity, call the parser with the
1674 * substitution string
1675 */
1676
1677void
1678htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1679 int len;
1680
1681 if (entity->content == NULL) {
1682 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1683 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1684 entity->name);
1685 ctxt->wellFormed = 0;
1686 return;
1687 }
1688 len = xmlStrlen(entity->content);
1689
1690 /*
1691 * Just handle the content as a set of chars.
1692 */
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001693 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001694 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1695 ctxt->sax->characters(ctxt->userData, entity->content, len);
1696
1697}
1698
1699/**
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001700 * htmlNewDocNoDtD:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001701 * @URI: URI for the dtd, or NULL
1702 * @ExternalID: the external ID of the DTD, or NULL
1703 *
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001704 * Returns a new document, do not intialize the DTD if not provided
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001705 */
1706htmlDocPtr
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001707htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001708 xmlDocPtr cur;
1709
1710 /*
1711 * Allocate a new document and fill the fields.
1712 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001713 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001714 if (cur == NULL) {
1715 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1716 return(NULL);
1717 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001718 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001719
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001720 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001721 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001722 cur->intSubset = NULL;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001723 if ((ExternalID != NULL) ||
1724 (URI != NULL))
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001725 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe803962000-06-28 23:40:59 +00001726 cur->doc = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001727 cur->name = NULL;
Daniel Veillardcf461992000-03-14 18:30:20 +00001728 cur->children = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001729 cur->extSubset = NULL;
1730 cur->oldNs = NULL;
1731 cur->encoding = NULL;
1732 cur->standalone = 1;
1733 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001734 cur->ids = NULL;
1735 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001736#ifndef XML_WITHOUT_CORBA
1737 cur->_private = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001738#endif
1739 return(cur);
1740}
1741
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001742/**
1743 * htmlNewDoc:
1744 * @URI: URI for the dtd, or NULL
1745 * @ExternalID: the external ID of the DTD, or NULL
1746 *
1747 * Returns a new document
1748 */
1749htmlDocPtr
1750htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1751 if ((URI == NULL) && (ExternalID == NULL))
1752 return(htmlNewDocNoDtD(
1753 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1754 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1755
1756 return(htmlNewDocNoDtD(URI, ExternalID));
1757}
1758
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001759
1760/************************************************************************
1761 * *
1762 * The parser itself *
1763 * Relates to http://www.w3.org/TR/html40 *
1764 * *
1765 ************************************************************************/
1766
1767/************************************************************************
1768 * *
1769 * The parser itself *
1770 * *
1771 ************************************************************************/
1772
1773/**
1774 * htmlParseHTMLName:
1775 * @ctxt: an HTML parser context
1776 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001777 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001778 * since HTML names are not case-sensitive.
1779 *
1780 * Returns the Tag Name parsed or NULL
1781 */
1782
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001783xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001784htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001785 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001786 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001787 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001788
1789 if (!IS_LETTER(CUR) && (CUR != '_') &&
1790 (CUR != ':')) return(NULL);
1791
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001792 while ((i < HTML_PARSER_BUFFER_SIZE) &&
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001793 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1794 (CUR == ':') || (CUR == '_'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001795 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001796 else loc[i] = CUR;
1797 i++;
1798
1799 NEXT;
1800 }
1801
1802 ret = xmlStrndup(loc, i);
1803
1804 return(ret);
1805}
1806
1807/**
1808 * htmlParseName:
1809 * @ctxt: an HTML parser context
1810 *
1811 * parse an HTML name, this routine is case sensistive.
1812 *
1813 * Returns the Name parsed or NULL
1814 */
1815
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001816xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001817htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001818 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001819 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001820
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001821 GROW;
1822 if (!IS_LETTER(CUR) && (CUR != '_')) {
1823 return(NULL);
1824 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001825
1826 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1827 (CUR == '.') || (CUR == '-') ||
1828 (CUR == '_') || (CUR == ':') ||
1829 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001830 (IS_EXTENDER(CUR))) {
1831 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001832 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001833 if (len >= HTML_MAX_NAMELEN) {
1834 fprintf(stderr,
1835 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1836 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1837 (CUR == '.') || (CUR == '-') ||
1838 (CUR == '_') || (CUR == ':') ||
1839 (IS_COMBINING(CUR)) ||
1840 (IS_EXTENDER(CUR)))
1841 NEXT;
1842 break;
1843 }
1844 }
1845 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001846}
1847
1848/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001849 * htmlParseHTMLAttribute:
1850 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001851 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001852 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001853 * parse an HTML attribute value till the stop (quote), if
1854 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001855 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001856 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001857 */
1858
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001859xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001860htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001861#if 0
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001862 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001863 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001864
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001865 GROW;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001866 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1867 if ((stop == 0) && (IS_BLANK(CUR))) break;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001868 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001869 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001870 if (len >= HTML_MAX_NAMELEN) {
1871 fprintf(stderr,
1872 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1873 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001874 (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001875 (CUR != '\'') && (CUR != '"'))
1876 NEXT;
1877 break;
1878 }
1879 }
1880 return(xmlStrndup(buf, len));
Daniel Veillard71b656e2000-01-05 14:46:17 +00001881#else
1882 xmlChar *buffer = NULL;
1883 int buffer_size = 0;
1884 xmlChar *out = NULL;
1885 xmlChar *name = NULL;
1886
1887 xmlChar *cur = NULL;
1888 htmlEntityDescPtr ent;
1889
1890 /*
1891 * allocate a translation buffer.
1892 */
1893 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1894 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1895 if (buffer == NULL) {
1896 perror("htmlParseHTMLAttribute: malloc failed");
1897 return(NULL);
1898 }
1899 out = buffer;
1900
1901 /*
1902 * Ok loop until we reach one of the ending chars
1903 */
1904 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1905 if ((stop == 0) && (IS_BLANK(CUR))) break;
1906 if (CUR == '&') {
1907 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001908 unsigned int c;
1909 int bits;
1910
1911 c = htmlParseCharRef(ctxt);
1912 if (c < 0x80)
1913 { *out++ = c; bits= -6; }
1914 else if (c < 0x800)
1915 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1916 else if (c < 0x10000)
1917 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1918 else
1919 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1920
1921 for ( ; bits >= 0; bits-= 6) {
1922 *out++ = ((c >> bits) & 0x3F) | 0x80;
1923 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001924 } else {
1925 ent = htmlParseEntityRef(ctxt, &name);
1926 if (name == NULL) {
1927 *out++ = '&';
1928 if (out - buffer > buffer_size - 100) {
1929 int index = out - buffer;
1930
1931 growBuffer(buffer);
1932 out = &buffer[index];
1933 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001934 } else if (ent == NULL) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001935 *out++ = '&';
1936 cur = name;
1937 while (*cur != 0) {
1938 if (out - buffer > buffer_size - 100) {
1939 int index = out - buffer;
1940
1941 growBuffer(buffer);
1942 out = &buffer[index];
1943 }
1944 *out++ = *cur++;
1945 }
1946 xmlFree(name);
1947 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001948 unsigned int c;
1949 int bits;
1950
Daniel Veillard71b656e2000-01-05 14:46:17 +00001951 if (out - buffer > buffer_size - 100) {
1952 int index = out - buffer;
1953
1954 growBuffer(buffer);
1955 out = &buffer[index];
1956 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001957 c = (xmlChar)ent->value;
1958 if (c < 0x80)
1959 { *out++ = c; bits= -6; }
1960 else if (c < 0x800)
1961 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1962 else if (c < 0x10000)
1963 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1964 else
1965 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1966
1967 for ( ; bits >= 0; bits-= 6) {
1968 *out++ = ((c >> bits) & 0x3F) | 0x80;
1969 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001970 xmlFree(name);
1971 }
1972 }
1973 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001974 unsigned int c;
1975 int bits;
1976
Daniel Veillard71b656e2000-01-05 14:46:17 +00001977 if (out - buffer > buffer_size - 100) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001978 int index = out - buffer;
1979
1980 growBuffer(buffer);
1981 out = &buffer[index];
1982 }
1983 c = CUR;
1984 if (c < 0x80)
1985 { *out++ = c; bits= -6; }
1986 else if (c < 0x800)
1987 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1988 else if (c < 0x10000)
1989 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1990 else
1991 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1992
1993 for ( ; bits >= 0; bits-= 6) {
1994 *out++ = ((c >> bits) & 0x3F) | 0x80;
Daniel Veillard71b656e2000-01-05 14:46:17 +00001995 }
1996 NEXT;
1997 }
1998 }
1999 *out++ = 0;
2000 return(buffer);
2001#endif
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002002}
2003
2004/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002005 * htmlParseNmtoken:
2006 * @ctxt: an HTML parser context
2007 *
2008 * parse an HTML Nmtoken.
2009 *
2010 * Returns the Nmtoken parsed or NULL
2011 */
2012
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002013xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002014htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002015 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002016 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002017
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002018 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002019 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2020 (CUR == '.') || (CUR == '-') ||
2021 (CUR == '_') || (CUR == ':') ||
2022 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002023 (IS_EXTENDER(CUR))) {
2024 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002025 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002026 if (len >= HTML_MAX_NAMELEN) {
2027 fprintf(stderr,
2028 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2029 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2030 (CUR == '.') || (CUR == '-') ||
2031 (CUR == '_') || (CUR == ':') ||
2032 (IS_COMBINING(CUR)) ||
2033 (IS_EXTENDER(CUR)))
2034 NEXT;
2035 break;
2036 }
2037 }
2038 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002039}
2040
2041/**
2042 * htmlParseEntityRef:
2043 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002044 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002045 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002046 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002047 *
2048 * [68] EntityRef ::= '&' Name ';'
2049 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002050 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2051 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002052 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002053htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002054htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2055 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002056 htmlEntityDescPtr ent = NULL;
2057 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002058
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002059 if (CUR == '&') {
2060 NEXT;
2061 name = htmlParseName(ctxt);
2062 if (name == NULL) {
2063 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2064 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2065 ctxt->wellFormed = 0;
2066 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002067 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002068 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002069 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002070
2071 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002072 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002073 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002074 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002075 if (ent != NULL) /* OK that's ugly !!! */
2076 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002077 } else {
2078 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2079 ctxt->sax->error(ctxt->userData,
2080 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00002081 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002082 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002083 }
2084 }
2085 return(ent);
2086}
2087
2088/**
2089 * htmlParseAttValue:
2090 * @ctxt: an HTML parser context
2091 *
2092 * parse a value for an attribute
2093 * Note: the parser won't do substitution of entities here, this
2094 * will be handled later in xmlStringGetNodeList, unless it was
2095 * asked for ctxt->replaceEntities != 0
2096 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002097 * Returns the AttValue parsed or NULL.
2098 */
2099
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002100xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002101htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002102 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002103
2104 if (CUR == '"') {
2105 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002106 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002107 if (CUR != '"') {
2108 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2109 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2110 ctxt->wellFormed = 0;
2111 } else
2112 NEXT;
2113 } else if (CUR == '\'') {
2114 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002115 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002116 if (CUR != '\'') {
2117 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2118 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2119 ctxt->wellFormed = 0;
2120 } else
2121 NEXT;
2122 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002123 /*
2124 * That's an HTMLism, the attribute value may not be quoted
2125 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002126 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002127 if (ret == NULL) {
2128 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2129 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2130 ctxt->wellFormed = 0;
2131 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002132 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002133 return(ret);
2134}
2135
2136/**
2137 * htmlParseSystemLiteral:
2138 * @ctxt: an HTML parser context
2139 *
2140 * parse an HTML Literal
2141 *
2142 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2143 *
2144 * Returns the SystemLiteral parsed or NULL
2145 */
2146
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002147xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002148htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002149 const xmlChar *q;
2150 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002151
2152 if (CUR == '"') {
2153 NEXT;
2154 q = CUR_PTR;
2155 while ((IS_CHAR(CUR)) && (CUR != '"'))
2156 NEXT;
2157 if (!IS_CHAR(CUR)) {
2158 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2159 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2160 ctxt->wellFormed = 0;
2161 } else {
2162 ret = xmlStrndup(q, CUR_PTR - q);
2163 NEXT;
2164 }
2165 } else if (CUR == '\'') {
2166 NEXT;
2167 q = CUR_PTR;
2168 while ((IS_CHAR(CUR)) && (CUR != '\''))
2169 NEXT;
2170 if (!IS_CHAR(CUR)) {
2171 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2172 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2173 ctxt->wellFormed = 0;
2174 } else {
2175 ret = xmlStrndup(q, CUR_PTR - q);
2176 NEXT;
2177 }
2178 } else {
2179 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardcf461992000-03-14 18:30:20 +00002180 ctxt->sax->error(ctxt->userData,
2181 "SystemLiteral \" or ' expected\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002182 ctxt->wellFormed = 0;
2183 }
2184
2185 return(ret);
2186}
2187
2188/**
2189 * htmlParsePubidLiteral:
2190 * @ctxt: an HTML parser context
2191 *
2192 * parse an HTML public literal
2193 *
2194 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2195 *
2196 * Returns the PubidLiteral parsed or NULL.
2197 */
2198
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002199xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002200htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002201 const xmlChar *q;
2202 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002203 /*
2204 * Name ::= (Letter | '_') (NameChar)*
2205 */
2206 if (CUR == '"') {
2207 NEXT;
2208 q = CUR_PTR;
2209 while (IS_PUBIDCHAR(CUR)) NEXT;
2210 if (CUR != '"') {
2211 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2212 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2213 ctxt->wellFormed = 0;
2214 } else {
2215 ret = xmlStrndup(q, CUR_PTR - q);
2216 NEXT;
2217 }
2218 } else if (CUR == '\'') {
2219 NEXT;
2220 q = CUR_PTR;
2221 while ((IS_LETTER(CUR)) && (CUR != '\''))
2222 NEXT;
2223 if (!IS_LETTER(CUR)) {
2224 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2225 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2226 ctxt->wellFormed = 0;
2227 } else {
2228 ret = xmlStrndup(q, CUR_PTR - q);
2229 NEXT;
2230 }
2231 } else {
2232 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2233 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2234 ctxt->wellFormed = 0;
2235 }
2236
2237 return(ret);
2238}
2239
2240/**
2241 * htmlParseCharData:
2242 * @ctxt: an HTML parser context
2243 * @cdata: int indicating whether we are within a CDATA section
2244 *
2245 * parse a CharData section.
2246 * if we are within a CDATA section ']]>' marks an end of section.
2247 *
2248 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2249 */
2250
2251void
2252htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002253 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2254 int nbchar = 0;
2255 int cur, l;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002256
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002257 SHRINK;
2258 cur = CUR_CHAR(l);
2259 while (((cur != '<') || (ctxt->token == '<')) &&
2260 ((cur != '&') || (ctxt->token == '&')) &&
2261 (IS_CHAR(cur))) {
2262 COPY_BUF(l,buf,nbchar,cur);
2263 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2264 /*
2265 * Ok the segment is to be consumed as chars.
2266 */
2267 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2268 if (areBlanks(ctxt, buf, nbchar)) {
2269 if (ctxt->sax->ignorableWhitespace != NULL)
2270 ctxt->sax->ignorableWhitespace(ctxt->userData,
2271 buf, nbchar);
2272 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002273 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002274 if (ctxt->sax->characters != NULL)
2275 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2276 }
2277 }
2278 nbchar = 0;
2279 }
2280 NEXTL(l);
2281 cur = CUR_CHAR(l);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002282 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002283 if (nbchar != 0) {
2284 /*
2285 * Ok the segment is to be consumed as chars.
2286 */
2287 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2288 if (areBlanks(ctxt, buf, nbchar)) {
2289 if (ctxt->sax->ignorableWhitespace != NULL)
2290 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2291 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002292 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002293 if (ctxt->sax->characters != NULL)
2294 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002295 }
2296 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002297 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002298}
2299
2300/**
2301 * htmlParseExternalID:
2302 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002303 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002304 * @strict: indicate whether we should restrict parsing to only
2305 * production [75], see NOTE below
2306 *
2307 * Parse an External ID or a Public ID
2308 *
2309 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2310 * 'PUBLIC' S PubidLiteral S SystemLiteral
2311 *
2312 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2313 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2314 *
2315 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2316 *
2317 * Returns the function returns SystemLiteral and in the second
2318 * case publicID receives PubidLiteral, is strict is off
2319 * it is possible to return NULL and have publicID set.
2320 */
2321
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002322xmlChar *
2323htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2324 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002325
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002326 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2327 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2328 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002329 SKIP(6);
2330 if (!IS_BLANK(CUR)) {
2331 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2332 ctxt->sax->error(ctxt->userData,
2333 "Space required after 'SYSTEM'\n");
2334 ctxt->wellFormed = 0;
2335 }
2336 SKIP_BLANKS;
2337 URI = htmlParseSystemLiteral(ctxt);
2338 if (URI == NULL) {
2339 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2340 ctxt->sax->error(ctxt->userData,
2341 "htmlParseExternalID: SYSTEM, no URI\n");
2342 ctxt->wellFormed = 0;
2343 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002344 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2345 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2346 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002347 SKIP(6);
2348 if (!IS_BLANK(CUR)) {
2349 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2350 ctxt->sax->error(ctxt->userData,
2351 "Space required after 'PUBLIC'\n");
2352 ctxt->wellFormed = 0;
2353 }
2354 SKIP_BLANKS;
2355 *publicID = htmlParsePubidLiteral(ctxt);
2356 if (*publicID == NULL) {
2357 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2358 ctxt->sax->error(ctxt->userData,
2359 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2360 ctxt->wellFormed = 0;
2361 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002362 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002363 if ((CUR == '"') || (CUR == '\'')) {
2364 URI = htmlParseSystemLiteral(ctxt);
2365 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002366 }
2367 return(URI);
2368}
2369
2370/**
2371 * htmlParseComment:
2372 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002373 *
2374 * Parse an XML (SGML) comment <!-- .... -->
2375 *
2376 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2377 */
2378void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002379htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002380 xmlChar *buf = NULL;
Daniel Veillard87b95392000-08-12 21:12:04 +00002381 int len;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002382 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard87b95392000-08-12 21:12:04 +00002383 int q, ql;
2384 int r, rl;
2385 int cur, l;
2386 xmlParserInputState state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002387
2388 /*
2389 * Check that there is a comment right here.
2390 */
Daniel Veillard87b95392000-08-12 21:12:04 +00002391 if ((RAW != '<') || (NXT(1) != '!') ||
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002392 (NXT(2) != '-') || (NXT(3) != '-')) return;
2393
Daniel Veillard87b95392000-08-12 21:12:04 +00002394 state = ctxt->instate;
2395 ctxt->instate = XML_PARSER_COMMENT;
2396 SHRINK;
2397 SKIP(4);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002398 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2399 if (buf == NULL) {
2400 fprintf(stderr, "malloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002401 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002402 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002403 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002404 q = CUR_CHAR(ql);
2405 NEXTL(ql);
2406 r = CUR_CHAR(rl);
2407 NEXTL(rl);
2408 cur = CUR_CHAR(l);
2409 len = 0;
2410 while (IS_CHAR(cur) &&
2411 ((cur != '>') ||
2412 (r != '-') || (q != '-'))) {
2413 if (len + 5 >= size) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002414 size *= 2;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002415 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002416 if (buf == NULL) {
2417 fprintf(stderr, "realloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002418 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002419 return;
2420 }
2421 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002422 COPY_BUF(ql,buf,len,q);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002423 q = r;
Daniel Veillard87b95392000-08-12 21:12:04 +00002424 ql = rl;
2425 r = cur;
2426 rl = l;
2427 NEXTL(l);
2428 cur = CUR_CHAR(l);
2429 if (cur == 0) {
2430 SHRINK;
2431 GROW;
2432 cur = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002433 }
2434 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002435 buf[len] = 0;
2436 if (!IS_CHAR(cur)) {
2437 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2438 ctxt->sax->error(ctxt->userData,
2439 "Comment not terminated \n<!--%.50s\n", buf);
2440 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2441 ctxt->wellFormed = 0;
2442 xmlFree(buf);
2443 } else {
2444 NEXT;
2445 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2446 (!ctxt->disableSAX))
2447 ctxt->sax->comment(ctxt->userData, buf);
2448 xmlFree(buf);
2449 }
2450 ctxt->instate = state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002451}
2452
2453/**
2454 * htmlParseCharRef:
2455 * @ctxt: an HTML parser context
2456 *
2457 * parse Reference declarations
2458 *
2459 * [66] CharRef ::= '&#' [0-9]+ ';' |
2460 * '&#x' [0-9a-fA-F]+ ';'
2461 *
2462 * Returns the value parsed (as an int)
2463 */
2464int
2465htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2466 int val = 0;
2467
2468 if ((CUR == '&') && (NXT(1) == '#') &&
2469 (NXT(2) == 'x')) {
2470 SKIP(3);
2471 while (CUR != ';') {
2472 if ((CUR >= '0') && (CUR <= '9'))
2473 val = val * 16 + (CUR - '0');
2474 else if ((CUR >= 'a') && (CUR <= 'f'))
2475 val = val * 16 + (CUR - 'a') + 10;
2476 else if ((CUR >= 'A') && (CUR <= 'F'))
2477 val = val * 16 + (CUR - 'A') + 10;
2478 else {
2479 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2480 ctxt->sax->error(ctxt->userData,
2481 "htmlParseCharRef: invalid hexadecimal value\n");
2482 ctxt->wellFormed = 0;
2483 val = 0;
2484 break;
2485 }
2486 NEXT;
2487 }
2488 if (CUR == ';')
2489 NEXT;
2490 } else if ((CUR == '&') && (NXT(1) == '#')) {
2491 SKIP(2);
2492 while (CUR != ';') {
2493 if ((CUR >= '0') && (CUR <= '9'))
2494 val = val * 10 + (CUR - '0');
2495 else {
2496 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2497 ctxt->sax->error(ctxt->userData,
2498 "htmlParseCharRef: invalid decimal value\n");
2499 ctxt->wellFormed = 0;
2500 val = 0;
2501 break;
2502 }
2503 NEXT;
2504 }
2505 if (CUR == ';')
2506 NEXT;
2507 } else {
2508 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2509 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2510 ctxt->wellFormed = 0;
2511 }
2512 /*
2513 * Check the value IS_CHAR ...
2514 */
2515 if (IS_CHAR(val)) {
2516 return(val);
2517 } else {
2518 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002519 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002520 val);
2521 ctxt->wellFormed = 0;
2522 }
2523 return(0);
2524}
2525
2526
2527/**
2528 * htmlParseDocTypeDecl :
2529 * @ctxt: an HTML parser context
2530 *
2531 * parse a DOCTYPE declaration
2532 *
2533 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2534 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2535 */
2536
2537void
2538htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002539 xmlChar *name;
2540 xmlChar *ExternalID = NULL;
2541 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002542
2543 /*
2544 * We know that '<!DOCTYPE' has been detected.
2545 */
2546 SKIP(9);
2547
2548 SKIP_BLANKS;
2549
2550 /*
2551 * Parse the DOCTYPE name.
2552 */
2553 name = htmlParseName(ctxt);
2554 if (name == NULL) {
2555 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2556 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2557 ctxt->wellFormed = 0;
2558 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002559 /*
2560 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2561 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002562
2563 SKIP_BLANKS;
2564
2565 /*
2566 * Check for SystemID and ExternalID
2567 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002568 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002569 SKIP_BLANKS;
2570
2571 /*
2572 * We should be at the end of the DOCTYPE declaration.
2573 */
2574 if (CUR != '>') {
2575 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2576 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2577 ctxt->wellFormed = 0;
2578 /* We shouldn't try to resynchronize ... */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002579 }
2580 NEXT;
2581
2582 /*
Daniel Veillardd83eb822000-06-30 18:39:56 +00002583 * Create or update the document accordingly to the DOCTYPE
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002584 */
Daniel Veillardd83eb822000-06-30 18:39:56 +00002585 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2586 (!ctxt->disableSAX))
2587 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002588
2589 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002590 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002591 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002592 if (URI != NULL) xmlFree(URI);
2593 if (ExternalID != NULL) xmlFree(ExternalID);
2594 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002595}
2596
2597/**
2598 * htmlParseAttribute:
2599 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002600 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002601 *
2602 * parse an attribute
2603 *
2604 * [41] Attribute ::= Name Eq AttValue
2605 *
2606 * [25] Eq ::= S? '=' S?
2607 *
2608 * With namespace:
2609 *
2610 * [NS 11] Attribute ::= QName Eq AttValue
2611 *
2612 * Also the case QName == xmlns:??? is handled independently as a namespace
2613 * definition.
2614 *
2615 * Returns the attribute name, and the value in *value.
2616 */
2617
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002618xmlChar *
2619htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002620 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002621
2622 *value = NULL;
2623 name = htmlParseName(ctxt);
2624 if (name == NULL) {
2625 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2626 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2627 ctxt->wellFormed = 0;
2628 return(NULL);
2629 }
2630
2631 /*
2632 * read the value
2633 */
2634 SKIP_BLANKS;
2635 if (CUR == '=') {
2636 NEXT;
2637 SKIP_BLANKS;
2638 val = htmlParseAttValue(ctxt);
Daniel Veillardbe803962000-06-28 23:40:59 +00002639 /******
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002640 } else {
Daniel Veillardbe803962000-06-28 23:40:59 +00002641 * TODO : some attribute must have values, some may not
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002642 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002643 ctxt->sax->warning(ctxt->userData,
Daniel Veillardbe803962000-06-28 23:40:59 +00002644 "No value for attribute %s\n", name); */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002645 }
2646
2647 *value = val;
2648 return(name);
2649}
2650
2651/**
Daniel Veillard365e13b2000-07-02 07:56:37 +00002652 * htmlCheckEncoding:
2653 * @ctxt: an HTML parser context
2654 * @attvalue: the attribute value
2655 *
2656 * Checks an http-equiv attribute from a Meta tag to detect
2657 * the encoding
2658 * If a new encoding is detected the parser is switched to decode
2659 * it and pass UTF8
2660 */
2661void
2662htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2663 const xmlChar *encoding;
2664
2665 if ((ctxt == NULL) || (attvalue == NULL))
2666 return;
2667
Daniel Veillard365e13b2000-07-02 07:56:37 +00002668 encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
2669 if (encoding == NULL)
2670 encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
2671 if (encoding == NULL)
2672 encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
2673 if (encoding != NULL) {
2674 encoding += 8;
2675 } else {
2676 encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
2677 if (encoding == NULL)
2678 encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
2679 if (encoding == NULL)
2680 encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
2681 if (encoding != NULL)
2682 encoding += 9;
2683 }
2684 if (encoding != NULL) {
2685 xmlCharEncoding enc;
2686 xmlCharEncodingHandlerPtr handler;
2687
2688 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2689
2690 if (ctxt->input->encoding != NULL)
2691 xmlFree((xmlChar *) ctxt->input->encoding);
2692 ctxt->input->encoding = xmlStrdup(encoding);
2693
2694 enc = xmlParseCharEncoding((const char *) encoding);
2695 /*
2696 * registered set of known encodings
2697 */
2698 if (enc != XML_CHAR_ENCODING_ERROR) {
2699 xmlSwitchEncoding(ctxt, enc);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002700 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002701 } else {
2702 /*
2703 * fallback for unknown encodings
2704 */
2705 handler = xmlFindCharEncodingHandler((const char *) encoding);
2706 if (handler != NULL) {
2707 xmlSwitchToEncoding(ctxt, handler);
Daniel Veillard87b95392000-08-12 21:12:04 +00002708 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002709 } else {
2710 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2711 }
2712 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002713
2714 if ((ctxt->input->buf != NULL) &&
2715 (ctxt->input->buf->encoder != NULL) &&
2716 (ctxt->input->buf->raw != NULL) &&
2717 (ctxt->input->buf->buffer != NULL)) {
2718 int nbchars;
2719 int processed;
2720
2721 /*
2722 * convert as much as possible to the parser reading buffer.
2723 */
2724 processed = ctxt->input->cur - ctxt->input->base;
2725 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2726 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2727 ctxt->input->buf->buffer,
2728 ctxt->input->buf->raw);
2729 if (nbchars < 0) {
2730 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2731 ctxt->sax->error(ctxt->userData,
2732 "htmlCheckEncoding: encoder error\n");
2733 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2734 }
2735 ctxt->input->base =
2736 ctxt->input->cur = ctxt->input->buf->buffer->content;
2737 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002738 }
2739}
2740
2741/**
2742 * htmlCheckMeta:
2743 * @ctxt: an HTML parser context
2744 * @atts: the attributes values
2745 *
2746 * Checks an attributes from a Meta tag
2747 */
2748void
2749htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2750 int i;
2751 const xmlChar *att, *value;
2752 int http = 0;
2753 const xmlChar *content = NULL;
2754
2755 if ((ctxt == NULL) || (atts == NULL))
2756 return;
2757
2758 i = 0;
2759 att = atts[i++];
2760 while (att != NULL) {
2761 value = atts[i++];
2762 if ((value != NULL) &&
2763 ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
2764 (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
2765 (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
2766 ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
2767 (!xmlStrcmp(value, BAD_CAST"content-type")) ||
2768 (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
2769 http = 1;
2770 else if ((value != NULL) &&
2771 ((!xmlStrcmp(att, BAD_CAST"content")) ||
2772 (!xmlStrcmp(att, BAD_CAST"Content")) ||
2773 (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
2774 content = value;
2775 att = atts[i++];
2776 }
2777 if ((http) && (content != NULL))
2778 htmlCheckEncoding(ctxt, content);
2779
2780}
2781
2782/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002783 * htmlParseStartTag:
2784 * @ctxt: an HTML parser context
2785 *
2786 * parse a start of tag either for rule element or
2787 * EmptyElement. In both case we don't parse the tag closing chars.
2788 *
2789 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2790 *
2791 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2792 *
2793 * With namespace:
2794 *
2795 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2796 *
2797 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2798 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002799 */
2800
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002801void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002802htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002803 xmlChar *name;
2804 xmlChar *attname;
2805 xmlChar *attvalue;
2806 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002807 int nbatts = 0;
2808 int maxatts = 0;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002809 int meta = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002810 int i;
2811
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002812 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002813 NEXT;
2814
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002815 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002816 name = htmlParseHTMLName(ctxt);
2817 if (name == NULL) {
2818 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2819 ctxt->sax->error(ctxt->userData,
2820 "htmlParseStartTag: invalid element name\n");
2821 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002822 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002823 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002824 if (!xmlStrcmp(name, BAD_CAST"meta"))
2825 meta = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002826
2827 /*
2828 * Check for auto-closure of HTML elements.
2829 */
2830 htmlAutoClose(ctxt, name);
2831
2832 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00002833 * Check for implied HTML elements.
2834 */
2835 htmlCheckImplied(ctxt, name);
2836
2837 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002838 * Now parse the attributes, it ends up with the ending
2839 *
2840 * (S Attribute)* S?
2841 */
2842 SKIP_BLANKS;
2843 while ((IS_CHAR(CUR)) &&
2844 (CUR != '>') &&
2845 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002846 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002847
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002848 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002849 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002850 if (attname != NULL) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00002851
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002852 /*
2853 * Well formedness requires at most one declaration of an attribute
2854 */
2855 for (i = 0; i < nbatts;i += 2) {
2856 if (!xmlStrcmp(atts[i], attname)) {
2857 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002858 ctxt->sax->error(ctxt->userData,
2859 "Attribute %s redefined\n",
2860 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002861 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002862 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002863 if (attvalue != NULL)
2864 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002865 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002866 }
2867 }
2868
2869 /*
2870 * Add the pair to atts
2871 */
2872 if (atts == NULL) {
2873 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002874 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002875 if (atts == NULL) {
2876 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002877 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002878 if (name != NULL) xmlFree(name);
2879 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002880 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002881 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002882 maxatts *= 2;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002883 atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002884 if (atts == NULL) {
2885 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002886 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002887 if (name != NULL) xmlFree(name);
2888 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002889 }
2890 }
2891 atts[nbatts++] = attname;
2892 atts[nbatts++] = attvalue;
2893 atts[nbatts] = NULL;
2894 atts[nbatts + 1] = NULL;
2895 }
2896
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002897failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002898 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002899 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002900 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2901 ctxt->sax->error(ctxt->userData,
2902 "htmlParseStartTag: problem parsing attributes\n");
2903 ctxt->wellFormed = 0;
2904 break;
2905 }
2906 }
2907
2908 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00002909 * Handle specific association to the META tag
2910 */
2911 if (meta)
2912 htmlCheckMeta(ctxt, atts);
2913
2914 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002915 * SAX: Start of Element !
2916 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002917 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002918#ifdef DEBUG
2919 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2920#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002921 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2922 ctxt->sax->startElement(ctxt->userData, name, atts);
2923
2924 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002925 for (i = 0;i < nbatts;i++) {
2926 if (atts[i] != NULL)
2927 xmlFree((xmlChar *) atts[i]);
2928 }
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00002929 xmlFree((void *) atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002930 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002931 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002932}
2933
2934/**
2935 * htmlParseEndTag:
2936 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002937 *
2938 * parse an end of tag
2939 *
2940 * [42] ETag ::= '</' Name S? '>'
2941 *
2942 * With namespace
2943 *
2944 * [NS 9] ETag ::= '</' QName S? '>'
2945 */
2946
2947void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002948htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002949 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002950 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002951 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002952
2953 if ((CUR != '<') || (NXT(1) != '/')) {
2954 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2955 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2956 ctxt->wellFormed = 0;
2957 return;
2958 }
2959 SKIP(2);
2960
2961 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002962 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002963
2964 /*
2965 * We should definitely be at the ending "S? '>'" part
2966 */
2967 SKIP_BLANKS;
2968 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2969 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2970 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2971 ctxt->wellFormed = 0;
2972 } else
2973 NEXT;
2974
2975 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002976 * If the name read is not one of the element in the parsing stack
2977 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002978 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002979 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2980 if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002981 }
2982 if (i < 0) {
2983 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002984 ctxt->sax->error(ctxt->userData,
2985 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002986 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002987 ctxt->wellFormed = 0;
2988 return;
2989 }
2990
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002991
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002992 /*
2993 * Check for auto-closure of HTML elements.
2994 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002995
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002996 htmlAutoCloseOnClose(ctxt, name);
2997
2998 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002999 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003000 * With the exception that the autoclose may have popped stuff out
3001 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003002 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003003 if (xmlStrcmp(name, ctxt->name)) {
3004#ifdef DEBUG
3005 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
3006#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003007 if ((ctxt->name != NULL) &&
3008 (xmlStrcmp(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003009 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3010 ctxt->sax->error(ctxt->userData,
3011 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003012 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003013 ctxt->wellFormed = 0;
3014 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003015 }
3016
3017 /*
3018 * SAX: End of Tag
3019 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003020 oldname = ctxt->name;
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003021 if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003022 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3023 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003024 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003025 if (oldname != NULL) {
3026#ifdef DEBUG
3027 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
3028#endif
3029 xmlFree(oldname);
3030#ifdef DEBUG
3031 } else {
3032 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
3033#endif
3034 }
3035 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003036
3037 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00003038 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003039
3040 return;
3041}
3042
3043
3044/**
3045 * htmlParseReference:
3046 * @ctxt: an HTML parser context
3047 *
3048 * parse and handle entity references in content,
3049 * this will end-up in a call to character() since this is either a
3050 * CharRef, or a predefined entity.
3051 */
3052void
3053htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003054 htmlEntityDescPtr ent;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003055 xmlChar out[6];
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003056 xmlChar *name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003057 if (CUR != '&') return;
3058
3059 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003060 unsigned int c;
3061 int bits, i = 0;
3062
3063 c = htmlParseCharRef(ctxt);
3064 if (c < 0x80) { out[i++]= c; bits= -6; }
3065 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3066 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3067 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3068
3069 for ( ; bits >= 0; bits-= 6) {
3070 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3071 }
3072 out[i] = 0;
3073
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003074 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003075 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003076 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003077 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003078 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003079 if (name == NULL) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003080 htmlCheckParagraph(ctxt);
Daniel Veillard1255ab72000-08-14 15:13:33 +00003081 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3082 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003083 return;
3084 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003085 if ((ent == NULL) || (ent->value <= 0)) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003086 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003087 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00003088 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003089 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00003090 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003091 }
3092 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003093 unsigned int c;
3094 int bits, i = 0;
3095
3096 c = ent->value;
3097 if (c < 0x80)
3098 { out[i++]= c; bits= -6; }
3099 else if (c < 0x800)
3100 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3101 else if (c < 0x10000)
3102 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3103 else
3104 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3105
3106 for ( ; bits >= 0; bits-= 6) {
3107 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3108 }
3109 out[i] = 0;
3110
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003111 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003112 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003113 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003114 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00003115 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003116 }
3117}
3118
3119/**
3120 * htmlParseContent:
3121 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003122 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003123 *
3124 * Parse a content: comment, sub-element, reference or text.
3125 *
3126 */
3127
3128void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003129htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003130 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003131 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003132
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003133 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003134 depth = ctxt->nameNr;
3135 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003136 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003137
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003138 GROW;
3139 /*
3140 * Our tag or one of it's parent or children is ending.
3141 */
3142 if ((CUR == '<') && (NXT(1) == '/')) {
3143 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003144 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003145 return;
3146 }
3147
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003148 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003149 * Has this node been popped out during parsing of
3150 * the next element
3151 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003152 if ((xmlStrcmp(currentNode, ctxt->name)) &&
3153 (depth >= ctxt->nameNr)) {
3154 if (currentNode != NULL) xmlFree(currentNode);
3155 return;
3156 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003157
3158 /*
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003159 * Sometimes DOCTYPE arrives in the middle of the document
3160 */
3161 if ((CUR == '<') && (NXT(1) == '!') &&
3162 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3163 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3164 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3165 (UPP(8) == 'E')) {
3166 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3167 ctxt->sax->error(ctxt->userData,
3168 "Misplaced DOCTYPE declaration\n");
3169 ctxt->wellFormed = 0;
3170 htmlParseDocTypeDecl(ctxt);
3171 }
3172
3173 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003174 * First case : a comment
3175 */
3176 if ((CUR == '<') && (NXT(1) == '!') &&
3177 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003178 htmlParseComment(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003179 }
3180
3181 /*
3182 * Second case : a sub-element.
3183 */
3184 else if (CUR == '<') {
3185 htmlParseElement(ctxt);
3186 }
3187
3188 /*
3189 * Third case : a reference. If if has not been resolved,
3190 * parsing returns it's Name, create the node
3191 */
3192 else if (CUR == '&') {
3193 htmlParseReference(ctxt);
3194 }
3195
3196 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003197 * Fourth : end of the resource
3198 */
3199 else if (CUR == 0) {
3200 htmlAutoClose(ctxt, NULL);
3201 }
3202
3203 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003204 * Last case, text. Note that References are handled directly.
3205 */
3206 else {
3207 htmlParseCharData(ctxt, 0);
3208 }
3209
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003210 if (cons == ctxt->nbChars) {
Daniel Veillard35008381999-10-25 13:15:52 +00003211 if (ctxt->node != NULL) {
3212 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3213 ctxt->sax->error(ctxt->userData,
3214 "detected an error in element content\n");
3215 ctxt->wellFormed = 0;
3216 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003217 break;
3218 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003219
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003220 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003221 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003222 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003223}
3224
3225/**
3226 * htmlParseElement:
3227 * @ctxt: an HTML parser context
3228 *
3229 * parse an HTML element, this is highly recursive
3230 *
3231 * [39] element ::= EmptyElemTag | STag content ETag
3232 *
3233 * [41] Attribute ::= Name Eq AttValue
3234 */
3235
3236void
3237htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003238 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00003239 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003240 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003241 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003242 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003243 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003244
3245 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003246 if (ctxt->record_info) {
3247 node_info.begin_pos = ctxt->input->consumed +
3248 (CUR_PTR - ctxt->input->base);
3249 node_info.begin_line = ctxt->input->line;
3250 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003251
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003252 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003253 htmlParseStartTag(ctxt);
3254 name = ctxt->name;
3255#ifdef DEBUG
3256 if (oldname == NULL)
3257 fprintf(stderr, "Start of element %s\n", name);
3258 else if (name == NULL)
3259 fprintf(stderr, "Start of element failed, was %s\n", oldname);
3260 else
3261 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
3262#endif
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003263 if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003264 (name == NULL)) {
3265 if (CUR == '>')
3266 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003267 if (oldname != NULL)
3268 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003269 return;
3270 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003271 if (oldname != NULL)
3272 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003273
3274 /*
3275 * Lookup the info for that element.
3276 */
3277 info = htmlTagLookup(name);
3278 if (info == NULL) {
3279 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3280 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3281 name);
3282 ctxt->wellFormed = 0;
3283 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003284/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003285 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3286 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3287 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003288 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003289 }
3290
3291 /*
3292 * Check for an Empty Element labelled the XML/SGML way
3293 */
3294 if ((CUR == '/') && (NXT(1) == '>')) {
3295 SKIP(2);
3296 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3297 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003298 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003299#ifdef DEBUG
3300 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
3301#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003302 if (oldname != NULL)
3303 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003304 return;
3305 }
3306
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003307 if (CUR == '>') {
3308 NEXT;
3309 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003310 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard87b95392000-08-12 21:12:04 +00003311 ctxt->sax->error(ctxt->userData,
3312 "Couldn't find end of Start Tag %s\n",
3313 name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003314 ctxt->wellFormed = 0;
3315
3316 /*
3317 * end of parsing of this node.
3318 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003319 if (!xmlStrcmp(name, ctxt->name)) {
3320 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003321 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003322#ifdef DEBUG
3323 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
3324#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003325 if (oldname != NULL)
3326 xmlFree(oldname);
3327 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003328
3329 /*
3330 * Capture end position and add node
3331 */
3332 if ( currentNode != NULL && ctxt->record_info ) {
3333 node_info.end_pos = ctxt->input->consumed +
3334 (CUR_PTR - ctxt->input->base);
3335 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003336 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003337 xmlParserAddNodeInfo(ctxt, &node_info);
3338 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003339 return;
3340 }
3341
3342 /*
3343 * Check for an Empty Element from DTD definition
3344 */
3345 if ((info != NULL) && (info->empty)) {
3346 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3347 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003348 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003349#ifdef DEBUG
3350 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3351#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003352 if (oldname != NULL)
3353 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003354 return;
3355 }
3356
3357 /*
3358 * Parse the content of the element:
3359 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003360 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003361 depth = ctxt->nameNr;
3362 while (IS_CHAR(CUR)) {
3363 htmlParseContent(ctxt);
3364 if (ctxt->nameNr < depth) break;
3365 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003366
3367 if (!IS_CHAR(CUR)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003368 /************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003369 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3370 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003371 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003372 ctxt->wellFormed = 0;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003373 *************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003374
3375 /*
3376 * end of parsing of this node.
3377 */
3378 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003379 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003380#ifdef DEBUG
3381 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
3382#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003383 if (oldname != NULL)
3384 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003385 if (currentNode != NULL)
3386 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003387 return;
3388 }
3389
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003390 /*
3391 * Capture end position and add node
3392 */
3393 if ( currentNode != NULL && ctxt->record_info ) {
3394 node_info.end_pos = ctxt->input->consumed +
3395 (CUR_PTR - ctxt->input->base);
3396 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003397 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003398 xmlParserAddNodeInfo(ctxt, &node_info);
3399 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003400 if (currentNode != NULL)
3401 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003402}
3403
3404/**
3405 * htmlParseDocument :
3406 * @ctxt: an HTML parser context
3407 *
3408 * parse an HTML document (and build a tree if using the standard SAX
3409 * interface).
3410 *
3411 * Returns 0, -1 in case of error. the parser context is augmented
3412 * as a result of the parsing.
3413 */
3414
3415int
3416htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003417 xmlDtdPtr dtd;
3418
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003419 htmlDefaultSAXHandlerInit();
3420 ctxt->html = 1;
3421
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003422 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003423 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00003424 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003425 */
3426 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3427 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3428
3429 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003430 * Wipe out everything which is before the first '<'
3431 */
Daniel Veillard35008381999-10-25 13:15:52 +00003432 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003433 if (CUR == 0) {
3434 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3435 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3436 ctxt->wellFormed = 0;
3437 }
3438
Daniel Veillardbe803962000-06-28 23:40:59 +00003439 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3440 ctxt->sax->startDocument(ctxt->userData);
3441
3442
Daniel Veillard35008381999-10-25 13:15:52 +00003443 /*
3444 * Parse possible comments before any content
3445 */
3446 while ((CUR == '<') && (NXT(1) == '!') &&
3447 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003448 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003449 SKIP_BLANKS;
3450 }
3451
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003452
3453 /*
3454 * Then possibly doc type declaration(s) and more Misc
3455 * (doctypedecl Misc*)?
3456 */
3457 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003458 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3459 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3460 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3461 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003462 htmlParseDocTypeDecl(ctxt);
3463 }
3464 SKIP_BLANKS;
3465
3466 /*
Daniel Veillard87b95392000-08-12 21:12:04 +00003467 * Parse possible comments before any content
3468 */
3469 while ((CUR == '<') && (NXT(1) == '!') &&
3470 (NXT(2) == '-') && (NXT(3) == '-')) {
3471 htmlParseComment(ctxt);
3472 SKIP_BLANKS;
3473 }
3474
3475 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003476 * Time to start parsing the tree itself
3477 */
Daniel Veillard35008381999-10-25 13:15:52 +00003478 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003479
3480 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003481 * autoclose
3482 */
3483 if (CUR == 0)
3484 htmlAutoClose(ctxt, NULL);
3485
3486
3487 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003488 * SAX: end of the document processing.
3489 */
3490 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3491 ctxt->sax->endDocument(ctxt->userData);
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003492
3493 if (ctxt->myDoc != NULL) {
3494 dtd = xmlGetIntSubset(ctxt->myDoc);
3495 if (dtd == NULL)
3496 ctxt->myDoc->intSubset =
3497 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3498 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3499 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3500 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003501 if (! ctxt->wellFormed) return(-1);
3502 return(0);
3503}
3504
3505
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003506/************************************************************************
3507 * *
3508 * Parser contexts handling *
3509 * *
3510 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003511
3512/**
3513 * xmlInitParserCtxt:
3514 * @ctxt: an HTML parser context
3515 *
3516 * Initialize a parser context
3517 */
3518
3519void
3520htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3521{
3522 htmlSAXHandler *sax;
3523
Daniel Veillard35008381999-10-25 13:15:52 +00003524 if (ctxt == NULL) return;
3525 memset(ctxt, 0, sizeof(htmlParserCtxt));
3526
Daniel Veillard6454aec1999-09-02 22:04:43 +00003527 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003528 if (sax == NULL) {
3529 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3530 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003531 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003532
3533 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003534 ctxt->inputTab = (htmlParserInputPtr *)
3535 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3536 if (ctxt->inputTab == NULL) {
3537 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3538 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003539 ctxt->inputNr = 0;
3540 ctxt->inputMax = 5;
3541 ctxt->input = NULL;
3542 ctxt->version = NULL;
3543 ctxt->encoding = NULL;
3544 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003545 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003546
3547 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00003548 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003549 ctxt->nodeNr = 0;
3550 ctxt->nodeMax = 10;
3551 ctxt->node = NULL;
3552
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003553 /* Allocate the Name stack */
3554 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3555 ctxt->nameNr = 0;
3556 ctxt->nameMax = 10;
3557 ctxt->name = NULL;
3558
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003559 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3560 else {
3561 ctxt->sax = sax;
3562 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3563 }
3564 ctxt->userData = ctxt;
3565 ctxt->myDoc = NULL;
3566 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003567 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003568 ctxt->html = 1;
3569 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00003570 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003571 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003572 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003573 xmlInitNodeInfoSeq(&ctxt->node_seq);
3574}
3575
3576/**
3577 * htmlFreeParserCtxt:
3578 * @ctxt: an HTML parser context
3579 *
3580 * Free all the memory used by a parser context. However the parsed
3581 * document in ctxt->myDoc is not freed.
3582 */
3583
3584void
3585htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3586{
Daniel Veillard365e13b2000-07-02 07:56:37 +00003587 xmlFreeParserCtxt(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003588}
3589
3590/**
3591 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003592 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003593 * @encoding: a free form C string describing the HTML document encoding, or NULL
3594 *
3595 * Create a parser context for an HTML document.
3596 *
3597 * Returns the new parser context or NULL
3598 */
3599htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003600htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003601 htmlParserCtxtPtr ctxt;
3602 htmlParserInputPtr input;
3603 /* htmlCharEncoding enc; */
3604
Daniel Veillard6454aec1999-09-02 22:04:43 +00003605 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003606 if (ctxt == NULL) {
3607 perror("malloc");
3608 return(NULL);
3609 }
3610 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003611 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003612 if (input == NULL) {
3613 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003614 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003615 return(NULL);
3616 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003617 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003618
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003619 input->line = 1;
3620 input->col = 1;
3621 input->base = cur;
3622 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003623
3624 inputPush(ctxt, input);
3625 return(ctxt);
3626}
3627
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003628/************************************************************************
3629 * *
3630 * Progressive parsing interfaces *
3631 * *
3632 ************************************************************************/
3633
3634/**
3635 * htmlParseLookupSequence:
3636 * @ctxt: an HTML parser context
3637 * @first: the first char to lookup
3638 * @next: the next char to lookup or zero
3639 * @third: the next char to lookup or zero
3640 *
3641 * Try to find if a sequence (first, next, third) or just (first next) or
3642 * (first) is available in the input stream.
3643 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3644 * to avoid rescanning sequences of bytes, it DOES change the state of the
3645 * parser, do not use liberally.
3646 * This is basically similar to xmlParseLookupSequence()
3647 *
3648 * Returns the index to the current parsing point if the full sequence
3649 * is available, -1 otherwise.
3650 */
3651int
3652htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3653 xmlChar next, xmlChar third) {
3654 int base, len;
3655 htmlParserInputPtr in;
3656 const xmlChar *buf;
3657
3658 in = ctxt->input;
3659 if (in == NULL) return(-1);
3660 base = in->cur - in->base;
3661 if (base < 0) return(-1);
3662 if (ctxt->checkIndex > base)
3663 base = ctxt->checkIndex;
3664 if (in->buf == NULL) {
3665 buf = in->base;
3666 len = in->length;
3667 } else {
3668 buf = in->buf->buffer->content;
3669 len = in->buf->buffer->use;
3670 }
3671 /* take into account the sequence length */
3672 if (third) len -= 2;
3673 else if (next) len --;
3674 for (;base < len;base++) {
3675 if (buf[base] == first) {
3676 if (third != 0) {
3677 if ((buf[base + 1] != next) ||
3678 (buf[base + 2] != third)) continue;
3679 } else if (next != 0) {
3680 if (buf[base + 1] != next) continue;
3681 }
3682 ctxt->checkIndex = 0;
3683#ifdef DEBUG_PUSH
3684 if (next == 0)
3685 fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3686 first, base);
3687 else if (third == 0)
3688 fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3689 first, next, base);
3690 else
3691 fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3692 first, next, third, base);
3693#endif
3694 return(base - (in->cur - in->base));
3695 }
3696 }
3697 ctxt->checkIndex = base;
3698#ifdef DEBUG_PUSH
3699 if (next == 0)
3700 fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3701 else if (third == 0)
3702 fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3703 else
3704 fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3705#endif
3706 return(-1);
3707}
3708
3709/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003710 * htmlParseTryOrFinish:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003711 * @ctxt: an HTML parser context
Daniel Veillard71b656e2000-01-05 14:46:17 +00003712 * @terminate: last chunk indicator
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003713 *
3714 * Try to progress on parsing
3715 *
3716 * Returns zero if no parsing was possible
3717 */
3718int
Daniel Veillard71b656e2000-01-05 14:46:17 +00003719htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003720 int ret = 0;
3721 htmlParserInputPtr in;
Daniel Veillard365e13b2000-07-02 07:56:37 +00003722 int avail = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003723 xmlChar cur, next;
3724
3725#ifdef DEBUG_PUSH
3726 switch (ctxt->instate) {
3727 case XML_PARSER_EOF:
3728 fprintf(stderr, "HPP: try EOF\n"); break;
3729 case XML_PARSER_START:
3730 fprintf(stderr, "HPP: try START\n"); break;
3731 case XML_PARSER_MISC:
3732 fprintf(stderr, "HPP: try MISC\n");break;
3733 case XML_PARSER_COMMENT:
3734 fprintf(stderr, "HPP: try COMMENT\n");break;
3735 case XML_PARSER_PROLOG:
3736 fprintf(stderr, "HPP: try PROLOG\n");break;
3737 case XML_PARSER_START_TAG:
3738 fprintf(stderr, "HPP: try START_TAG\n");break;
3739 case XML_PARSER_CONTENT:
3740 fprintf(stderr, "HPP: try CONTENT\n");break;
3741 case XML_PARSER_CDATA_SECTION:
3742 fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3743 case XML_PARSER_END_TAG:
3744 fprintf(stderr, "HPP: try END_TAG\n");break;
3745 case XML_PARSER_ENTITY_DECL:
3746 fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3747 case XML_PARSER_ENTITY_VALUE:
3748 fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3749 case XML_PARSER_ATTRIBUTE_VALUE:
3750 fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3751 case XML_PARSER_DTD:
3752 fprintf(stderr, "HPP: try DTD\n");break;
3753 case XML_PARSER_EPILOG:
3754 fprintf(stderr, "HPP: try EPILOG\n");break;
3755 case XML_PARSER_PI:
3756 fprintf(stderr, "HPP: try PI\n");break;
3757 }
3758#endif
3759
3760 while (1) {
3761
3762 in = ctxt->input;
3763 if (in == NULL) break;
3764 if (in->buf == NULL)
3765 avail = in->length - (in->cur - in->base);
3766 else
3767 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard365e13b2000-07-02 07:56:37 +00003768 if ((avail == 0) && (terminate)) {
3769 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00003770 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3771 /*
3772 * SAX: end of the document processing.
3773 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00003774 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00003775 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3776 ctxt->sax->endDocument(ctxt->userData);
3777 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00003778 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003779 if (avail < 1)
3780 goto done;
3781 switch (ctxt->instate) {
3782 case XML_PARSER_EOF:
3783 /*
3784 * Document parsing is done !
3785 */
3786 goto done;
3787 case XML_PARSER_START:
3788 /*
3789 * Very first chars read from the document flow.
3790 */
3791 cur = in->cur[0];
3792 if (IS_BLANK(cur)) {
3793 SKIP_BLANKS;
3794 if (in->buf == NULL)
3795 avail = in->length - (in->cur - in->base);
3796 else
3797 avail = in->buf->buffer->use - (in->cur - in->base);
3798 }
3799 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3800 ctxt->sax->setDocumentLocator(ctxt->userData,
3801 &xmlDefaultSAXLocator);
Daniel Veillardd83eb822000-06-30 18:39:56 +00003802 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3803 (!ctxt->disableSAX))
3804 ctxt->sax->startDocument(ctxt->userData);
3805
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003806 cur = in->cur[0];
3807 next = in->cur[1];
3808 if ((cur == '<') && (next == '!') &&
3809 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3810 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3811 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3812 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003813 if ((!terminate) &&
3814 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003815 goto done;
3816#ifdef DEBUG_PUSH
3817 fprintf(stderr, "HPP: Parsing internal subset\n");
3818#endif
3819 htmlParseDocTypeDecl(ctxt);
3820 ctxt->instate = XML_PARSER_PROLOG;
3821#ifdef DEBUG_PUSH
3822 fprintf(stderr, "HPP: entering PROLOG\n");
3823#endif
3824 } else {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003825 ctxt->instate = XML_PARSER_MISC;
3826 }
3827#ifdef DEBUG_PUSH
3828 fprintf(stderr, "HPP: entering MISC\n");
3829#endif
3830 break;
3831 case XML_PARSER_MISC:
3832 SKIP_BLANKS;
3833 if (in->buf == NULL)
3834 avail = in->length - (in->cur - in->base);
3835 else
3836 avail = in->buf->buffer->use - (in->cur - in->base);
3837 if (avail < 2)
3838 goto done;
3839 cur = in->cur[0];
3840 next = in->cur[1];
3841 if ((cur == '<') && (next == '!') &&
3842 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003843 if ((!terminate) &&
3844 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003845 goto done;
3846#ifdef DEBUG_PUSH
3847 fprintf(stderr, "HPP: Parsing Comment\n");
3848#endif
3849 htmlParseComment(ctxt);
3850 ctxt->instate = XML_PARSER_MISC;
3851 } else if ((cur == '<') && (next == '!') &&
3852 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3853 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3854 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3855 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003856 if ((!terminate) &&
3857 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003858 goto done;
3859#ifdef DEBUG_PUSH
3860 fprintf(stderr, "HPP: Parsing internal subset\n");
3861#endif
3862 htmlParseDocTypeDecl(ctxt);
3863 ctxt->instate = XML_PARSER_PROLOG;
3864#ifdef DEBUG_PUSH
3865 fprintf(stderr, "HPP: entering PROLOG\n");
3866#endif
3867 } else if ((cur == '<') && (next == '!') &&
3868 (avail < 9)) {
3869 goto done;
3870 } else {
3871 ctxt->instate = XML_PARSER_START_TAG;
3872#ifdef DEBUG_PUSH
3873 fprintf(stderr, "HPP: entering START_TAG\n");
3874#endif
3875 }
3876 break;
3877 case XML_PARSER_PROLOG:
3878 SKIP_BLANKS;
3879 if (in->buf == NULL)
3880 avail = in->length - (in->cur - in->base);
3881 else
3882 avail = in->buf->buffer->use - (in->cur - in->base);
3883 if (avail < 2)
3884 goto done;
3885 cur = in->cur[0];
3886 next = in->cur[1];
3887 if ((cur == '<') && (next == '!') &&
3888 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003889 if ((!terminate) &&
3890 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003891 goto done;
3892#ifdef DEBUG_PUSH
3893 fprintf(stderr, "HPP: Parsing Comment\n");
3894#endif
3895 htmlParseComment(ctxt);
3896 ctxt->instate = XML_PARSER_PROLOG;
3897 } else if ((cur == '<') && (next == '!') &&
3898 (avail < 4)) {
3899 goto done;
3900 } else {
3901 ctxt->instate = XML_PARSER_START_TAG;
3902#ifdef DEBUG_PUSH
3903 fprintf(stderr, "HPP: entering START_TAG\n");
3904#endif
3905 }
3906 break;
3907 case XML_PARSER_EPILOG:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003908 if (in->buf == NULL)
3909 avail = in->length - (in->cur - in->base);
3910 else
3911 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard87b95392000-08-12 21:12:04 +00003912 if (avail < 1)
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003913 goto done;
3914 cur = in->cur[0];
Daniel Veillard87b95392000-08-12 21:12:04 +00003915 if (IS_BLANK(cur)) {
3916 htmlParseCharData(ctxt, 0);
3917 goto done;
3918 }
3919 if (avail < 2)
3920 goto done;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003921 next = in->cur[1];
3922 if ((cur == '<') && (next == '!') &&
3923 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003924 if ((!terminate) &&
3925 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003926 goto done;
3927#ifdef DEBUG_PUSH
3928 fprintf(stderr, "HPP: Parsing Comment\n");
3929#endif
3930 htmlParseComment(ctxt);
3931 ctxt->instate = XML_PARSER_EPILOG;
3932 } else if ((cur == '<') && (next == '!') &&
3933 (avail < 4)) {
3934 goto done;
3935 } else {
3936 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3937 ctxt->sax->error(ctxt->userData,
3938 "Extra content at the end of the document\n");
3939 ctxt->wellFormed = 0;
3940 ctxt->errNo = XML_ERR_DOCUMENT_END;
3941 ctxt->instate = XML_PARSER_EOF;
3942#ifdef DEBUG_PUSH
3943 fprintf(stderr, "HPP: entering EOF\n");
3944#endif
3945 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3946 ctxt->sax->endDocument(ctxt->userData);
3947 goto done;
3948 }
3949 break;
3950 case XML_PARSER_START_TAG: {
3951 xmlChar *name, *oldname;
3952 int depth = ctxt->nameNr;
3953 htmlElemDescPtr info;
3954
3955 if (avail < 2)
3956 goto done;
3957 cur = in->cur[0];
3958 if (cur != '<') {
3959 ctxt->instate = XML_PARSER_CONTENT;
3960#ifdef DEBUG_PUSH
3961 fprintf(stderr, "HPP: entering CONTENT\n");
3962#endif
3963 break;
3964 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00003965 if ((!terminate) &&
3966 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003967 goto done;
3968
3969 oldname = xmlStrdup(ctxt->name);
3970 htmlParseStartTag(ctxt);
3971 name = ctxt->name;
3972#ifdef DEBUG
3973 if (oldname == NULL)
3974 fprintf(stderr, "Start of element %s\n", name);
3975 else if (name == NULL)
3976 fprintf(stderr, "Start of element failed, was %s\n",
3977 oldname);
3978 else
3979 fprintf(stderr, "Start of element %s, was %s\n",
3980 name, oldname);
3981#endif
3982 if (((depth == ctxt->nameNr) &&
3983 (!xmlStrcmp(oldname, ctxt->name))) ||
3984 (name == NULL)) {
3985 if (CUR == '>')
3986 NEXT;
3987 if (oldname != NULL)
3988 xmlFree(oldname);
3989 break;
3990 }
3991 if (oldname != NULL)
3992 xmlFree(oldname);
3993
3994 /*
3995 * Lookup the info for that element.
3996 */
3997 info = htmlTagLookup(name);
3998 if (info == NULL) {
3999 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4000 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4001 name);
4002 ctxt->wellFormed = 0;
4003 } else if (info->depr) {
4004 /***************************
4005 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4006 ctxt->sax->warning(ctxt->userData,
4007 "Tag %s is deprecated\n",
4008 name);
4009 ***************************/
4010 }
4011
4012 /*
4013 * Check for an Empty Element labelled the XML/SGML way
4014 */
4015 if ((CUR == '/') && (NXT(1) == '>')) {
4016 SKIP(2);
4017 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4018 ctxt->sax->endElement(ctxt->userData, name);
4019 oldname = htmlnamePop(ctxt);
4020#ifdef DEBUG
4021 fprintf(stderr,"End of tag the XML way: popping out %s\n",
4022 oldname);
4023#endif
4024 if (oldname != NULL)
4025 xmlFree(oldname);
4026 ctxt->instate = XML_PARSER_CONTENT;
4027#ifdef DEBUG_PUSH
4028 fprintf(stderr, "HPP: entering CONTENT\n");
4029#endif
4030 break;
4031 }
4032
4033 if (CUR == '>') {
4034 NEXT;
4035 } else {
4036 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4037 ctxt->sax->error(ctxt->userData,
4038 "Couldn't find end of Start Tag %s\n",
4039 name);
4040 ctxt->wellFormed = 0;
4041
4042 /*
4043 * end of parsing of this node.
4044 */
4045 if (!xmlStrcmp(name, ctxt->name)) {
4046 nodePop(ctxt);
4047 oldname = htmlnamePop(ctxt);
4048#ifdef DEBUG
4049 fprintf(stderr,
4050 "End of start tag problem: popping out %s\n", oldname);
4051#endif
4052 if (oldname != NULL)
4053 xmlFree(oldname);
4054 }
4055
4056 ctxt->instate = XML_PARSER_CONTENT;
4057#ifdef DEBUG_PUSH
4058 fprintf(stderr, "HPP: entering CONTENT\n");
4059#endif
4060 break;
4061 }
4062
4063 /*
4064 * Check for an Empty Element from DTD definition
4065 */
4066 if ((info != NULL) && (info->empty)) {
4067 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4068 ctxt->sax->endElement(ctxt->userData, name);
4069 oldname = htmlnamePop(ctxt);
4070#ifdef DEBUG
4071 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
4072#endif
4073 if (oldname != NULL)
4074 xmlFree(oldname);
4075 }
4076 ctxt->instate = XML_PARSER_CONTENT;
4077#ifdef DEBUG_PUSH
4078 fprintf(stderr, "HPP: entering CONTENT\n");
4079#endif
4080 break;
4081 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004082 case XML_PARSER_CONTENT: {
4083 long cons;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004084 /*
4085 * Handle preparsed entities and charRef
4086 */
4087 if (ctxt->token != 0) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00004088 xmlChar chr[2] = { 0 , 0 } ;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004089
Daniel Veillard365e13b2000-07-02 07:56:37 +00004090 chr[0] = (xmlChar) ctxt->token;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004091 htmlCheckParagraph(ctxt);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004092 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard365e13b2000-07-02 07:56:37 +00004093 ctxt->sax->characters(ctxt->userData, chr, 1);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004094 ctxt->token = 0;
4095 ctxt->checkIndex = 0;
4096 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004097 if ((avail == 1) && (terminate)) {
4098 cur = in->cur[0];
4099 if ((cur != '<') && (cur != '&')) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004100 if (ctxt->sax != NULL) {
4101 if (IS_BLANK(cur)) {
4102 if (ctxt->sax->ignorableWhitespace != NULL)
4103 ctxt->sax->ignorableWhitespace(
4104 ctxt->userData, &cur, 1);
4105 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004106 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004107 if (ctxt->sax->characters != NULL)
4108 ctxt->sax->characters(
4109 ctxt->userData, &cur, 1);
4110 }
4111 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004112 ctxt->token = 0;
4113 ctxt->checkIndex = 0;
4114 NEXT;
4115 }
4116 break;
4117 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004118 if (avail < 2)
4119 goto done;
4120 cur = in->cur[0];
4121 next = in->cur[1];
Daniel Veillard87b95392000-08-12 21:12:04 +00004122 cons = ctxt->nbChars;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004123 /*
4124 * Sometimes DOCTYPE arrives in the middle of the document
4125 */
4126 if ((cur == '<') && (next == '!') &&
4127 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4128 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4129 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4130 (UPP(8) == 'E')) {
4131 if ((!terminate) &&
4132 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4133 goto done;
4134 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4135 ctxt->sax->error(ctxt->userData,
4136 "Misplaced DOCTYPE declaration\n");
4137 ctxt->wellFormed = 0;
4138 htmlParseDocTypeDecl(ctxt);
4139 } else if ((cur == '<') && (next == '!') &&
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004140 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004141 if ((!terminate) &&
4142 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004143 goto done;
4144#ifdef DEBUG_PUSH
4145 fprintf(stderr, "HPP: Parsing Comment\n");
4146#endif
4147 htmlParseComment(ctxt);
4148 ctxt->instate = XML_PARSER_CONTENT;
4149 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4150 goto done;
4151 } else if ((cur == '<') && (next == '/')) {
4152 ctxt->instate = XML_PARSER_END_TAG;
4153 ctxt->checkIndex = 0;
4154#ifdef DEBUG_PUSH
4155 fprintf(stderr, "HPP: entering END_TAG\n");
4156#endif
4157 break;
4158 } else if (cur == '<') {
4159 ctxt->instate = XML_PARSER_START_TAG;
4160 ctxt->checkIndex = 0;
4161#ifdef DEBUG_PUSH
4162 fprintf(stderr, "HPP: entering START_TAG\n");
4163#endif
4164 break;
4165 } else if (cur == '&') {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004166 if ((!terminate) &&
4167 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004168 goto done;
4169#ifdef DEBUG_PUSH
4170 fprintf(stderr, "HPP: Parsing Reference\n");
4171#endif
4172 /* TODO: check generation of subtrees if noent !!! */
4173 htmlParseReference(ctxt);
4174 } else {
4175 /* TODO Avoid the extra copy, handle directly !!!!!! */
4176 /*
4177 * Goal of the following test is :
4178 * - minimize calls to the SAX 'character' callback
4179 * when they are mergeable
4180 */
4181 if ((ctxt->inputNr == 1) &&
4182 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004183 if ((!terminate) &&
4184 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004185 goto done;
4186 }
4187 ctxt->checkIndex = 0;
4188#ifdef DEBUG_PUSH
4189 fprintf(stderr, "HPP: Parsing char data\n");
4190#endif
4191 htmlParseCharData(ctxt, 0);
4192 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004193 if (cons == ctxt->nbChars) {
4194 if (ctxt->node != NULL) {
4195 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4196 ctxt->sax->error(ctxt->userData,
4197 "detected an error in element content\n");
4198 ctxt->wellFormed = 0;
4199 NEXT;
4200 }
4201 break;
4202 }
4203
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004204 break;
Daniel Veillard87b95392000-08-12 21:12:04 +00004205 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004206 case XML_PARSER_END_TAG:
4207 if (avail < 2)
4208 goto done;
Daniel Veillard71b656e2000-01-05 14:46:17 +00004209 if ((!terminate) &&
4210 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004211 goto done;
4212 htmlParseEndTag(ctxt);
4213 if (ctxt->nameNr == 0) {
4214 ctxt->instate = XML_PARSER_EPILOG;
4215 } else {
4216 ctxt->instate = XML_PARSER_CONTENT;
4217 }
4218 ctxt->checkIndex = 0;
4219#ifdef DEBUG_PUSH
4220 fprintf(stderr, "HPP: entering CONTENT\n");
4221#endif
4222 break;
4223 case XML_PARSER_CDATA_SECTION:
4224 fprintf(stderr, "HPP: internal error, state == CDATA\n");
4225 ctxt->instate = XML_PARSER_CONTENT;
4226 ctxt->checkIndex = 0;
4227#ifdef DEBUG_PUSH
4228 fprintf(stderr, "HPP: entering CONTENT\n");
4229#endif
4230 break;
4231 case XML_PARSER_DTD:
4232 fprintf(stderr, "HPP: internal error, state == DTD\n");
4233 ctxt->instate = XML_PARSER_CONTENT;
4234 ctxt->checkIndex = 0;
4235#ifdef DEBUG_PUSH
4236 fprintf(stderr, "HPP: entering CONTENT\n");
4237#endif
4238 break;
4239 case XML_PARSER_COMMENT:
4240 fprintf(stderr, "HPP: internal error, state == COMMENT\n");
4241 ctxt->instate = XML_PARSER_CONTENT;
4242 ctxt->checkIndex = 0;
4243#ifdef DEBUG_PUSH
4244 fprintf(stderr, "HPP: entering CONTENT\n");
4245#endif
4246 break;
4247 case XML_PARSER_PI:
4248 fprintf(stderr, "HPP: internal error, state == PI\n");
4249 ctxt->instate = XML_PARSER_CONTENT;
4250 ctxt->checkIndex = 0;
4251#ifdef DEBUG_PUSH
4252 fprintf(stderr, "HPP: entering CONTENT\n");
4253#endif
4254 break;
4255 case XML_PARSER_ENTITY_DECL:
4256 fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
4257 ctxt->instate = XML_PARSER_CONTENT;
4258 ctxt->checkIndex = 0;
4259#ifdef DEBUG_PUSH
4260 fprintf(stderr, "HPP: entering CONTENT\n");
4261#endif
4262 break;
4263 case XML_PARSER_ENTITY_VALUE:
4264 fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
4265 ctxt->instate = XML_PARSER_CONTENT;
4266 ctxt->checkIndex = 0;
4267#ifdef DEBUG_PUSH
4268 fprintf(stderr, "HPP: entering DTD\n");
4269#endif
4270 break;
4271 case XML_PARSER_ATTRIBUTE_VALUE:
4272 fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4273 ctxt->instate = XML_PARSER_START_TAG;
4274 ctxt->checkIndex = 0;
4275#ifdef DEBUG_PUSH
4276 fprintf(stderr, "HPP: entering START_TAG\n");
4277#endif
4278 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004279 case XML_PARSER_SYSTEM_LITERAL:
4280 fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4281 ctxt->instate = XML_PARSER_CONTENT;
4282 ctxt->checkIndex = 0;
4283#ifdef DEBUG_PUSH
4284 fprintf(stderr, "HPP: entering CONTENT\n");
4285#endif
4286 break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004287 }
4288 }
4289done:
Daniel Veillard365e13b2000-07-02 07:56:37 +00004290 if ((avail == 0) && (terminate)) {
4291 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00004292 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4293 /*
4294 * SAX: end of the document processing.
4295 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00004296 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00004297 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4298 ctxt->sax->endDocument(ctxt->userData);
4299 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004300 }
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004301 if ((ctxt->myDoc != NULL) &&
4302 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4303 (ctxt->instate == XML_PARSER_EPILOG))) {
4304 xmlDtdPtr dtd;
4305 dtd = xmlGetIntSubset(ctxt->myDoc);
4306 if (dtd == NULL)
4307 ctxt->myDoc->intSubset =
4308 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4309 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4310 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4311 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004312#ifdef DEBUG_PUSH
4313 fprintf(stderr, "HPP: done %d\n", ret);
4314#endif
4315 return(ret);
4316}
4317
4318/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00004319 * htmlParseTry:
4320 * @ctxt: an HTML parser context
4321 *
4322 * Try to progress on parsing
4323 *
4324 * Returns zero if no parsing was possible
4325 */
4326int
4327htmlParseTry(htmlParserCtxtPtr ctxt) {
4328 return(htmlParseTryOrFinish(ctxt, 0));
4329}
4330
4331/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004332 * htmlParseChunk:
4333 * @ctxt: an XML parser context
4334 * @chunk: an char array
4335 * @size: the size in byte of the chunk
4336 * @terminate: last chunk indicator
4337 *
4338 * Parse a Chunk of memory
4339 *
4340 * Returns zero if no error, the xmlParserErrors otherwise.
4341 */
4342int
4343htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4344 int terminate) {
4345 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4346 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4347 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4348 int cur = ctxt->input->cur - ctxt->input->base;
4349
4350 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4351 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4352 ctxt->input->cur = ctxt->input->base + cur;
4353#ifdef DEBUG_PUSH
4354 fprintf(stderr, "HPP: pushed %d\n", size);
4355#endif
4356
Daniel Veillardd0f7f742000-02-02 17:42:48 +00004357 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4358 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004359 } else if (ctxt->instate != XML_PARSER_EOF) {
4360 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
Daniel Veillard71b656e2000-01-05 14:46:17 +00004361 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004362 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004363 if (terminate) {
4364 if ((ctxt->instate != XML_PARSER_EOF) &&
4365 (ctxt->instate != XML_PARSER_EPILOG) &&
4366 (ctxt->instate != XML_PARSER_MISC)) {
4367 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4368 ctxt->sax->error(ctxt->userData,
4369 "Extra content at the end of the document\n");
4370 ctxt->wellFormed = 0;
4371 ctxt->errNo = XML_ERR_DOCUMENT_END;
4372 }
4373 if (ctxt->instate != XML_PARSER_EOF) {
4374 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4375 ctxt->sax->endDocument(ctxt->userData);
4376 }
4377 ctxt->instate = XML_PARSER_EOF;
4378 }
4379 return((xmlParserErrors) ctxt->errNo);
4380}
4381
4382/************************************************************************
4383 * *
4384 * User entry points *
4385 * *
4386 ************************************************************************/
4387
4388/**
4389 * htmlCreatePushParserCtxt :
4390 * @sax: a SAX handler
4391 * @user_data: The user data returned on SAX callbacks
4392 * @chunk: a pointer to an array of chars
4393 * @size: number of chars in the array
4394 * @filename: an optional file name or URI
4395 * @enc: an optional encoding
4396 *
4397 * Create a parser context for using the HTML parser in push mode
4398 * To allow content encoding detection, @size should be >= 4
4399 * The value of @filename is used for fetching external entities
4400 * and error/warning reports.
4401 *
4402 * Returns the new parser context or NULL
4403 */
4404htmlParserCtxtPtr
4405htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4406 const char *chunk, int size, const char *filename,
4407 xmlCharEncoding enc) {
4408 htmlParserCtxtPtr ctxt;
4409 htmlParserInputPtr inputStream;
4410 xmlParserInputBufferPtr buf;
4411
4412 buf = xmlAllocParserInputBuffer(enc);
4413 if (buf == NULL) return(NULL);
4414
4415 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4416 if (ctxt == NULL) {
4417 xmlFree(buf);
4418 return(NULL);
4419 }
4420 memset(ctxt, 0, sizeof(htmlParserCtxt));
4421 htmlInitParserCtxt(ctxt);
4422 if (sax != NULL) {
4423 if (ctxt->sax != &htmlDefaultSAXHandler)
4424 xmlFree(ctxt->sax);
4425 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4426 if (ctxt->sax == NULL) {
4427 xmlFree(buf);
4428 xmlFree(ctxt);
4429 return(NULL);
4430 }
4431 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4432 if (user_data != NULL)
4433 ctxt->userData = user_data;
4434 }
4435 if (filename == NULL) {
4436 ctxt->directory = NULL;
4437 } else {
4438 ctxt->directory = xmlParserGetDirectory(filename);
4439 }
4440
4441 inputStream = htmlNewInputStream(ctxt);
4442 if (inputStream == NULL) {
4443 xmlFreeParserCtxt(ctxt);
4444 return(NULL);
4445 }
4446
4447 if (filename == NULL)
4448 inputStream->filename = NULL;
4449 else
4450 inputStream->filename = xmlMemStrdup(filename);
4451 inputStream->buf = buf;
4452 inputStream->base = inputStream->buf->buffer->content;
4453 inputStream->cur = inputStream->buf->buffer->content;
4454
4455 inputPush(ctxt, inputStream);
4456
4457 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4458 (ctxt->input->buf != NULL)) {
4459 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4460#ifdef DEBUG_PUSH
4461 fprintf(stderr, "HPP: pushed %d\n", size);
4462#endif
4463 }
4464
4465 return(ctxt);
4466}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004467
4468/**
4469 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004470 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004471 * @encoding: a free form C string describing the HTML document encoding, or NULL
4472 * @sax: the SAX handler block
4473 * @userData: if using SAX, this pointer will be provided on callbacks.
4474 *
4475 * parse an HTML in-memory document and build a tree.
4476 * It use the given SAX function block to handle the parsing callback.
4477 * If sax is NULL, fallback to the default DOM tree building routines.
4478 *
4479 * Returns the resulting document tree
4480 */
4481
4482htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004483htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004484 htmlDocPtr ret;
4485 htmlParserCtxtPtr ctxt;
4486
4487 if (cur == NULL) return(NULL);
4488
4489
4490 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4491 if (ctxt == NULL) return(NULL);
4492 if (sax != NULL) {
4493 ctxt->sax = sax;
4494 ctxt->userData = userData;
4495 }
4496
4497 htmlParseDocument(ctxt);
4498 ret = ctxt->myDoc;
4499 if (sax != NULL) {
4500 ctxt->sax = NULL;
4501 ctxt->userData = NULL;
4502 }
4503 htmlFreeParserCtxt(ctxt);
4504
4505 return(ret);
4506}
4507
4508/**
4509 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004510 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004511 * @encoding: a free form C string describing the HTML document encoding, or NULL
4512 *
4513 * parse an HTML in-memory document and build a tree.
4514 *
4515 * Returns the resulting document tree
4516 */
4517
4518htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004519htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004520 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4521}
4522
4523
4524/**
4525 * htmlCreateFileParserCtxt :
4526 * @filename: the filename
4527 * @encoding: a free form C string describing the HTML document encoding, or NULL
4528 *
4529 * Create a parser context for a file content.
4530 * Automatic support for ZLIB/Compress compressed document is provided
4531 * by default if found at compile-time.
4532 *
4533 * Returns the new parser context or NULL
4534 */
4535htmlParserCtxtPtr
4536htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4537{
4538 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004539 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004540 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004541 /* htmlCharEncoding enc; */
4542
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004543 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4544 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004545
Daniel Veillard6454aec1999-09-02 22:04:43 +00004546 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004547 if (ctxt == NULL) {
4548 perror("malloc");
4549 return(NULL);
4550 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004551 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004552 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00004553 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004554 if (inputStream == NULL) {
4555 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00004556 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004557 return(NULL);
4558 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004559 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004560
Daniel Veillard6454aec1999-09-02 22:04:43 +00004561 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004562 inputStream->line = 1;
4563 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004564 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00004565 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004566
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004567 inputStream->base = inputStream->buf->buffer->content;
4568 inputStream->cur = inputStream->buf->buffer->content;
4569 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004570
4571 inputPush(ctxt, inputStream);
4572 return(ctxt);
4573}
4574
4575/**
4576 * htmlSAXParseFile :
4577 * @filename: the filename
4578 * @encoding: a free form C string describing the HTML document encoding, or NULL
4579 * @sax: the SAX handler block
4580 * @userData: if using SAX, this pointer will be provided on callbacks.
4581 *
4582 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4583 * compressed document is provided by default if found at compile-time.
4584 * It use the given SAX function block to handle the parsing callback.
4585 * If sax is NULL, fallback to the default DOM tree building routines.
4586 *
4587 * Returns the resulting document tree
4588 */
4589
4590htmlDocPtr
4591htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4592 void *userData) {
4593 htmlDocPtr ret;
4594 htmlParserCtxtPtr ctxt;
Daniel Veillard87b95392000-08-12 21:12:04 +00004595 htmlSAXHandlerPtr oldsax = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004596
4597 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4598 if (ctxt == NULL) return(NULL);
4599 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004600 oldsax = ctxt->sax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004601 ctxt->sax = sax;
4602 ctxt->userData = userData;
4603 }
4604
4605 htmlParseDocument(ctxt);
4606
4607 ret = ctxt->myDoc;
4608 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004609 ctxt->sax = oldsax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004610 ctxt->userData = NULL;
4611 }
4612 htmlFreeParserCtxt(ctxt);
4613
4614 return(ret);
4615}
4616
4617/**
4618 * htmlParseFile :
4619 * @filename: the filename
4620 * @encoding: a free form C string describing the HTML document encoding, or NULL
4621 *
4622 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4623 * compressed document is provided by default if found at compile-time.
4624 *
4625 * Returns the resulting document tree
4626 */
4627
4628htmlDocPtr
4629htmlParseFile(const char *filename, const char *encoding) {
4630 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4631}
Daniel Veillard361d8452000-04-03 19:48:13 +00004632
4633#endif /* LIBXML_HTML_ENABLED */