blob: 617b903fc63361d5ef3c9f5149c6138bb85633d4 [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillardb71379b2000-10-09 12:30:39 +000015#include <libxml/xmlversion.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000016#ifdef LIBXML_HTML_ENABLED
Daniel Veillardbe70ff71999-07-05 16:50:46 +000017#include <stdio.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000018#include <string.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000019#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000023#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000024#endif
25#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000026#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000027#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000028#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
Daniel Veillard361d8452000-04-03 19:48:13 +000038#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
Daniel Veillardaaf58b92000-10-06 14:07:26 +000040#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
Daniel Veillardb71379b2000-10-09 12:30:39 +000042#include <libxml/xmlerror.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000043#include <libxml/HTMLparser.h>
44#include <libxml/entities.h>
45#include <libxml/encoding.h>
46#include <libxml/valid.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000047#include <libxml/xmlIO.h>
Daniel Veillarde2d034d1999-07-27 19:52:06 +000048
49#define HTML_MAX_NAMELEN 1000
Daniel Veillard32bc74e2000-07-14 14:49:25 +000050#define HTML_PARSER_BIG_BUFFER_SIZE 1000
Daniel Veillard5e5c6231999-12-29 12:49:06 +000051#define HTML_PARSER_BUFFER_SIZE 100
Daniel Veillardbe70ff71999-07-05 16:50:46 +000052
Daniel Veillard82150d81999-07-07 07:32:15 +000053/* #define DEBUG */
Daniel Veillard5e5c6231999-12-29 12:49:06 +000054/* #define DEBUG_PUSH */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000055
56/************************************************************************
57 * *
58 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
/*
 * PUSH_AND_POP(scope, type, name):
 *
 * Generates the pair of stack accessors html<name>Push() and html<name>Pop()
 * operating on the ctxt-><name>Tab array, whose fill level is
 * ctxt-><name>Nr and whose capacity is ctxt-><name>Max. ctxt-><name>
 * always mirrors the element on top of the stack.
 *
 * html<name>Push() returns the index at which the value was stored. It
 * returns 0 when the table cannot be grown; in that case the existing table
 * and its capacity are left untouched (the reallocation result is only
 * committed on success, so nothing is leaked and the stack stays usable).
 *
 * html<name>Pop() returns the removed element, or 0 when the stack is
 * empty. Popping an empty stack leaves the fill level at 0 so that a
 * following push can never write before the start of the table.
 */
#define PUSH_AND_POP(scope, type, name)                                 \
scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) {        \
    if (ctxt->name##Nr >= ctxt->name##Max) {                            \
        type *grown;                                                    \
        grown = (type *) xmlRealloc(ctxt->name##Tab,                    \
                 2 * ctxt->name##Max * sizeof(ctxt->name##Tab[0]));     \
        if (grown == NULL) {                                            \
            fprintf(stderr, "realloc failed !\n");                      \
            return(0);                                                  \
        }                                                               \
        ctxt->name##Tab = grown;                                        \
        ctxt->name##Max *= 2;                                           \
    }                                                                   \
    ctxt->name##Tab[ctxt->name##Nr] = value;                            \
    ctxt->name = value;                                                 \
    return(ctxt->name##Nr++);                                           \
}                                                                       \
scope type html##name##Pop(htmlParserCtxtPtr ctxt) {                    \
    type ret;                                                           \
    if (ctxt->name##Nr <= 0) return(0);                                 \
    ctxt->name##Nr--;                                                   \
    if (ctxt->name##Nr > 0)                                             \
        ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1];               \
    else                                                                \
        ctxt->name = NULL;                                              \
    ret = ctxt->name##Tab[ctxt->name##Nr];                              \
    ctxt->name##Tab[ctxt->name##Nr] = 0;                                \
    return(ret);                                                        \
}
94
Daniel Veillarddbfd6411999-12-28 16:35:14 +000095PUSH_AND_POP(extern, xmlNodePtr, node)
96PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000097
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them:
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if
 *           compiled in ISO-Latin or UTF-8, and the current 16 bit value if
 *           compiled in UNICODE mode. This should be used internally by the
 *           parser only to compare to ASCII values, otherwise it would break
 *           when running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. As with CUR, it should be used
 *           only to compare against ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. As with
 *           CUR, it should be used only to compare against ASCII based
 *           substrings.
 *   SKIP(n) skips n xmlChars, and must also be used only to skip ASCII
 *           defined strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expecting UTF-8 encoding:
 *
 *   CURRENT  returns the current char value, with the full decoding of
 *            UTF-8 if we are using this mode. It returns an int.
 *   NEXT     skips to the next character, this does the proper decoding
 *            in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copies one char to *to, and increments CUR_PTR and to
 *            accordingly.
 */
125
/*
 * All the macros below expect a parser context named `ctxt' to be in scope.
 *
 * Statement-like macros are wrapped in do { ... } while (0) and expression
 * macros carry no trailing semicolon, so that each of them behaves as one
 * single statement when written as `MACRO;', including as the body of an
 * unbraced if/else/while.
 */

/* Current input byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes, counting them in ctxt->nbChars. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* Input byte located val positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current position in the input. */
#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blank characters; evaluates to the htmlSkipBlankChars() result. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

/* Move to the next character and count it in ctxt->nbChars. */
#define NEXT do { xmlNextChar(ctxt); ctxt->nbChars++; } while (0)

/* -1 while a token is pending in ctxt->token, the current byte otherwise. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur


/*
 * Advance the input by l bytes (one character), keeping the line/column
 * position up to date, clearing ctxt->token and counting one character.
 */
#define NEXTL(l)                                                        \
    do {                                                                \
        if (*(ctxt->input->cur) == '\n') {                              \
            ctxt->input->line++; ctxt->input->col = 1;                  \
        } else ctxt->input->col++;                                      \
        ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;      \
    } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Current character from the input; its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Current character from the string s; its byte length is stored in l. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append the character v, whose encoded length is l, to the buffer b at
 * index i and advance i accordingly.
 */
#define COPY_BUF(l,b,i,v)                                               \
    do {                                                                \
        if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                       \
        else (i) += xmlCopyChar((l), &(b)[(i)], (v));                   \
    } while (0)
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000173
174/**
175 * htmlCurrentChar:
176 * @ctxt: the HTML parser context
177 * @len: pointer to the length of the char read
178 *
179 * The current char value, if using UTF-8 this may actaully span multiple
180 * bytes in the input buffer. Implement the end of line normalization:
181 * 2.11 End-of-Line Handling
182 * If the encoding is unspecified, in the case we find an ISO-Latin-1
183 * char, then the encoding converter is plugged in automatically.
184 *
185 * Returns the current char value and its lenght
186 */
187
188int
189htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
190 if (ctxt->instate == XML_PARSER_EOF)
191 return(0);
192
193 if (ctxt->token != 0) {
194 *len = 0;
195 return(ctxt->token);
196 }
197 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
198 /*
199 * We are supposed to handle UTF8, check it's valid
200 * From rfc2044: encoding of the Unicode values on UTF-8:
201 *
202 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
203 * 0000 0000-0000 007F 0xxxxxxx
204 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
205 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
206 *
207 * Check for the 0x110000 limit too
208 */
209 const unsigned char *cur = ctxt->input->cur;
210 unsigned char c;
211 unsigned int val;
212
213 c = *cur;
214 if (c & 0x80) {
215 if (cur[1] == 0)
216 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
217 if ((cur[1] & 0xc0) != 0x80)
218 goto encoding_error;
219 if ((c & 0xe0) == 0xe0) {
220
221 if (cur[2] == 0)
222 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
223 if ((cur[2] & 0xc0) != 0x80)
224 goto encoding_error;
225 if ((c & 0xf0) == 0xf0) {
226 if (cur[3] == 0)
227 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
228 if (((c & 0xf8) != 0xf0) ||
229 ((cur[3] & 0xc0) != 0x80))
230 goto encoding_error;
231 /* 4-byte code */
232 *len = 4;
233 val = (cur[0] & 0x7) << 18;
234 val |= (cur[1] & 0x3f) << 12;
235 val |= (cur[2] & 0x3f) << 6;
236 val |= cur[3] & 0x3f;
237 } else {
238 /* 3-byte code */
239 *len = 3;
240 val = (cur[0] & 0xf) << 12;
241 val |= (cur[1] & 0x3f) << 6;
242 val |= cur[2] & 0x3f;
243 }
244 } else {
245 /* 2-byte code */
246 *len = 2;
247 val = (cur[0] & 0x1f) << 6;
248 val |= cur[1] & 0x3f;
249 }
250 if (!IS_CHAR(val)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +0000251 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000252 if ((ctxt->sax != NULL) &&
253 (ctxt->sax->error != NULL))
254 ctxt->sax->error(ctxt->userData,
255 "Char 0x%X out of allowed range\n", val);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000256 ctxt->wellFormed = 0;
257 ctxt->disableSAX = 1;
258 }
259 return(val);
260 } else {
261 /* 1-byte code */
262 *len = 1;
263 return((int) *ctxt->input->cur);
264 }
265 }
266 /*
267 * Assume it's a fixed lenght encoding (1) with
268 * a compatibke encoding for the ASCII set, since
269 * XML constructs only use < 128 chars
270 */
271 *len = 1;
272 if ((int) *ctxt->input->cur < 0x80)
273 return((int) *ctxt->input->cur);
274
275 /*
276 * Humm this is bad, do an automatic flow conversion
277 */
278 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
279 ctxt->charset = XML_CHAR_ENCODING_UTF8;
280 return(xmlCurrentChar(ctxt, len));
281
282encoding_error:
283 /*
284 * If we detect an UTF8 error that probably mean that the
285 * input encoding didn't get properly advertized in the
286 * declaration header. Report the error and switch the encoding
287 * to ISO-Latin-1 (if you don't like this policy, just declare the
288 * encoding !)
289 */
Daniel Veillarda2c6da92000-09-16 18:15:00 +0000290 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000291 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
292 ctxt->sax->error(ctxt->userData,
293 "Input is not proper UTF-8, indicate encoding !\n");
294 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
295 ctxt->input->cur[0], ctxt->input->cur[1],
296 ctxt->input->cur[2], ctxt->input->cur[3]);
297 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000298
299 ctxt->charset = XML_CHAR_ENCODING_8859_1;
300 *len = 1;
301 return((int) *ctxt->input->cur);
302}
303
Daniel Veillardcf461992000-03-14 18:30:20 +0000304/**
305 * htmlNextChar:
306 * @ctxt: the HTML parser context
307 *
308 * Skip to the next char input char.
309 */
310
311void
312htmlNextChar(htmlParserCtxtPtr ctxt) {
Daniel Veillard3f6f7f62000-06-30 17:58:25 +0000313 if (ctxt->instate == XML_PARSER_EOF)
314 return;
Daniel Veillardcf461992000-03-14 18:30:20 +0000315 if ((*ctxt->input->cur == 0) &&
316 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
317 xmlPopInput(ctxt);
318 } else {
319 if (*(ctxt->input->cur) == '\n') {
320 ctxt->input->line++; ctxt->input->col = 1;
321 } else ctxt->input->col++;
322 ctxt->input->cur++;
323 ctxt->nbChars++;
324 if (*ctxt->input->cur == 0)
325 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
326 }
327}
328
329/**
330 * htmlSkipBlankChars:
331 * @ctxt: the HTML parser context
332 *
333 * skip all blanks character found at that point in the input streams.
334 *
335 * Returns the number of space chars skipped
336 */
337
338int
339htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
340 int res = 0;
341
342 while (IS_BLANK(*(ctxt->input->cur))) {
343 if ((*ctxt->input->cur == 0) &&
344 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
345 xmlPopInput(ctxt);
346 } else {
347 if (*(ctxt->input->cur) == '\n') {
348 ctxt->input->line++; ctxt->input->col = 1;
349 } else ctxt->input->col++;
350 ctxt->input->cur++;
351 ctxt->nbChars++;
352 if (*ctxt->input->cur == 0)
353 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
354 }
355 res++;
356 }
357 return(res);
358}
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000359
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000360
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000361
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000362/************************************************************************
363 * *
364 * The list of HTML elements and their properties *
365 * *
366 ************************************************************************/
367
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000368/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000369 * Start Tag: 1 means the start tag can be ommited
370 * End Tag: 1 means the end tag can be ommited
371 * 2 means it's forbidden (empty elements)
372 * Depr: this element is deprecated
373 * DTD: 1 means that this element is valid only in the Loose DTD
374 * 2 means that this element is valid only in the Frameset DTD
375 *
376 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000377 */
378htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000379{ "a", 0, 0, 0, 0, 0, "anchor " },
380{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
381{ "acronym", 0, 0, 0, 0, 0, "" },
382{ "address", 0, 0, 0, 0, 0, "information on author " },
383{ "applet", 0, 0, 0, 1, 1, "java applet " },
384{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
385{ "b", 0, 0, 0, 0, 0, "bold text style" },
386{ "base", 0, 2, 1, 0, 0, "document base uri " },
387{ "basefont", 0, 2, 1, 1, 1, "base font size " },
388{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
389{ "big", 0, 0, 0, 0, 0, "large text style" },
390{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
391{ "body", 1, 1, 0, 0, 0, "document body " },
392{ "br", 0, 2, 1, 0, 0, "forced line break " },
393{ "button", 0, 0, 0, 0, 0, "push button " },
394{ "caption", 0, 0, 0, 0, 0, "table caption " },
395{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
396{ "cite", 0, 0, 0, 0, 0, "citation" },
397{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
398{ "col", 0, 2, 1, 0, 0, "table column " },
399{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
400{ "dd", 0, 1, 0, 0, 0, "definition description " },
401{ "del", 0, 0, 0, 0, 0, "deleted text " },
402{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
403{ "dir", 0, 0, 0, 1, 1, "directory list" },
404{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
405{ "dl", 0, 0, 0, 0, 0, "definition list " },
406{ "dt", 0, 1, 0, 0, 0, "definition term " },
407{ "em", 0, 0, 0, 0, 0, "emphasis" },
408{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
409{ "font", 0, 0, 0, 1, 1, "local change to font " },
410{ "form", 0, 0, 0, 0, 0, "interactive form " },
411{ "frame", 0, 2, 1, 0, 2, "subwindow " },
412{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
413{ "h1", 0, 0, 0, 0, 0, "heading " },
414{ "h2", 0, 0, 0, 0, 0, "heading " },
415{ "h3", 0, 0, 0, 0, 0, "heading " },
416{ "h4", 0, 0, 0, 0, 0, "heading " },
417{ "h5", 0, 0, 0, 0, 0, "heading " },
418{ "h6", 0, 0, 0, 0, 0, "heading " },
419{ "head", 1, 1, 0, 0, 0, "document head " },
420{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
421{ "html", 1, 1, 0, 0, 0, "document root element " },
422{ "i", 0, 0, 0, 0, 0, "italic text style" },
423{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
424{ "img", 0, 2, 1, 0, 0, "embedded image " },
425{ "input", 0, 2, 1, 0, 0, "form control " },
426{ "ins", 0, 0, 0, 0, 0, "inserted text" },
427{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
428{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
429{ "label", 0, 0, 0, 0, 0, "form field label text " },
430{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
431{ "li", 0, 1, 0, 0, 0, "list item " },
432{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
433{ "map", 0, 0, 0, 0, 0, "client-side image map " },
434{ "menu", 0, 0, 0, 1, 1, "menu list " },
435{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
436{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
437{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
438{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
439{ "ol", 0, 0, 0, 0, 0, "ordered list " },
440{ "optgroup", 0, 0, 0, 0, 0, "option group " },
441{ "option", 0, 1, 0, 0, 0, "selectable choice " },
442{ "p", 0, 1, 0, 0, 0, "paragraph " },
443{ "param", 0, 2, 1, 0, 0, "named property value " },
444{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
445{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
446{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
447{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
448{ "script", 0, 0, 0, 0, 0, "script statements " },
449{ "select", 0, 0, 0, 0, 0, "option selector " },
450{ "small", 0, 0, 0, 0, 0, "small text style" },
451{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
452{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
453{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
454{ "style", 0, 0, 0, 0, 0, "style info " },
455{ "sub", 0, 0, 0, 0, 0, "subscript" },
456{ "sup", 0, 0, 0, 0, 0, "superscript " },
457{ "table", 0, 0, 0, 0, 0, "&#160;" },
458{ "tbody", 1, 1, 0, 0, 0, "table body " },
459{ "td", 0, 1, 0, 0, 0, "table data cell" },
460{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
461{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
462{ "th", 0, 1, 0, 0, 0, "table header cell" },
463{ "thead", 0, 1, 0, 0, 0, "table header " },
464{ "title", 0, 0, 0, 0, 0, "document title " },
465{ "tr", 0, 1, 0, 0, 0, "table row " },
466{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
467{ "u", 0, 0, 0, 1, 1, "underlined text style" },
468{ "ul", 0, 0, 0, 0, 0, "unordered list " },
469{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000470};
471
/*
 * Groups of start tags implying the end of the current element: any tag of
 * a group implies the end of the current element if the type of that
 * element belongs to the same group. Each group is terminated by NULL and
 * the whole list by an additional NULL.
 */
char *htmlEquEnd[] = {
    /* list-like items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of the list */
    NULL
};
483/*
484 * acording the HTML DTD, HR should be added to the 2nd line above, as it
485 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
486 * because many documents contain rules in headings...
487 */
488
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL terminated groups: the first name of a
 * group is the start tag being opened, the names following it are the
 * elements that this start tag closes. An additional NULL ends the list.
 */
char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", NULL,
    "td",
        "th", "td", "p", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL
};
547
/*
 * NULL terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};

/*
 * Lookup index over htmlStartClose (filled by htmlInitAutoClose) and the
 * flag recording whether that index has been built already.
 */
static char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
564
565/************************************************************************
566 * *
567 * functions to handle HTML specific data *
568 * *
569 ************************************************************************/
570
571/**
572 * htmlInitAutoClose:
573 *
574 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
Daniel Veillardbc765302000-10-01 18:23:35 +0000575 * This is not reentrant. Call xmlInitParser() once before processing in
576 * case of use in multithreaded programs.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000577 */
578void
579htmlInitAutoClose(void) {
580 int index, i = 0;
581
582 if (htmlStartCloseIndexinitialized) return;
583
584 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
585 index = 0;
586 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
587 htmlStartCloseIndex[index++] = &htmlStartClose[i];
588 while (htmlStartClose[i] != NULL) i++;
589 i++;
590 }
Daniel Veillardbc765302000-10-01 18:23:35 +0000591 htmlStartCloseIndexinitialized = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000592}
593
594/**
595 * htmlTagLookup:
Daniel Veillardb656ebe2000-09-22 13:51:48 +0000596 * @tag: The tag name in lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000597 *
598 * Lookup the HTML tag in the ElementTable
599 *
600 * Returns the related htmlElemDescPtr or NULL if not found.
601 */
602htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000603htmlTagLookup(const xmlChar *tag) {
Daniel Veillard47f3f312000-08-27 22:40:15 +0000604 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000605
606 for (i = 0; i < (sizeof(html40ElementTable) /
607 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000608 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000609 return(&html40ElementTable[i]);
610 }
611 return(NULL);
612}
613
614/**
615 * htmlCheckAutoClose:
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000616 * @newtag: The new tag name
617 * @oldtag: The old tag name
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000618 *
619 * Checks wether the new tag is one of the registered valid tags for closing old.
620 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
621 *
622 * Returns 0 if no, 1 if yes.
623 */
624int
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000625htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000626 int i, index;
Daniel Veillard39c7d712000-09-10 16:14:55 +0000627 char **close = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000628
629 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
630
631 /* inefficient, but not a big deal */
632 for (index = 0; index < 100;index++) {
633 close = htmlStartCloseIndex[index];
634 if (close == NULL) return(0);
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000635 if (xmlStrEqual(BAD_CAST *close, newtag)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000636 }
637
638 i = close - htmlStartClose;
639 i++;
640 while (htmlStartClose[i] != NULL) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000641 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000642 return(1);
643 }
644 i++;
645 }
646 return(0);
647}
648
649/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000650 * htmlAutoCloseOnClose:
651 * @ctxt: an HTML parser context
652 * @newtag: The new tag name
653 *
654 * The HTmL DtD allows an ending tag to implicitely close other tags.
655 */
656void
657htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
658 htmlElemDescPtr info;
659 xmlChar *oldname;
660 int i;
661
662#ifdef DEBUG
663 fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
664 for (i = 0;i < ctxt->nameNr;i++)
665 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
666#endif
667
668 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000669 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000670 }
671 if (i < 0) return;
672
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000673 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000674 info = htmlTagLookup(ctxt->name);
675 if ((info == NULL) || (info->endTag == 1)) {
676#ifdef DEBUG
677 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
678#endif
679 } else {
680 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
681 ctxt->sax->error(ctxt->userData,
682 "Opening and ending tag mismatch: %s and %s\n",
683 newtag, ctxt->name);
684 ctxt->wellFormed = 0;
685 }
686 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
687 ctxt->sax->endElement(ctxt->userData, ctxt->name);
688 oldname = htmlnamePop(ctxt);
689 if (oldname != NULL) {
690#ifdef DEBUG
691 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
692#endif
693 xmlFree(oldname);
694 }
695 }
696}
697
698/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000699 * htmlAutoClose:
700 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000701 * @newtag: The new tag name or NULL
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000702 *
703 * The HTmL DtD allows a tag to implicitely close other tags.
704 * The list is kept in htmlStartClose array. This function is
705 * called when a new tag has been detected and generates the
706 * appropriates closes if possible/needed.
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000707 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillard365e13b2000-07-02 07:56:37 +0000708 * and we should check
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000709 */
710void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000711htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000712 xmlChar *oldname;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000713 while ((newtag != NULL) && (ctxt->name != NULL) &&
714 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000715#ifdef DEBUG
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000716 fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000717#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000718 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000719 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000720 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000721 if (oldname != NULL) {
722#ifdef DEBUG
723 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
724#endif
725 xmlFree(oldname);
726 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000727 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000728 if (newtag == NULL) {
729 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
730 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
731 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
732 }
733 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000734 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
735 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
736 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
Daniel Veillard365e13b2000-07-02 07:56:37 +0000737#ifdef DEBUG
738 fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
739#endif
740 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
741 ctxt->sax->endElement(ctxt->userData, ctxt->name);
742 oldname = htmlnamePop(ctxt);
743 if (oldname != NULL) {
744#ifdef DEBUG
745 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
746#endif
747 xmlFree(oldname);
748 }
749 }
750
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000751}
752
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000753/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000754 * htmlAutoCloseTag:
755 * @doc: the HTML document
756 * @name: The tag name
757 * @elem: the HTML element
758 *
759 * The HTmL DtD allows a tag to implicitely close other tags.
760 * The list is kept in htmlStartClose array. This function checks
761 * if the element or one of it's children would autoclose the
762 * given tag.
763 *
764 * Returns 1 if autoclose, 0 otherwise
765 */
766int
767htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
768 htmlNodePtr child;
769
770 if (elem == NULL) return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000771 if (xmlStrEqual(name, elem->name)) return(0);
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000772 if (htmlCheckAutoClose(elem->name, name)) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000773 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000774 while (child != NULL) {
775 if (htmlAutoCloseTag(doc, name, child)) return(1);
776 child = child->next;
777 }
778 return(0);
779}
780
781/**
782 * htmlIsAutoClosed:
783 * @doc: the HTML document
784 * @elem: the HTML element
785 *
786 * The HTmL DtD allows a tag to implicitely close other tags.
787 * The list is kept in htmlStartClose array. This function checks
788 * if a tag is autoclosed by one of it's child
789 *
790 * Returns 1 if autoclosed, 0 otherwise
791 */
792int
793htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
794 htmlNodePtr child;
795
796 if (elem == NULL) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000797 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000798 while (child != NULL) {
799 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
800 child = child->next;
801 }
802 return(0);
803}
804
805/**
Daniel Veillardbe803962000-06-28 23:40:59 +0000806 * htmlCheckImplied:
807 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000808 * @newtag: The new tag name
Daniel Veillardbe803962000-06-28 23:40:59 +0000809 *
810 * The HTmL DtD allows a tag to exists only implicitely
811 * called when a new tag has been detected and generates the
812 * appropriates implicit tags if missing
813 */
814void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000815htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000816 if (xmlStrEqual(newtag, BAD_CAST"html"))
Daniel Veillardbe803962000-06-28 23:40:59 +0000817 return;
818 if (ctxt->nameNr <= 0) {
819#ifdef DEBUG
820 fprintf(stderr,"Implied element html: pushed html\n");
821#endif
822 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
823 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
824 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
825 }
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000826 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
Daniel Veillardbe803962000-06-28 23:40:59 +0000827 return;
828 if (ctxt->nameNr <= 1) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000829 if ((xmlStrEqual(newtag, BAD_CAST"script")) ||
830 (xmlStrEqual(newtag, BAD_CAST"style")) ||
831 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
832 (xmlStrEqual(newtag, BAD_CAST"link")) ||
833 (xmlStrEqual(newtag, BAD_CAST"title")) ||
834 (xmlStrEqual(newtag, BAD_CAST"base"))) {
Daniel Veillardbe803962000-06-28 23:40:59 +0000835 /*
836 * dropped OBJECT ... i you put it first BODY will be
837 * assumed !
838 */
839#ifdef DEBUG
840 fprintf(stderr,"Implied element head: pushed head\n");
841#endif
842 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
843 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
844 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
845 } else {
846#ifdef DEBUG
847 fprintf(stderr,"Implied element body: pushed body\n");
848#endif
849 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
850 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
851 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
852 }
853 }
854}
855
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000856/**
857 * htmlCheckParagraph
858 * @ctxt: an HTML parser context
859 *
860 * Check whether a p element need to be implied before inserting
861 * characters in the current element.
862 *
863 * Returns 1 if a paragraph has been inserted, 0 if not and -1
864 * in case of error.
865 */
866
867int
868htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
869 const xmlChar *tag;
870 int i;
871
872 if (ctxt == NULL)
873 return(-1);
874 tag = ctxt->name;
875 if (tag == NULL) {
876 htmlAutoClose(ctxt, BAD_CAST"p");
877 htmlCheckImplied(ctxt, BAD_CAST"p");
878 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
879 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
880 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
881 return(1);
882 }
883 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000884 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000885#ifdef DEBUG
886 fprintf(stderr,"Implied element paragraph\n");
887#endif
888 htmlAutoClose(ctxt, BAD_CAST"p");
889 htmlCheckImplied(ctxt, BAD_CAST"p");
890 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
891 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
892 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
893 return(1);
894 }
895 }
896 return(0);
897}
898
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000899/************************************************************************
900 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000901 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000902 * *
903 ************************************************************************/
904
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000905
906htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000907/*
Daniel Veillard47f3f312000-08-27 22:40:15 +0000908 * the 4 absolute ones, plus apostrophe.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000909 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000910{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
911{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard47f3f312000-08-27 22:40:15 +0000912{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000913{ 60, "lt", "less-than sign, U+003C ISOnum" },
914{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000915
916/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000917 * A bunch still in the 128-255 range
918 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000919 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000920{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
921{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
922{ 162, "cent", "cent sign, U+00A2 ISOnum" },
923{ 163, "pound","pound sign, U+00A3 ISOnum" },
924{ 164, "curren","currency sign, U+00A4 ISOnum" },
925{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
926{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
927{ 167, "sect", "section sign, U+00A7 ISOnum" },
928{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
929{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
930{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
931{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
932{ 172, "not", "not sign, U+00AC ISOnum" },
933{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
934{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
935{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
936{ 176, "deg", "degree sign, U+00B0 ISOnum" },
937{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
938{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
939{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
940{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
941{ 181, "micro","micro sign, U+00B5 ISOnum" },
942{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000943{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000944{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
945{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
946{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000947{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000948{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
949{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
950{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
951{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
952{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
953{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
954{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
955{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
956{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
957{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
958{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
959{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
960{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
961{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
962{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
963{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
964{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
965{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
966{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
967{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
968{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
969{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
970{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
971{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
972{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
973{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
974{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
975{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000976{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000977{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
978{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
979{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
980{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
981{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
982{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
983{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
984{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
985{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
986{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
987{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
988{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
989{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
990{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
991{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
992{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
993{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
994{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
995{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
996{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
997{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
998{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
999{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1000{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1001{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1002{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1003{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1004{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1005{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1006{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1007{ 247, "divide","division sign, U+00F7 ISOnum" },
1008{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1009{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1010{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1011{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1012{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1013{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1014{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1015{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001016
Daniel Veillard47f3f312000-08-27 22:40:15 +00001017{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1018{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1019{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1020{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1021{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1022
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001023/*
1024 * Anything below should really be kept as entities references
1025 */
1026{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001027
Daniel Veillard47f3f312000-08-27 22:40:15 +00001028{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1029{ 732, "tilde","small tilde, U+02DC ISOdia" },
1030
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001031{ 913, "Alpha","greek capital letter alpha, U+0391" },
1032{ 914, "Beta", "greek capital letter beta, U+0392" },
1033{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1034{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1035{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1036{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1037{ 919, "Eta", "greek capital letter eta, U+0397" },
1038{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1039{ 921, "Iota", "greek capital letter iota, U+0399" },
1040{ 922, "Kappa","greek capital letter kappa, U+039A" },
1041{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1042{ 924, "Mu", "greek capital letter mu, U+039C" },
1043{ 925, "Nu", "greek capital letter nu, U+039D" },
1044{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1045{ 927, "Omicron","greek capital letter omicron, U+039F" },
1046{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1047{ 929, "Rho", "greek capital letter rho, U+03A1" },
1048{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1049{ 932, "Tau", "greek capital letter tau, U+03A4" },
1050{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1051{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1052{ 935, "Chi", "greek capital letter chi, U+03A7" },
1053{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1054{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001055
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001056{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1057{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1058{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1059{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1060{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1061{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1062{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1063{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1064{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1065{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1066{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1067{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1068{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1069{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1070{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1071{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1072{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1073{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1074{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1075{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1076{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1077{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1078{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1079{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1080{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1081{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1082{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1083{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001084
Daniel Veillard47f3f312000-08-27 22:40:15 +00001085{ 8194, "ensp", "en space, U+2002 ISOpub" },
1086{ 8195, "emsp", "em space, U+2003 ISOpub" },
1087{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1088{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1089{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1090{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1091{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1092{ 8211, "ndash","en dash, U+2013 ISOpub" },
1093{ 8212, "mdash","em dash, U+2014 ISOpub" },
1094{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1095{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1096{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1097{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1098{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1099{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1100{ 8224, "dagger","dagger, U+2020 ISOpub" },
1101{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1102
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001103{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1104{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001105
1106{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1107
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001108{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1109{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001110
1111{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1112{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1113
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001114{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1115{ 8260, "frasl","fraction slash, U+2044 NEW" },
1116
Daniel Veillard47f3f312000-08-27 22:40:15 +00001117{ 8364, "euro", "euro sign, U+20AC NEW" },
1118
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001119{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001120{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001121{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1122{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1123{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1124{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1125{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1126{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1127{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1128{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1129{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1130{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1131{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1132{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1133{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1134{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1135
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001136{ 8704, "forall","for all, U+2200 ISOtech" },
1137{ 8706, "part", "partial differential, U+2202 ISOtech" },
1138{ 8707, "exist","there exists, U+2203 ISOtech" },
1139{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1140{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1141{ 8712, "isin", "element of, U+2208 ISOtech" },
1142{ 8713, "notin","not an element of, U+2209 ISOtech" },
1143{ 8715, "ni", "contains as member, U+220B ISOtech" },
1144{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1145{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1146{ 8722, "minus","minus sign, U+2212 ISOtech" },
1147{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1148{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1149{ 8733, "prop", "proportional to, U+221D ISOtech" },
1150{ 8734, "infin","infinity, U+221E ISOtech" },
1151{ 8736, "ang", "angle, U+2220 ISOamso" },
1152{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1153{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1154{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1155{ 8746, "cup", "union = cup, U+222A ISOtech" },
1156{ 8747, "int", "integral, U+222B ISOtech" },
1157{ 8756, "there4","therefore, U+2234 ISOtech" },
1158{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1159{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1160{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1161{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1162{ 8801, "equiv","identical to, U+2261 ISOtech" },
1163{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1164{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1165{ 8834, "sub", "subset of, U+2282 ISOtech" },
1166{ 8835, "sup", "superset of, U+2283 ISOtech" },
1167{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1168{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1169{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1170{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1171{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1172{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1173{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1174{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1175{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1176{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1177{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1178{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1179{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1180{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1181
1182{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1183{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1184{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1185{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1186
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001187};
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001188
1189/************************************************************************
1190 * *
1191 * Commodity functions to handle entities *
1192 * *
1193 ************************************************************************/
1194
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The result of xmlRealloc() goes through a temporary so that, when the
 * reallocation fails, the previous buffer is still reachable: it is
 * freed before the enclosing function returns NULL instead of being
 * leaked. The macro can therefore only be used in functions returning
 * a pointer, on a buffer owned by that function.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *buffer##_tmp;                                              \
    buffer##_size *= 2;                                                 \
    buffer##_tmp = (xmlChar *)                                          \
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));            \
    if (buffer##_tmp == NULL) {                                         \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = buffer##_tmp;                                              \
}
1206
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001207/**
1208 * htmlEntityLookup:
1209 * @name: the entity name
1210 *
1211 * Lookup the given entity in EntitiesTable
1212 *
1213 * TODO: the linear scan is really ugly, an hash table is really needed.
1214 *
1215 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1216 */
1217htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001218htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001219 int i;
1220
1221 for (i = 0;i < (sizeof(html40EntitiesTable)/
1222 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001223 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001224#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001225 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001226#endif
1227 return(&html40EntitiesTable[i]);
1228 }
1229 }
1230 return(NULL);
1231}
1232
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001233/**
Daniel Veillard47f3f312000-08-27 22:40:15 +00001234 * htmlEntityValueLookup:
1235 * @value: the entity's unicode value
1236 *
1237 * Lookup the given entity in EntitiesTable
1238 *
1239 * TODO: the linear scan is really ugly, an hash table is really needed.
1240 *
1241 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1242 */
1243htmlEntityDescPtr
1244htmlEntityValueLookup(int value) {
1245 int i;
1246#ifdef DEBUG
1247 int lv = 0;
1248#endif
1249
1250 for (i = 0;i < (sizeof(html40EntitiesTable)/
1251 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard4b0755c2000-09-25 14:26:28 +00001252 if ((unsigned int) html40EntitiesTable[i].value >= value) {
1253 if ((unsigned int) html40EntitiesTable[i].value > value)
Daniel Veillard47f3f312000-08-27 22:40:15 +00001254 break;
1255#ifdef DEBUG
1256 fprintf(stderr,"Found entity %s\n", html40EntitiesTable[i].name);
1257#endif
1258 return(&html40EntitiesTable[i]);
1259 }
1260#ifdef DEBUG
1261 if (lv > html40EntitiesTable[i].value) {
1262 fprintf(stderr, "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1263 lv, html40EntitiesTable[i].value);
1264 }
1265 lv = html40EntitiesTable[i].value;
1266#endif
1267 }
1268 return(NULL);
1269}
1270
1271/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001272 * UTF8ToHtml:
1273 * @out: a pointer to an array of bytes to store the result
1274 * @outlen: the length of @out
1275 * @in: a pointer to an array of UTF-8 chars
1276 * @inlen: the length of @in
1277 *
1278 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1279 * plus HTML entities block of chars out.
1280 *
1281 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1282 * The value of @inlen after return is the number of octets consumed
1283 * as the return value is positive, else unpredictiable.
1284 * The value of @outlen after return is the number of octets consumed.
1285 */
1286int
1287UTF8ToHtml(unsigned char* out, int *outlen,
1288 const unsigned char* in, int *inlen) {
1289 const unsigned char* processed = in;
1290 const unsigned char* outend;
1291 const unsigned char* outstart = out;
1292 const unsigned char* instart = in;
1293 const unsigned char* inend;
1294 unsigned int c, d;
1295 int trailing;
1296
1297 if (in == NULL) {
1298 /*
1299 * initialization nothing to do
1300 */
1301 *outlen = 0;
1302 *inlen = 0;
1303 return(0);
1304 }
1305 inend = in + (*inlen);
1306 outend = out + (*outlen);
1307 while (in < inend) {
1308 d = *in++;
1309 if (d < 0x80) { c= d; trailing= 0; }
1310 else if (d < 0xC0) {
1311 /* trailing byte in leading position */
1312 *outlen = out - outstart;
1313 *inlen = processed - instart;
1314 return(-2);
1315 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1316 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1317 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1318 else {
1319 /* no chance for this in Ascii */
1320 *outlen = out - outstart;
1321 *inlen = processed - instart;
1322 return(-2);
1323 }
1324
1325 if (inend - in < trailing) {
1326 break;
1327 }
1328
1329 for ( ; trailing; trailing--) {
1330 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1331 break;
1332 c <<= 6;
1333 c |= d & 0x3F;
1334 }
1335
1336 /* assertion: c is a single UTF-4 value */
1337 if (c < 0x80) {
Daniel Veillarde010c172000-08-28 10:04:51 +00001338 if (out + 1 >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001339 break;
1340 *out++ = c;
1341 } else {
Daniel Veillard47f3f312000-08-27 22:40:15 +00001342 int len;
1343 htmlEntityDescPtr ent;
1344
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001345 /*
1346 * Try to lookup a predefined HTML entity for it
1347 */
1348
Daniel Veillard47f3f312000-08-27 22:40:15 +00001349 ent = htmlEntityValueLookup(c);
1350 if (ent == NULL) {
1351 /* no chance for this in Ascii */
1352 *outlen = out - outstart;
1353 *inlen = processed - instart;
1354 return(-2);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001355 }
Daniel Veillard47f3f312000-08-27 22:40:15 +00001356 len = strlen(ent->name);
Daniel Veillarde010c172000-08-28 10:04:51 +00001357 if (out + 2 + len >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001358 break;
1359 *out++ = '&';
Daniel Veillard47f3f312000-08-27 22:40:15 +00001360 memcpy(out, ent->name, len);
1361 out += len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001362 *out++ = ';';
1363 }
1364 processed = in;
1365 }
1366 *outlen = out - outstart;
1367 *inlen = processed - instart;
1368 return(0);
1369}
1370
Daniel Veillarde010c172000-08-28 10:04:51 +00001371/**
1372 * htmlEncodeEntities:
1373 * @out: a pointer to an array of bytes to store the result
1374 * @outlen: the length of @out
1375 * @in: a pointer to an array of UTF-8 chars
1376 * @inlen: the length of @in
1377 * @quoteChar: the quote character to escape (' or ") or zero.
1378 *
1379 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1380 * plus HTML entities block of chars out.
1381 *
1382 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1383 * The value of @inlen after return is the number of octets consumed
1384 * as the return value is positive, else unpredictiable.
1385 * The value of @outlen after return is the number of octets consumed.
1386 */
1387int
1388htmlEncodeEntities(unsigned char* out, int *outlen,
1389 const unsigned char* in, int *inlen, int quoteChar) {
1390 const unsigned char* processed = in;
1391 const unsigned char* outend = out + (*outlen);
1392 const unsigned char* outstart = out;
1393 const unsigned char* instart = in;
1394 const unsigned char* inend = in + (*inlen);
1395 unsigned int c, d;
1396 int trailing;
1397
1398 while (in < inend) {
1399 d = *in++;
1400 if (d < 0x80) { c= d; trailing= 0; }
1401 else if (d < 0xC0) {
1402 /* trailing byte in leading position */
1403 *outlen = out - outstart;
1404 *inlen = processed - instart;
1405 return(-2);
1406 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1407 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1408 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1409 else {
1410 /* no chance for this in Ascii */
1411 *outlen = out - outstart;
1412 *inlen = processed - instart;
1413 return(-2);
1414 }
1415
1416 if (inend - in < trailing)
1417 break;
1418
1419 while (trailing--) {
1420 if (((d= *in++) & 0xC0) != 0x80) {
1421 *outlen = out - outstart;
1422 *inlen = processed - instart;
1423 return(-2);
1424 }
1425 c <<= 6;
1426 c |= d & 0x3F;
1427 }
1428
1429 /* assertion: c is a single UTF-4 value */
1430 if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1431 if (out >= outend)
1432 break;
1433 *out++ = c;
1434 } else {
1435 htmlEntityDescPtr ent;
1436 const char *cp;
1437 char nbuf[16];
1438 int len;
1439
1440 /*
1441 * Try to lookup a predefined HTML entity for it
1442 */
1443 ent = htmlEntityValueLookup(c);
1444 if (ent == NULL) {
1445 sprintf(nbuf, "#%u", c);
1446 cp = nbuf;
1447 }
1448 else
1449 cp = ent->name;
1450 len = strlen(cp);
1451 if (out + 2 + len > outend)
1452 break;
1453 *out++ = '&';
1454 memcpy(out, cp, len);
1455 out += len;
1456 *out++ = ';';
1457 }
1458 processed = in;
1459 }
1460 *outlen = out - outstart;
1461 *inlen = processed - instart;
1462 return(0);
1463}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001464
1465/**
1466 * htmlDecodeEntities:
1467 * @ctxt: the parser context
1468 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001469 * @end: an end marker xmlChar, 0 if none
1470 * @end2: an end marker xmlChar, 0 if none
1471 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001472 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001473 * Subtitute the HTML entities by their value
1474 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001475 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001476 *
1477 * Returns A newly allocated string with the substitution done. The caller
1478 * must deallocate it !
1479 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001480xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001481htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001482 xmlChar end, xmlChar end2, xmlChar end3) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001483 xmlChar *name = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001484 xmlChar *buffer = NULL;
1485 unsigned int buffer_size = 0;
1486 unsigned int nbchars = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001487 htmlEntityDescPtr ent;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001488 unsigned int max = (unsigned int) len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001489 int c,l;
1490
1491 if (ctxt->depth > 40) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00001492 ctxt->errNo = XML_ERR_ENTITY_LOOP;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001493 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1494 ctxt->sax->error(ctxt->userData,
1495 "Detected entity reference loop\n");
1496 ctxt->wellFormed = 0;
1497 ctxt->disableSAX = 1;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001498 return(NULL);
1499 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001500
1501 /*
1502 * allocate a translation buffer.
1503 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001504 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001505 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001506 if (buffer == NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001507 perror("xmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001508 return(NULL);
1509 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001510
1511 /*
1512 * Ok loop until we reach one of the ending char or a size limit.
1513 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001514 c = CUR_CHAR(l);
1515 while ((nbchars < max) && (c != end) &&
1516 (c != end2) && (c != end3)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001517
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001518 if (c == 0) break;
1519 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1520 int val = htmlParseCharRef(ctxt);
1521 COPY_BUF(0,buffer,nbchars,val);
1522 NEXTL(l);
1523 } else if ((c == '&') && (ctxt->token != '&')) {
1524 ent = htmlParseEntityRef(ctxt, &name);
1525 if (name != NULL) {
1526 if (ent != NULL) {
1527 int val = ent->value;
1528 COPY_BUF(0,buffer,nbchars,val);
1529 NEXTL(l);
1530 } else {
1531 const xmlChar *cur = name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001532
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001533 buffer[nbchars++] = '&';
1534 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1535 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001536 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001537 while (*cur != 0) {
1538 buffer[nbchars++] = *cur++;
1539 }
1540 buffer[nbchars++] = ';';
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001541 }
1542 }
1543 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001544 COPY_BUF(l,buffer,nbchars,c);
1545 NEXTL(l);
1546 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001547 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001548 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001549 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001550 c = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001551 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001552 buffer[nbchars++] = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001553 return(buffer);
1554}
1555
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001556/************************************************************************
1557 * *
1558 * Commodity functions to handle streams *
1559 * *
1560 ************************************************************************/
1561
1562/**
1563 * htmlFreeInputStream:
1564 * @input: an htmlParserInputPtr
1565 *
1566 * Free up an input stream.
1567 */
1568void
1569htmlFreeInputStream(htmlParserInputPtr input) {
1570 if (input == NULL) return;
1571
1572 if (input->filename != NULL) xmlFree((char *) input->filename);
1573 if (input->directory != NULL) xmlFree((char *) input->directory);
1574 if ((input->free != NULL) && (input->base != NULL))
1575 input->free((xmlChar *) input->base);
1576 if (input->buf != NULL)
1577 xmlFreeParserInputBuffer(input->buf);
1578 memset(input, -1, sizeof(htmlParserInput));
1579 xmlFree(input);
1580}
1581
1582/**
1583 * htmlNewInputStream:
1584 * @ctxt: an HTML parser context
1585 *
1586 * Create a new input stream structure
1587 * Returns the new input stream or NULL
1588 */
1589htmlParserInputPtr
1590htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1591 htmlParserInputPtr input;
1592
1593 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1594 if (input == NULL) {
1595 ctxt->errNo = XML_ERR_NO_MEMORY;
1596 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1597 ctxt->sax->error(ctxt->userData,
1598 "malloc: couldn't allocate a new input stream\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001599 return(NULL);
1600 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001601 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001602 input->filename = NULL;
1603 input->directory = NULL;
1604 input->base = NULL;
1605 input->cur = NULL;
1606 input->buf = NULL;
1607 input->line = 1;
1608 input->col = 1;
1609 input->buf = NULL;
1610 input->free = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001611 input->version = NULL;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001612 input->consumed = 0;
1613 input->length = 0;
1614 return(input);
1615}
1616
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001617
1618/************************************************************************
1619 * *
1620 * Commodity functions, cleanup needed ? *
1621 * *
1622 ************************************************************************/
1623
1624/**
1625 * areBlanks:
1626 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001627 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001628 * @len: the size of @str
1629 *
1630 * Is this a sequence of blank chars that one can ignore ?
1631 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001632 * Returns 1 if ignorable 0 otherwise.
1633 */
1634
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001635static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001636 int i;
1637 xmlNodePtr lastChild;
1638
1639 for (i = 0;i < len;i++)
1640 if (!(IS_BLANK(str[i]))) return(0);
1641
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001642 if (CUR == 0) return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001643 if (CUR != '<') return(0);
Daniel Veillarde010c172000-08-28 10:04:51 +00001644 if (ctxt->name == NULL)
1645 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001646 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
Daniel Veillard4948eb42000-08-29 09:41:15 +00001647 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001648 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
Daniel Veillarde010c172000-08-28 10:04:51 +00001649 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001650 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
Daniel Veillarde010c172000-08-28 10:04:51 +00001651 return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001652 if (ctxt->node == NULL) return(0);
1653 lastChild = xmlGetLastChild(ctxt->node);
1654 if (lastChild == NULL) {
1655 if (ctxt->node->content != NULL) return(0);
1656 } else if (xmlNodeIsText(lastChild))
1657 return(0);
1658 return(1);
1659}
1660
1661/**
1662 * htmlHandleEntity:
1663 * @ctxt: an HTML parser context
1664 * @entity: an XML entity pointer.
1665 *
1666 * Default handling of an HTML entity, call the parser with the
1667 * substitution string
1668 */
1669
1670void
1671htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1672 int len;
1673
1674 if (entity->content == NULL) {
1675 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1676 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1677 entity->name);
1678 ctxt->wellFormed = 0;
1679 return;
1680 }
1681 len = xmlStrlen(entity->content);
1682
1683 /*
1684 * Just handle the content as a set of chars.
1685 */
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001686 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001687 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1688 ctxt->sax->characters(ctxt->userData, entity->content, len);
1689
1690}
1691
1692/**
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001693 * htmlNewDocNoDtD:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001694 * @URI: URI for the dtd, or NULL
1695 * @ExternalID: the external ID of the DTD, or NULL
1696 *
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001697 * Returns a new document, do not intialize the DTD if not provided
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001698 */
1699htmlDocPtr
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001700htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001701 xmlDocPtr cur;
1702
1703 /*
1704 * Allocate a new document and fill the fields.
1705 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001706 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001707 if (cur == NULL) {
1708 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1709 return(NULL);
1710 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001711 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001712
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001713 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001714 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001715 cur->intSubset = NULL;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001716 if ((ExternalID != NULL) ||
1717 (URI != NULL))
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001718 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe803962000-06-28 23:40:59 +00001719 cur->doc = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001720 cur->name = NULL;
Daniel Veillardcf461992000-03-14 18:30:20 +00001721 cur->children = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001722 cur->extSubset = NULL;
1723 cur->oldNs = NULL;
1724 cur->encoding = NULL;
1725 cur->standalone = 1;
1726 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001727 cur->ids = NULL;
1728 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001729#ifndef XML_WITHOUT_CORBA
1730 cur->_private = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001731#endif
1732 return(cur);
1733}
1734
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001735/**
1736 * htmlNewDoc:
1737 * @URI: URI for the dtd, or NULL
1738 * @ExternalID: the external ID of the DTD, or NULL
1739 *
1740 * Returns a new document
1741 */
1742htmlDocPtr
1743htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1744 if ((URI == NULL) && (ExternalID == NULL))
1745 return(htmlNewDocNoDtD(
1746 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1747 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1748
1749 return(htmlNewDocNoDtD(URI, ExternalID));
1750}
1751
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001752
1753/************************************************************************
1754 * *
1755 * The parser itself *
1756 * Relates to http://www.w3.org/TR/html40 *
1757 * *
1758 ************************************************************************/
1759
1760/************************************************************************
1761 * *
1762 * The parser itself *
1763 * *
1764 ************************************************************************/
1765
1766/**
1767 * htmlParseHTMLName:
1768 * @ctxt: an HTML parser context
1769 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001770 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001771 * since HTML names are not case-sensitive.
1772 *
1773 * Returns the Tag Name parsed or NULL
1774 */
1775
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001776xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001777htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001778 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001779 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001780 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001781
1782 if (!IS_LETTER(CUR) && (CUR != '_') &&
1783 (CUR != ':')) return(NULL);
1784
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001785 while ((i < HTML_PARSER_BUFFER_SIZE) &&
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001786 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
Daniel Veillarde8282ed2000-10-10 23:01:31 +00001787 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001788 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001789 else loc[i] = CUR;
1790 i++;
1791
1792 NEXT;
1793 }
1794
1795 ret = xmlStrndup(loc, i);
1796
1797 return(ret);
1798}
1799
1800/**
1801 * htmlParseName:
1802 * @ctxt: an HTML parser context
1803 *
1804 * parse an HTML name, this routine is case sensistive.
1805 *
1806 * Returns the Name parsed or NULL
1807 */
1808
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001809xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001810htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001811 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001812 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001813
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001814 GROW;
1815 if (!IS_LETTER(CUR) && (CUR != '_')) {
1816 return(NULL);
1817 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001818
1819 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1820 (CUR == '.') || (CUR == '-') ||
1821 (CUR == '_') || (CUR == ':') ||
1822 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001823 (IS_EXTENDER(CUR))) {
1824 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001825 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001826 if (len >= HTML_MAX_NAMELEN) {
1827 fprintf(stderr,
1828 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1829 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1830 (CUR == '.') || (CUR == '-') ||
1831 (CUR == '_') || (CUR == ':') ||
1832 (IS_COMBINING(CUR)) ||
1833 (IS_EXTENDER(CUR)))
1834 NEXT;
1835 break;
1836 }
1837 }
1838 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001839}
1840
1841/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001842 * htmlParseHTMLAttribute:
1843 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001844 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001845 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001846 * parse an HTML attribute value till the stop (quote), if
1847 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001848 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001849 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001850 */
1851
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001852xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001853htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001854 xmlChar *buffer = NULL;
1855 int buffer_size = 0;
1856 xmlChar *out = NULL;
1857 xmlChar *name = NULL;
1858
1859 xmlChar *cur = NULL;
1860 htmlEntityDescPtr ent;
1861
1862 /*
1863 * allocate a translation buffer.
1864 */
Daniel Veillard7eda8452000-10-14 23:38:43 +00001865 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard71b656e2000-01-05 14:46:17 +00001866 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1867 if (buffer == NULL) {
1868 perror("htmlParseHTMLAttribute: malloc failed");
1869 return(NULL);
1870 }
1871 out = buffer;
1872
1873 /*
1874 * Ok loop until we reach one of the ending chars
1875 */
1876 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1877 if ((stop == 0) && (IS_BLANK(CUR))) break;
1878 if (CUR == '&') {
1879 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001880 unsigned int c;
1881 int bits;
1882
1883 c = htmlParseCharRef(ctxt);
1884 if (c < 0x80)
1885 { *out++ = c; bits= -6; }
1886 else if (c < 0x800)
1887 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1888 else if (c < 0x10000)
1889 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1890 else
1891 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1892
1893 for ( ; bits >= 0; bits-= 6) {
1894 *out++ = ((c >> bits) & 0x3F) | 0x80;
1895 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001896 } else {
1897 ent = htmlParseEntityRef(ctxt, &name);
1898 if (name == NULL) {
1899 *out++ = '&';
1900 if (out - buffer > buffer_size - 100) {
1901 int index = out - buffer;
1902
1903 growBuffer(buffer);
1904 out = &buffer[index];
1905 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001906 } else if (ent == NULL) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001907 *out++ = '&';
1908 cur = name;
1909 while (*cur != 0) {
1910 if (out - buffer > buffer_size - 100) {
1911 int index = out - buffer;
1912
1913 growBuffer(buffer);
1914 out = &buffer[index];
1915 }
1916 *out++ = *cur++;
1917 }
1918 xmlFree(name);
1919 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001920 unsigned int c;
1921 int bits;
1922
Daniel Veillard71b656e2000-01-05 14:46:17 +00001923 if (out - buffer > buffer_size - 100) {
1924 int index = out - buffer;
1925
1926 growBuffer(buffer);
1927 out = &buffer[index];
1928 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001929 c = (xmlChar)ent->value;
1930 if (c < 0x80)
1931 { *out++ = c; bits= -6; }
1932 else if (c < 0x800)
1933 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1934 else if (c < 0x10000)
1935 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1936 else
1937 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1938
1939 for ( ; bits >= 0; bits-= 6) {
1940 *out++ = ((c >> bits) & 0x3F) | 0x80;
1941 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001942 xmlFree(name);
1943 }
1944 }
1945 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001946 unsigned int c;
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00001947 int bits, l;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001948
Daniel Veillard71b656e2000-01-05 14:46:17 +00001949 if (out - buffer > buffer_size - 100) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001950 int index = out - buffer;
1951
1952 growBuffer(buffer);
1953 out = &buffer[index];
1954 }
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00001955 c = CUR_CHAR(l);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001956 if (c < 0x80)
1957 { *out++ = c; bits= -6; }
1958 else if (c < 0x800)
1959 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1960 else if (c < 0x10000)
1961 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1962 else
1963 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1964
1965 for ( ; bits >= 0; bits-= 6) {
1966 *out++ = ((c >> bits) & 0x3F) | 0x80;
Daniel Veillard71b656e2000-01-05 14:46:17 +00001967 }
1968 NEXT;
1969 }
1970 }
1971 *out++ = 0;
1972 return(buffer);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001973}
1974
1975/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001976 * htmlParseNmtoken:
1977 * @ctxt: an HTML parser context
1978 *
1979 * parse an HTML Nmtoken.
1980 *
1981 * Returns the Nmtoken parsed or NULL
1982 */
1983
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001984xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001985htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001986 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001987 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001988
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001989 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001990 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1991 (CUR == '.') || (CUR == '-') ||
1992 (CUR == '_') || (CUR == ':') ||
1993 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001994 (IS_EXTENDER(CUR))) {
1995 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001996 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001997 if (len >= HTML_MAX_NAMELEN) {
1998 fprintf(stderr,
1999 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2000 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2001 (CUR == '.') || (CUR == '-') ||
2002 (CUR == '_') || (CUR == ':') ||
2003 (IS_COMBINING(CUR)) ||
2004 (IS_EXTENDER(CUR)))
2005 NEXT;
2006 break;
2007 }
2008 }
2009 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002010}
2011
2012/**
2013 * htmlParseEntityRef:
2014 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002015 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002016 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002017 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002018 *
2019 * [68] EntityRef ::= '&' Name ';'
2020 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002021 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2022 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002023 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002024htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002025htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2026 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002027 htmlEntityDescPtr ent = NULL;
2028 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002029
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002030 if (CUR == '&') {
2031 NEXT;
2032 name = htmlParseName(ctxt);
2033 if (name == NULL) {
2034 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2035 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2036 ctxt->wellFormed = 0;
2037 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002038 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002039 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002040 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002041
2042 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002043 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002044 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002045 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002046 if (ent != NULL) /* OK that's ugly !!! */
2047 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002048 } else {
2049 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2050 ctxt->sax->error(ctxt->userData,
2051 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00002052 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002053 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002054 }
2055 }
2056 return(ent);
2057}
2058
2059/**
2060 * htmlParseAttValue:
2061 * @ctxt: an HTML parser context
2062 *
2063 * parse a value for an attribute
2064 * Note: the parser won't do substitution of entities here, this
2065 * will be handled later in xmlStringGetNodeList, unless it was
2066 * asked for ctxt->replaceEntities != 0
2067 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002068 * Returns the AttValue parsed or NULL.
2069 */
2070
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002071xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002072htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002073 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002074
2075 if (CUR == '"') {
2076 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002077 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002078 if (CUR != '"') {
2079 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2080 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2081 ctxt->wellFormed = 0;
2082 } else
2083 NEXT;
2084 } else if (CUR == '\'') {
2085 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002086 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002087 if (CUR != '\'') {
2088 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2089 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2090 ctxt->wellFormed = 0;
2091 } else
2092 NEXT;
2093 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002094 /*
2095 * That's an HTMLism, the attribute value may not be quoted
2096 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002097 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002098 if (ret == NULL) {
2099 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2100 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2101 ctxt->wellFormed = 0;
2102 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002103 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002104 return(ret);
2105}
2106
2107/**
2108 * htmlParseSystemLiteral:
2109 * @ctxt: an HTML parser context
2110 *
2111 * parse an HTML Literal
2112 *
2113 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2114 *
2115 * Returns the SystemLiteral parsed or NULL
2116 */
2117
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002118xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002119htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002120 const xmlChar *q;
2121 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002122
2123 if (CUR == '"') {
2124 NEXT;
2125 q = CUR_PTR;
2126 while ((IS_CHAR(CUR)) && (CUR != '"'))
2127 NEXT;
2128 if (!IS_CHAR(CUR)) {
2129 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2130 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2131 ctxt->wellFormed = 0;
2132 } else {
2133 ret = xmlStrndup(q, CUR_PTR - q);
2134 NEXT;
2135 }
2136 } else if (CUR == '\'') {
2137 NEXT;
2138 q = CUR_PTR;
2139 while ((IS_CHAR(CUR)) && (CUR != '\''))
2140 NEXT;
2141 if (!IS_CHAR(CUR)) {
2142 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2143 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2144 ctxt->wellFormed = 0;
2145 } else {
2146 ret = xmlStrndup(q, CUR_PTR - q);
2147 NEXT;
2148 }
2149 } else {
2150 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardcf461992000-03-14 18:30:20 +00002151 ctxt->sax->error(ctxt->userData,
2152 "SystemLiteral \" or ' expected\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002153 ctxt->wellFormed = 0;
2154 }
2155
2156 return(ret);
2157}
2158
2159/**
2160 * htmlParsePubidLiteral:
2161 * @ctxt: an HTML parser context
2162 *
2163 * parse an HTML public literal
2164 *
2165 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2166 *
2167 * Returns the PubidLiteral parsed or NULL.
2168 */
2169
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002170xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002171htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002172 const xmlChar *q;
2173 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002174 /*
2175 * Name ::= (Letter | '_') (NameChar)*
2176 */
2177 if (CUR == '"') {
2178 NEXT;
2179 q = CUR_PTR;
2180 while (IS_PUBIDCHAR(CUR)) NEXT;
2181 if (CUR != '"') {
2182 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2183 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2184 ctxt->wellFormed = 0;
2185 } else {
2186 ret = xmlStrndup(q, CUR_PTR - q);
2187 NEXT;
2188 }
2189 } else if (CUR == '\'') {
2190 NEXT;
2191 q = CUR_PTR;
2192 while ((IS_LETTER(CUR)) && (CUR != '\''))
2193 NEXT;
2194 if (!IS_LETTER(CUR)) {
2195 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2196 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2197 ctxt->wellFormed = 0;
2198 } else {
2199 ret = xmlStrndup(q, CUR_PTR - q);
2200 NEXT;
2201 }
2202 } else {
2203 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2204 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2205 ctxt->wellFormed = 0;
2206 }
2207
2208 return(ret);
2209}
2210
2211/**
Daniel Veillard7eda8452000-10-14 23:38:43 +00002212 * htmlParseScript:
2213 * @ctxt: an HTML parser context
2214 *
2215 * parse the content of an HTML SCRIPT or STYLE element
2216 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2217 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2218 * http://www.w3.org/TR/html4/types.html#type-script
2219 * http://www.w3.org/TR/html4/types.html#h-6.15
2220 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2221 *
2222 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2223 * element and the value of intrinsic event attributes. User agents must
2224 * not evaluate script data as HTML markup but instead must pass it on as
2225 * data to a script engine.
2226 * NOTES:
2227 * - The content is passed like CDATA
2228 * - the attributes for style and scripting "onXXX" are also described
2229 * as CDATA but SGML allows entities references in attributes so their
2230 * processing is identical as other attributes
2231 */
2232void
2233htmlParseScript(htmlParserCtxtPtr ctxt) {
2234 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2235 int nbchar = 0;
2236 xmlChar cur;
2237
2238 SHRINK;
2239 cur = CUR;
2240 while (IS_CHAR(cur)) {
2241 if ((cur == '<') && (NXT(1) == '/')) {
2242 /*
2243 * One should break here, the specification is clear:
2244 * Authors should therefore escape "</" within the content.
2245 * Escape mechanisms are specific to each scripting or
2246 * style sheet language.
2247 */
2248 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2249 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2250 break; /* while */
2251 }
2252 buf[nbchar++] = cur;
2253 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2254 if (ctxt->sax->cdataBlock!= NULL) {
2255 /*
2256 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2257 */
2258 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2259 }
2260 nbchar = 0;
2261 }
2262 NEXT;
2263 cur = CUR;
2264 }
2265 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2266 if (ctxt->sax->cdataBlock!= NULL) {
2267 /*
2268 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2269 */
2270 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2271 }
2272 }
2273}
2274
2275
2276/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002277 * htmlParseCharData:
2278 * @ctxt: an HTML parser context
2279 * @cdata: int indicating whether we are within a CDATA section
2280 *
2281 * parse a CharData section.
2282 * if we are within a CDATA section ']]>' marks an end of section.
2283 *
2284 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2285 */
2286
2287void
2288htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002289 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2290 int nbchar = 0;
2291 int cur, l;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002292
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002293 SHRINK;
2294 cur = CUR_CHAR(l);
2295 while (((cur != '<') || (ctxt->token == '<')) &&
2296 ((cur != '&') || (ctxt->token == '&')) &&
2297 (IS_CHAR(cur))) {
2298 COPY_BUF(l,buf,nbchar,cur);
2299 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2300 /*
2301 * Ok the segment is to be consumed as chars.
2302 */
2303 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2304 if (areBlanks(ctxt, buf, nbchar)) {
2305 if (ctxt->sax->ignorableWhitespace != NULL)
2306 ctxt->sax->ignorableWhitespace(ctxt->userData,
2307 buf, nbchar);
2308 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002309 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002310 if (ctxt->sax->characters != NULL)
2311 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2312 }
2313 }
2314 nbchar = 0;
2315 }
2316 NEXTL(l);
2317 cur = CUR_CHAR(l);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002318 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002319 if (nbchar != 0) {
2320 /*
2321 * Ok the segment is to be consumed as chars.
2322 */
2323 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2324 if (areBlanks(ctxt, buf, nbchar)) {
2325 if (ctxt->sax->ignorableWhitespace != NULL)
2326 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2327 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002328 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002329 if (ctxt->sax->characters != NULL)
2330 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002331 }
2332 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002333 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002334}
2335
2336/**
2337 * htmlParseExternalID:
2338 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002339 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002340 * @strict: indicate whether we should restrict parsing to only
2341 * production [75], see NOTE below
2342 *
2343 * Parse an External ID or a Public ID
2344 *
2345 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2346 * 'PUBLIC' S PubidLiteral S SystemLiteral
2347 *
2348 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2349 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2350 *
2351 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2352 *
2353 * Returns the function returns SystemLiteral and in the second
2354 * case publicID receives PubidLiteral, is strict is off
2355 * it is possible to return NULL and have publicID set.
2356 */
2357
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002358xmlChar *
2359htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2360 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002361
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002362 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2363 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2364 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002365 SKIP(6);
2366 if (!IS_BLANK(CUR)) {
2367 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2368 ctxt->sax->error(ctxt->userData,
2369 "Space required after 'SYSTEM'\n");
2370 ctxt->wellFormed = 0;
2371 }
2372 SKIP_BLANKS;
2373 URI = htmlParseSystemLiteral(ctxt);
2374 if (URI == NULL) {
2375 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2376 ctxt->sax->error(ctxt->userData,
2377 "htmlParseExternalID: SYSTEM, no URI\n");
2378 ctxt->wellFormed = 0;
2379 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002380 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2381 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2382 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002383 SKIP(6);
2384 if (!IS_BLANK(CUR)) {
2385 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2386 ctxt->sax->error(ctxt->userData,
2387 "Space required after 'PUBLIC'\n");
2388 ctxt->wellFormed = 0;
2389 }
2390 SKIP_BLANKS;
2391 *publicID = htmlParsePubidLiteral(ctxt);
2392 if (*publicID == NULL) {
2393 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2394 ctxt->sax->error(ctxt->userData,
2395 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2396 ctxt->wellFormed = 0;
2397 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002398 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002399 if ((CUR == '"') || (CUR == '\'')) {
2400 URI = htmlParseSystemLiteral(ctxt);
2401 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002402 }
2403 return(URI);
2404}
2405
2406/**
2407 * htmlParseComment:
2408 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002409 *
2410 * Parse an XML (SGML) comment <!-- .... -->
2411 *
2412 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2413 */
2414void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002415htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002416 xmlChar *buf = NULL;
Daniel Veillard87b95392000-08-12 21:12:04 +00002417 int len;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002418 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard87b95392000-08-12 21:12:04 +00002419 int q, ql;
2420 int r, rl;
2421 int cur, l;
2422 xmlParserInputState state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002423
2424 /*
2425 * Check that there is a comment right here.
2426 */
Daniel Veillard87b95392000-08-12 21:12:04 +00002427 if ((RAW != '<') || (NXT(1) != '!') ||
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002428 (NXT(2) != '-') || (NXT(3) != '-')) return;
2429
Daniel Veillard87b95392000-08-12 21:12:04 +00002430 state = ctxt->instate;
2431 ctxt->instate = XML_PARSER_COMMENT;
2432 SHRINK;
2433 SKIP(4);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002434 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2435 if (buf == NULL) {
2436 fprintf(stderr, "malloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002437 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002438 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002439 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002440 q = CUR_CHAR(ql);
2441 NEXTL(ql);
2442 r = CUR_CHAR(rl);
2443 NEXTL(rl);
2444 cur = CUR_CHAR(l);
2445 len = 0;
2446 while (IS_CHAR(cur) &&
2447 ((cur != '>') ||
2448 (r != '-') || (q != '-'))) {
2449 if (len + 5 >= size) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002450 size *= 2;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002451 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002452 if (buf == NULL) {
2453 fprintf(stderr, "realloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002454 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002455 return;
2456 }
2457 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002458 COPY_BUF(ql,buf,len,q);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002459 q = r;
Daniel Veillard87b95392000-08-12 21:12:04 +00002460 ql = rl;
2461 r = cur;
2462 rl = l;
2463 NEXTL(l);
2464 cur = CUR_CHAR(l);
2465 if (cur == 0) {
2466 SHRINK;
2467 GROW;
2468 cur = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002469 }
2470 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002471 buf[len] = 0;
2472 if (!IS_CHAR(cur)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00002473 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
Daniel Veillard87b95392000-08-12 21:12:04 +00002474 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2475 ctxt->sax->error(ctxt->userData,
2476 "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillard87b95392000-08-12 21:12:04 +00002477 ctxt->wellFormed = 0;
2478 xmlFree(buf);
2479 } else {
2480 NEXT;
2481 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2482 (!ctxt->disableSAX))
2483 ctxt->sax->comment(ctxt->userData, buf);
2484 xmlFree(buf);
2485 }
2486 ctxt->instate = state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002487}
2488
2489/**
2490 * htmlParseCharRef:
2491 * @ctxt: an HTML parser context
2492 *
2493 * parse Reference declarations
2494 *
2495 * [66] CharRef ::= '&#' [0-9]+ ';' |
2496 * '&#x' [0-9a-fA-F]+ ';'
2497 *
2498 * Returns the value parsed (as an int)
2499 */
2500int
2501htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2502 int val = 0;
2503
2504 if ((CUR == '&') && (NXT(1) == '#') &&
2505 (NXT(2) == 'x')) {
2506 SKIP(3);
2507 while (CUR != ';') {
2508 if ((CUR >= '0') && (CUR <= '9'))
2509 val = val * 16 + (CUR - '0');
2510 else if ((CUR >= 'a') && (CUR <= 'f'))
2511 val = val * 16 + (CUR - 'a') + 10;
2512 else if ((CUR >= 'A') && (CUR <= 'F'))
2513 val = val * 16 + (CUR - 'A') + 10;
2514 else {
2515 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2516 ctxt->sax->error(ctxt->userData,
2517 "htmlParseCharRef: invalid hexadecimal value\n");
2518 ctxt->wellFormed = 0;
2519 val = 0;
2520 break;
2521 }
2522 NEXT;
2523 }
2524 if (CUR == ';')
2525 NEXT;
2526 } else if ((CUR == '&') && (NXT(1) == '#')) {
2527 SKIP(2);
2528 while (CUR != ';') {
2529 if ((CUR >= '0') && (CUR <= '9'))
2530 val = val * 10 + (CUR - '0');
2531 else {
2532 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2533 ctxt->sax->error(ctxt->userData,
2534 "htmlParseCharRef: invalid decimal value\n");
2535 ctxt->wellFormed = 0;
2536 val = 0;
2537 break;
2538 }
2539 NEXT;
2540 }
2541 if (CUR == ';')
2542 NEXT;
2543 } else {
2544 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2545 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2546 ctxt->wellFormed = 0;
2547 }
2548 /*
2549 * Check the value IS_CHAR ...
2550 */
2551 if (IS_CHAR(val)) {
2552 return(val);
2553 } else {
2554 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002555 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002556 val);
2557 ctxt->wellFormed = 0;
2558 }
2559 return(0);
2560}
2561
2562
2563/**
2564 * htmlParseDocTypeDecl :
2565 * @ctxt: an HTML parser context
2566 *
2567 * parse a DOCTYPE declaration
2568 *
2569 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2570 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2571 */
2572
2573void
2574htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002575 xmlChar *name;
2576 xmlChar *ExternalID = NULL;
2577 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002578
2579 /*
2580 * We know that '<!DOCTYPE' has been detected.
2581 */
2582 SKIP(9);
2583
2584 SKIP_BLANKS;
2585
2586 /*
2587 * Parse the DOCTYPE name.
2588 */
2589 name = htmlParseName(ctxt);
2590 if (name == NULL) {
2591 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2592 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2593 ctxt->wellFormed = 0;
2594 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002595 /*
2596 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2597 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002598
2599 SKIP_BLANKS;
2600
2601 /*
2602 * Check for SystemID and ExternalID
2603 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002604 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002605 SKIP_BLANKS;
2606
2607 /*
2608 * We should be at the end of the DOCTYPE declaration.
2609 */
2610 if (CUR != '>') {
2611 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2612 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2613 ctxt->wellFormed = 0;
2614 /* We shouldn't try to resynchronize ... */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002615 }
2616 NEXT;
2617
2618 /*
Daniel Veillardd83eb822000-06-30 18:39:56 +00002619 * Create or update the document accordingly to the DOCTYPE
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002620 */
Daniel Veillardd83eb822000-06-30 18:39:56 +00002621 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2622 (!ctxt->disableSAX))
2623 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002624
2625 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002626 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002627 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002628 if (URI != NULL) xmlFree(URI);
2629 if (ExternalID != NULL) xmlFree(ExternalID);
2630 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002631}
2632
2633/**
2634 * htmlParseAttribute:
2635 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002636 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002637 *
2638 * parse an attribute
2639 *
2640 * [41] Attribute ::= Name Eq AttValue
2641 *
2642 * [25] Eq ::= S? '=' S?
2643 *
2644 * With namespace:
2645 *
2646 * [NS 11] Attribute ::= QName Eq AttValue
2647 *
2648 * Also the case QName == xmlns:??? is handled independently as a namespace
2649 * definition.
2650 *
2651 * Returns the attribute name, and the value in *value.
2652 */
2653
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002654xmlChar *
2655htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002656 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002657
2658 *value = NULL;
Daniel Veillard970112a2000-10-03 09:33:21 +00002659 name = htmlParseHTMLName(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002660 if (name == NULL) {
2661 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2662 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2663 ctxt->wellFormed = 0;
2664 return(NULL);
2665 }
2666
2667 /*
2668 * read the value
2669 */
2670 SKIP_BLANKS;
2671 if (CUR == '=') {
2672 NEXT;
2673 SKIP_BLANKS;
2674 val = htmlParseAttValue(ctxt);
Daniel Veillardbe803962000-06-28 23:40:59 +00002675 /******
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002676 } else {
Daniel Veillardbe803962000-06-28 23:40:59 +00002677 * TODO : some attribute must have values, some may not
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002678 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002679 ctxt->sax->warning(ctxt->userData,
Daniel Veillardbe803962000-06-28 23:40:59 +00002680 "No value for attribute %s\n", name); */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002681 }
2682
2683 *value = val;
2684 return(name);
2685}
2686
2687/**
Daniel Veillard365e13b2000-07-02 07:56:37 +00002688 * htmlCheckEncoding:
2689 * @ctxt: an HTML parser context
2690 * @attvalue: the attribute value
2691 *
2692 * Checks an http-equiv attribute from a Meta tag to detect
2693 * the encoding
2694 * If a new encoding is detected the parser is switched to decode
2695 * it and pass UTF8
2696 */
2697void
2698htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2699 const xmlChar *encoding;
2700
2701 if ((ctxt == NULL) || (attvalue == NULL))
2702 return;
2703
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002704 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
Daniel Veillard365e13b2000-07-02 07:56:37 +00002705 if (encoding != NULL) {
2706 encoding += 8;
2707 } else {
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002708 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
Daniel Veillard365e13b2000-07-02 07:56:37 +00002709 if (encoding != NULL)
2710 encoding += 9;
2711 }
2712 if (encoding != NULL) {
2713 xmlCharEncoding enc;
2714 xmlCharEncodingHandlerPtr handler;
2715
2716 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2717
2718 if (ctxt->input->encoding != NULL)
2719 xmlFree((xmlChar *) ctxt->input->encoding);
2720 ctxt->input->encoding = xmlStrdup(encoding);
2721
2722 enc = xmlParseCharEncoding((const char *) encoding);
2723 /*
2724 * registered set of known encodings
2725 */
2726 if (enc != XML_CHAR_ENCODING_ERROR) {
2727 xmlSwitchEncoding(ctxt, enc);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002728 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002729 } else {
2730 /*
2731 * fallback for unknown encodings
2732 */
2733 handler = xmlFindCharEncodingHandler((const char *) encoding);
2734 if (handler != NULL) {
2735 xmlSwitchToEncoding(ctxt, handler);
Daniel Veillard87b95392000-08-12 21:12:04 +00002736 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002737 } else {
2738 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2739 }
2740 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002741
2742 if ((ctxt->input->buf != NULL) &&
2743 (ctxt->input->buf->encoder != NULL) &&
2744 (ctxt->input->buf->raw != NULL) &&
2745 (ctxt->input->buf->buffer != NULL)) {
2746 int nbchars;
2747 int processed;
2748
2749 /*
2750 * convert as much as possible to the parser reading buffer.
2751 */
2752 processed = ctxt->input->cur - ctxt->input->base;
2753 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2754 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2755 ctxt->input->buf->buffer,
2756 ctxt->input->buf->raw);
2757 if (nbchars < 0) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00002758 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard87b95392000-08-12 21:12:04 +00002759 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2760 ctxt->sax->error(ctxt->userData,
2761 "htmlCheckEncoding: encoder error\n");
Daniel Veillard87b95392000-08-12 21:12:04 +00002762 }
2763 ctxt->input->base =
2764 ctxt->input->cur = ctxt->input->buf->buffer->content;
2765 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002766 }
2767}
2768
2769/**
2770 * htmlCheckMeta:
2771 * @ctxt: an HTML parser context
2772 * @atts: the attributes values
2773 *
2774 * Checks an attributes from a Meta tag
2775 */
2776void
2777htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2778 int i;
2779 const xmlChar *att, *value;
2780 int http = 0;
2781 const xmlChar *content = NULL;
2782
2783 if ((ctxt == NULL) || (atts == NULL))
2784 return;
2785
2786 i = 0;
2787 att = atts[i++];
2788 while (att != NULL) {
2789 value = atts[i++];
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002790 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2791 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002792 http = 1;
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002793 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002794 content = value;
2795 att = atts[i++];
2796 }
2797 if ((http) && (content != NULL))
2798 htmlCheckEncoding(ctxt, content);
2799
2800}
2801
2802/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002803 * htmlParseStartTag:
2804 * @ctxt: an HTML parser context
2805 *
2806 * parse a start of tag either for rule element or
2807 * EmptyElement. In both case we don't parse the tag closing chars.
2808 *
2809 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2810 *
2811 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2812 *
2813 * With namespace:
2814 *
2815 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2816 *
2817 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2818 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002819 */
2820
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002821void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002822htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002823 xmlChar *name;
2824 xmlChar *attname;
2825 xmlChar *attvalue;
2826 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002827 int nbatts = 0;
2828 int maxatts = 0;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002829 int meta = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002830 int i;
2831
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002832 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002833 NEXT;
2834
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002835 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002836 name = htmlParseHTMLName(ctxt);
2837 if (name == NULL) {
2838 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2839 ctxt->sax->error(ctxt->userData,
2840 "htmlParseStartTag: invalid element name\n");
2841 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002842 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002843 }
Daniel Veillard8b5dd832000-10-01 20:28:44 +00002844 if (xmlStrEqual(name, BAD_CAST"meta"))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002845 meta = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002846
2847 /*
2848 * Check for auto-closure of HTML elements.
2849 */
2850 htmlAutoClose(ctxt, name);
2851
2852 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00002853 * Check for implied HTML elements.
2854 */
2855 htmlCheckImplied(ctxt, name);
2856
2857 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002858 * Now parse the attributes, it ends up with the ending
2859 *
2860 * (S Attribute)* S?
2861 */
2862 SKIP_BLANKS;
2863 while ((IS_CHAR(CUR)) &&
2864 (CUR != '>') &&
2865 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002866 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002867
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002868 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002869 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002870 if (attname != NULL) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00002871
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002872 /*
2873 * Well formedness requires at most one declaration of an attribute
2874 */
2875 for (i = 0; i < nbatts;i += 2) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00002876 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002877 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002878 ctxt->sax->error(ctxt->userData,
2879 "Attribute %s redefined\n",
2880 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002881 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002882 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002883 if (attvalue != NULL)
2884 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002885 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002886 }
2887 }
2888
2889 /*
2890 * Add the pair to atts
2891 */
2892 if (atts == NULL) {
2893 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002894 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002895 if (atts == NULL) {
2896 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002897 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002898 if (name != NULL) xmlFree(name);
2899 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002900 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002901 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002902 maxatts *= 2;
Daniel Veillard4b0755c2000-09-25 14:26:28 +00002903 atts = (const xmlChar **) xmlRealloc((void *) atts,
2904 maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002905 if (atts == NULL) {
2906 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002907 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002908 if (name != NULL) xmlFree(name);
2909 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002910 }
2911 }
2912 atts[nbatts++] = attname;
2913 atts[nbatts++] = attvalue;
2914 atts[nbatts] = NULL;
2915 atts[nbatts + 1] = NULL;
2916 }
2917
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002918failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002919 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002920 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002921 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2922 ctxt->sax->error(ctxt->userData,
2923 "htmlParseStartTag: problem parsing attributes\n");
2924 ctxt->wellFormed = 0;
2925 break;
2926 }
2927 }
2928
2929 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00002930 * Handle specific association to the META tag
2931 */
2932 if (meta)
2933 htmlCheckMeta(ctxt, atts);
2934
2935 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002936 * SAX: Start of Element !
2937 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002938 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002939#ifdef DEBUG
2940 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2941#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002942 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2943 ctxt->sax->startElement(ctxt->userData, name, atts);
2944
2945 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002946 for (i = 0;i < nbatts;i++) {
2947 if (atts[i] != NULL)
2948 xmlFree((xmlChar *) atts[i]);
2949 }
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00002950 xmlFree((void *) atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002951 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002952 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002953}
2954
2955/**
2956 * htmlParseEndTag:
2957 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002958 *
2959 * parse an end of tag
2960 *
2961 * [42] ETag ::= '</' Name S? '>'
2962 *
2963 * With namespace
2964 *
2965 * [NS 9] ETag ::= '</' QName S? '>'
2966 */
2967
2968void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002969htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002970 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002971 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002972 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002973
2974 if ((CUR != '<') || (NXT(1) != '/')) {
2975 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2976 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2977 ctxt->wellFormed = 0;
2978 return;
2979 }
2980 SKIP(2);
2981
2982 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002983 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002984
2985 /*
2986 * We should definitely be at the ending "S? '>'" part
2987 */
2988 SKIP_BLANKS;
2989 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2990 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2991 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2992 ctxt->wellFormed = 0;
2993 } else
2994 NEXT;
2995
2996 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002997 * If the name read is not one of the element in the parsing stack
2998 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002999 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003000 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003001 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003002 }
3003 if (i < 0) {
3004 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003005 ctxt->sax->error(ctxt->userData,
3006 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003007 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003008 ctxt->wellFormed = 0;
3009 return;
3010 }
3011
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003012
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003013 /*
3014 * Check for auto-closure of HTML elements.
3015 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003016
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003017 htmlAutoCloseOnClose(ctxt, name);
3018
3019 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003020 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003021 * With the exception that the autoclose may have popped stuff out
3022 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003023 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003024 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003025#ifdef DEBUG
3026 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
3027#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003028 if ((ctxt->name != NULL) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003029 (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003030 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3031 ctxt->sax->error(ctxt->userData,
3032 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003033 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003034 ctxt->wellFormed = 0;
3035 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003036 }
3037
3038 /*
3039 * SAX: End of Tag
3040 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003041 oldname = ctxt->name;
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003042 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003043 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3044 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003045 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003046 if (oldname != NULL) {
3047#ifdef DEBUG
3048 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
3049#endif
3050 xmlFree(oldname);
3051#ifdef DEBUG
3052 } else {
3053 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
3054#endif
3055 }
3056 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003057
3058 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00003059 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003060
3061 return;
3062}
3063
3064
3065/**
3066 * htmlParseReference:
3067 * @ctxt: an HTML parser context
3068 *
3069 * parse and handle entity references in content,
3070 * this will end-up in a call to character() since this is either a
3071 * CharRef, or a predefined entity.
3072 */
3073void
3074htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003075 htmlEntityDescPtr ent;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003076 xmlChar out[6];
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003077 xmlChar *name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003078 if (CUR != '&') return;
3079
3080 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003081 unsigned int c;
3082 int bits, i = 0;
3083
3084 c = htmlParseCharRef(ctxt);
3085 if (c < 0x80) { out[i++]= c; bits= -6; }
3086 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3087 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3088 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3089
3090 for ( ; bits >= 0; bits-= 6) {
3091 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3092 }
3093 out[i] = 0;
3094
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003095 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003096 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003097 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003098 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003099 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003100 if (name == NULL) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003101 htmlCheckParagraph(ctxt);
Daniel Veillard1255ab72000-08-14 15:13:33 +00003102 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3103 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003104 return;
3105 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003106 if ((ent == NULL) || (ent->value <= 0)) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003107 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003108 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00003109 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003110 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00003111 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003112 }
3113 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003114 unsigned int c;
3115 int bits, i = 0;
3116
3117 c = ent->value;
3118 if (c < 0x80)
3119 { out[i++]= c; bits= -6; }
3120 else if (c < 0x800)
3121 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3122 else if (c < 0x10000)
3123 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3124 else
3125 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3126
3127 for ( ; bits >= 0; bits-= 6) {
3128 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3129 }
3130 out[i] = 0;
3131
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003132 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003133 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003134 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003135 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00003136 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003137 }
3138}
3139
3140/**
3141 * htmlParseContent:
3142 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003143 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003144 *
3145 * Parse a content: comment, sub-element, reference or text.
3146 *
3147 */
3148
3149void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003150htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003151 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003152 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003153
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003154 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003155 depth = ctxt->nameNr;
3156 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003157 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003158
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003159 GROW;
3160 /*
3161 * Our tag or one of it's parent or children is ending.
3162 */
3163 if ((CUR == '<') && (NXT(1) == '/')) {
3164 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003165 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003166 return;
3167 }
3168
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003169 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003170 * Has this node been popped out during parsing of
3171 * the next element
3172 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003173 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003174 (depth >= ctxt->nameNr)) {
3175 if (currentNode != NULL) xmlFree(currentNode);
3176 return;
3177 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003178
Daniel Veillard7eda8452000-10-14 23:38:43 +00003179 if ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3180 (xmlStrEqual(currentNode, BAD_CAST"style"))) {
3181 /*
3182 * Handle SCRIPT/STYLE separately
3183 */
3184 htmlParseScript(ctxt);
3185 } else {
3186 /*
3187 * Sometimes DOCTYPE arrives in the middle of the document
3188 */
3189 if ((CUR == '<') && (NXT(1) == '!') &&
3190 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3191 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3192 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3193 (UPP(8) == 'E')) {
Daniel Veillard35008381999-10-25 13:15:52 +00003194 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3195 ctxt->sax->error(ctxt->userData,
Daniel Veillard7eda8452000-10-14 23:38:43 +00003196 "Misplaced DOCTYPE declaration\n");
Daniel Veillard35008381999-10-25 13:15:52 +00003197 ctxt->wellFormed = 0;
Daniel Veillard7eda8452000-10-14 23:38:43 +00003198 htmlParseDocTypeDecl(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003199 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003200
Daniel Veillard7eda8452000-10-14 23:38:43 +00003201 /*
3202 * First case : a comment
3203 */
3204 if ((CUR == '<') && (NXT(1) == '!') &&
3205 (NXT(2) == '-') && (NXT(3) == '-')) {
3206 htmlParseComment(ctxt);
3207 }
3208
3209 /*
3210 * Second case : a sub-element.
3211 */
3212 else if (CUR == '<') {
3213 htmlParseElement(ctxt);
3214 }
3215
3216 /*
3217 * Third case : a reference. If if has not been resolved,
3218 * parsing returns it's Name, create the node
3219 */
3220 else if (CUR == '&') {
3221 htmlParseReference(ctxt);
3222 }
3223
3224 /*
3225 * Fourth : end of the resource
3226 */
3227 else if (CUR == 0) {
3228 htmlAutoClose(ctxt, NULL);
3229 }
3230
3231 /*
3232 * Last case, text. Note that References are handled directly.
3233 */
3234 else {
3235 htmlParseCharData(ctxt, 0);
3236 }
3237
3238 if (cons == ctxt->nbChars) {
3239 if (ctxt->node != NULL) {
3240 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3241 ctxt->sax->error(ctxt->userData,
3242 "detected an error in element content\n");
3243 ctxt->wellFormed = 0;
3244 }
3245 break;
3246 }
3247 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003248 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003249 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003250 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003251}
3252
3253/**
3254 * htmlParseElement:
3255 * @ctxt: an HTML parser context
3256 *
3257 * parse an HTML element, this is highly recursive
3258 *
3259 * [39] element ::= EmptyElemTag | STag content ETag
3260 *
3261 * [41] Attribute ::= Name Eq AttValue
3262 */
3263
3264void
3265htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003266 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00003267 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003268 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003269 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003270 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003271 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003272
3273 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003274 if (ctxt->record_info) {
3275 node_info.begin_pos = ctxt->input->consumed +
3276 (CUR_PTR - ctxt->input->base);
3277 node_info.begin_line = ctxt->input->line;
3278 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003279
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003280 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003281 htmlParseStartTag(ctxt);
3282 name = ctxt->name;
3283#ifdef DEBUG
3284 if (oldname == NULL)
3285 fprintf(stderr, "Start of element %s\n", name);
3286 else if (name == NULL)
3287 fprintf(stderr, "Start of element failed, was %s\n", oldname);
3288 else
3289 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
3290#endif
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003291 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003292 (name == NULL)) {
3293 if (CUR == '>')
3294 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003295 if (oldname != NULL)
3296 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003297 return;
3298 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003299 if (oldname != NULL)
3300 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003301
3302 /*
3303 * Lookup the info for that element.
3304 */
3305 info = htmlTagLookup(name);
3306 if (info == NULL) {
3307 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3308 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3309 name);
3310 ctxt->wellFormed = 0;
3311 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003312/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003313 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3314 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3315 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003316 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003317 }
3318
3319 /*
3320 * Check for an Empty Element labelled the XML/SGML way
3321 */
3322 if ((CUR == '/') && (NXT(1) == '>')) {
3323 SKIP(2);
3324 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3325 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003326 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003327#ifdef DEBUG
3328 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
3329#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003330 if (oldname != NULL)
3331 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003332 return;
3333 }
3334
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003335 if (CUR == '>') {
3336 NEXT;
3337 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003338 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard87b95392000-08-12 21:12:04 +00003339 ctxt->sax->error(ctxt->userData,
3340 "Couldn't find end of Start Tag %s\n",
3341 name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003342 ctxt->wellFormed = 0;
3343
3344 /*
3345 * end of parsing of this node.
3346 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003347 if (xmlStrEqual(name, ctxt->name)) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003348 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003349 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003350#ifdef DEBUG
3351 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
3352#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003353 if (oldname != NULL)
3354 xmlFree(oldname);
3355 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003356
3357 /*
3358 * Capture end position and add node
3359 */
3360 if ( currentNode != NULL && ctxt->record_info ) {
3361 node_info.end_pos = ctxt->input->consumed +
3362 (CUR_PTR - ctxt->input->base);
3363 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003364 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003365 xmlParserAddNodeInfo(ctxt, &node_info);
3366 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003367 return;
3368 }
3369
3370 /*
3371 * Check for an Empty Element from DTD definition
3372 */
3373 if ((info != NULL) && (info->empty)) {
3374 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3375 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003376 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003377#ifdef DEBUG
3378 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3379#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003380 if (oldname != NULL)
3381 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003382 return;
3383 }
3384
3385 /*
3386 * Parse the content of the element:
3387 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003388 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003389 depth = ctxt->nameNr;
3390 while (IS_CHAR(CUR)) {
3391 htmlParseContent(ctxt);
3392 if (ctxt->nameNr < depth) break;
3393 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003394
3395 if (!IS_CHAR(CUR)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003396 /************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003397 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3398 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003399 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003400 ctxt->wellFormed = 0;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003401 *************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003402
3403 /*
3404 * end of parsing of this node.
3405 */
3406 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003407 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003408#ifdef DEBUG
3409 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
3410#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003411 if (oldname != NULL)
3412 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003413 if (currentNode != NULL)
3414 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003415 return;
3416 }
3417
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003418 /*
3419 * Capture end position and add node
3420 */
3421 if ( currentNode != NULL && ctxt->record_info ) {
3422 node_info.end_pos = ctxt->input->consumed +
3423 (CUR_PTR - ctxt->input->base);
3424 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003425 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003426 xmlParserAddNodeInfo(ctxt, &node_info);
3427 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003428 if (currentNode != NULL)
3429 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003430}
3431
3432/**
3433 * htmlParseDocument :
3434 * @ctxt: an HTML parser context
3435 *
3436 * parse an HTML document (and build a tree if using the standard SAX
3437 * interface).
3438 *
3439 * Returns 0, -1 in case of error. the parser context is augmented
3440 * as a result of the parsing.
3441 */
3442
3443int
3444htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003445 xmlDtdPtr dtd;
3446
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003447 htmlDefaultSAXHandlerInit();
3448 ctxt->html = 1;
3449
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003450 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003451 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00003452 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003453 */
3454 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3455 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3456
3457 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003458 * Wipe out everything which is before the first '<'
3459 */
Daniel Veillard35008381999-10-25 13:15:52 +00003460 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003461 if (CUR == 0) {
3462 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3463 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3464 ctxt->wellFormed = 0;
3465 }
3466
Daniel Veillardbe803962000-06-28 23:40:59 +00003467 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3468 ctxt->sax->startDocument(ctxt->userData);
3469
3470
Daniel Veillard35008381999-10-25 13:15:52 +00003471 /*
3472 * Parse possible comments before any content
3473 */
3474 while ((CUR == '<') && (NXT(1) == '!') &&
3475 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003476 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003477 SKIP_BLANKS;
3478 }
3479
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003480
3481 /*
3482 * Then possibly doc type declaration(s) and more Misc
3483 * (doctypedecl Misc*)?
3484 */
3485 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003486 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3487 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3488 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3489 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003490 htmlParseDocTypeDecl(ctxt);
3491 }
3492 SKIP_BLANKS;
3493
3494 /*
Daniel Veillard87b95392000-08-12 21:12:04 +00003495 * Parse possible comments before any content
3496 */
3497 while ((CUR == '<') && (NXT(1) == '!') &&
3498 (NXT(2) == '-') && (NXT(3) == '-')) {
3499 htmlParseComment(ctxt);
3500 SKIP_BLANKS;
3501 }
3502
3503 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003504 * Time to start parsing the tree itself
3505 */
Daniel Veillard35008381999-10-25 13:15:52 +00003506 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003507
3508 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003509 * autoclose
3510 */
3511 if (CUR == 0)
3512 htmlAutoClose(ctxt, NULL);
3513
3514
3515 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003516 * SAX: end of the document processing.
3517 */
3518 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3519 ctxt->sax->endDocument(ctxt->userData);
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003520
3521 if (ctxt->myDoc != NULL) {
3522 dtd = xmlGetIntSubset(ctxt->myDoc);
3523 if (dtd == NULL)
3524 ctxt->myDoc->intSubset =
3525 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3526 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3527 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3528 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003529 if (! ctxt->wellFormed) return(-1);
3530 return(0);
3531}
3532
3533
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003534/************************************************************************
3535 * *
3536 * Parser contexts handling *
3537 * *
3538 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003539
3540/**
3541 * xmlInitParserCtxt:
3542 * @ctxt: an HTML parser context
3543 *
3544 * Initialize a parser context
3545 */
3546
3547void
3548htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3549{
3550 htmlSAXHandler *sax;
3551
Daniel Veillard35008381999-10-25 13:15:52 +00003552 if (ctxt == NULL) return;
3553 memset(ctxt, 0, sizeof(htmlParserCtxt));
3554
Daniel Veillard6454aec1999-09-02 22:04:43 +00003555 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003556 if (sax == NULL) {
3557 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3558 }
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00003559 else
3560 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003561
3562 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003563 ctxt->inputTab = (htmlParserInputPtr *)
3564 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3565 if (ctxt->inputTab == NULL) {
3566 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003567 ctxt->inputNr = 0;
3568 ctxt->inputMax = 0;
3569 ctxt->input = NULL;
3570 return;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003571 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003572 ctxt->inputNr = 0;
3573 ctxt->inputMax = 5;
3574 ctxt->input = NULL;
3575 ctxt->version = NULL;
3576 ctxt->encoding = NULL;
3577 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003578 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003579
3580 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00003581 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillard39c7d712000-09-10 16:14:55 +00003582 if (ctxt->nodeTab == NULL) {
3583 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3584 ctxt->nodeNr = 0;
3585 ctxt->nodeMax = 0;
3586 ctxt->node = NULL;
3587 ctxt->inputNr = 0;
3588 ctxt->inputMax = 0;
3589 ctxt->input = NULL;
3590 return;
3591 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003592 ctxt->nodeNr = 0;
3593 ctxt->nodeMax = 10;
3594 ctxt->node = NULL;
3595
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003596 /* Allocate the Name stack */
3597 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Daniel Veillard39c7d712000-09-10 16:14:55 +00003598 if (ctxt->nameTab == NULL) {
3599 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3600 ctxt->nameNr = 0;
3601 ctxt->nameMax = 10;
3602 ctxt->name = NULL;
3603 ctxt->nodeNr = 0;
3604 ctxt->nodeMax = 0;
3605 ctxt->node = NULL;
3606 ctxt->inputNr = 0;
3607 ctxt->inputMax = 0;
3608 ctxt->input = NULL;
3609 return;
3610 }
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003611 ctxt->nameNr = 0;
3612 ctxt->nameMax = 10;
3613 ctxt->name = NULL;
3614
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003615 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3616 else {
3617 ctxt->sax = sax;
3618 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3619 }
3620 ctxt->userData = ctxt;
3621 ctxt->myDoc = NULL;
3622 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003623 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003624 ctxt->html = 1;
3625 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00003626 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003627 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003628 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003629 xmlInitNodeInfoSeq(&ctxt->node_seq);
3630}
3631
3632/**
3633 * htmlFreeParserCtxt:
3634 * @ctxt: an HTML parser context
3635 *
3636 * Free all the memory used by a parser context. However the parsed
3637 * document in ctxt->myDoc is not freed.
3638 */
3639
3640void
3641htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3642{
Daniel Veillard365e13b2000-07-02 07:56:37 +00003643 xmlFreeParserCtxt(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003644}
3645
3646/**
3647 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003648 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003649 * @encoding: a free form C string describing the HTML document encoding, or NULL
3650 *
3651 * Create a parser context for an HTML document.
3652 *
3653 * Returns the new parser context or NULL
3654 */
3655htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003656htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003657 htmlParserCtxtPtr ctxt;
3658 htmlParserInputPtr input;
3659 /* htmlCharEncoding enc; */
3660
Daniel Veillard6454aec1999-09-02 22:04:43 +00003661 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003662 if (ctxt == NULL) {
3663 perror("malloc");
3664 return(NULL);
3665 }
3666 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003667 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003668 if (input == NULL) {
3669 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003670 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003671 return(NULL);
3672 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003673 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003674
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003675 input->line = 1;
3676 input->col = 1;
3677 input->base = cur;
3678 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003679
3680 inputPush(ctxt, input);
3681 return(ctxt);
3682}
3683
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003684/************************************************************************
3685 * *
3686 * Progressive parsing interfaces *
3687 * *
3688 ************************************************************************/
3689
3690/**
3691 * htmlParseLookupSequence:
3692 * @ctxt: an HTML parser context
3693 * @first: the first char to lookup
3694 * @next: the next char to lookup or zero
3695 * @third: the next char to lookup or zero
3696 *
3697 * Try to find if a sequence (first, next, third) or just (first next) or
3698 * (first) is available in the input stream.
3699 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3700 * to avoid rescanning sequences of bytes, it DOES change the state of the
3701 * parser, do not use liberally.
3702 * This is basically similar to xmlParseLookupSequence()
3703 *
3704 * Returns the index to the current parsing point if the full sequence
3705 * is available, -1 otherwise.
3706 */
3707int
3708htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3709 xmlChar next, xmlChar third) {
3710 int base, len;
3711 htmlParserInputPtr in;
3712 const xmlChar *buf;
3713
3714 in = ctxt->input;
3715 if (in == NULL) return(-1);
3716 base = in->cur - in->base;
3717 if (base < 0) return(-1);
3718 if (ctxt->checkIndex > base)
3719 base = ctxt->checkIndex;
3720 if (in->buf == NULL) {
3721 buf = in->base;
3722 len = in->length;
3723 } else {
3724 buf = in->buf->buffer->content;
3725 len = in->buf->buffer->use;
3726 }
3727 /* take into account the sequence length */
3728 if (third) len -= 2;
3729 else if (next) len --;
3730 for (;base < len;base++) {
3731 if (buf[base] == first) {
3732 if (third != 0) {
3733 if ((buf[base + 1] != next) ||
3734 (buf[base + 2] != third)) continue;
3735 } else if (next != 0) {
3736 if (buf[base + 1] != next) continue;
3737 }
3738 ctxt->checkIndex = 0;
3739#ifdef DEBUG_PUSH
3740 if (next == 0)
3741 fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3742 first, base);
3743 else if (third == 0)
3744 fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3745 first, next, base);
3746 else
3747 fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3748 first, next, third, base);
3749#endif
3750 return(base - (in->cur - in->base));
3751 }
3752 }
3753 ctxt->checkIndex = base;
3754#ifdef DEBUG_PUSH
3755 if (next == 0)
3756 fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3757 else if (third == 0)
3758 fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3759 else
3760 fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3761#endif
3762 return(-1);
3763}
3764
3765/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003766 * htmlParseTryOrFinish:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003767 * @ctxt: an HTML parser context
Daniel Veillard71b656e2000-01-05 14:46:17 +00003768 * @terminate: last chunk indicator
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003769 *
3770 * Try to progress on parsing
3771 *
3772 * Returns zero if no parsing was possible
3773 */
3774int
Daniel Veillard71b656e2000-01-05 14:46:17 +00003775htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003776 int ret = 0;
3777 htmlParserInputPtr in;
Daniel Veillard365e13b2000-07-02 07:56:37 +00003778 int avail = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003779 xmlChar cur, next;
3780
3781#ifdef DEBUG_PUSH
3782 switch (ctxt->instate) {
3783 case XML_PARSER_EOF:
3784 fprintf(stderr, "HPP: try EOF\n"); break;
3785 case XML_PARSER_START:
3786 fprintf(stderr, "HPP: try START\n"); break;
3787 case XML_PARSER_MISC:
3788 fprintf(stderr, "HPP: try MISC\n");break;
3789 case XML_PARSER_COMMENT:
3790 fprintf(stderr, "HPP: try COMMENT\n");break;
3791 case XML_PARSER_PROLOG:
3792 fprintf(stderr, "HPP: try PROLOG\n");break;
3793 case XML_PARSER_START_TAG:
3794 fprintf(stderr, "HPP: try START_TAG\n");break;
3795 case XML_PARSER_CONTENT:
3796 fprintf(stderr, "HPP: try CONTENT\n");break;
3797 case XML_PARSER_CDATA_SECTION:
3798 fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3799 case XML_PARSER_END_TAG:
3800 fprintf(stderr, "HPP: try END_TAG\n");break;
3801 case XML_PARSER_ENTITY_DECL:
3802 fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3803 case XML_PARSER_ENTITY_VALUE:
3804 fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3805 case XML_PARSER_ATTRIBUTE_VALUE:
3806 fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3807 case XML_PARSER_DTD:
3808 fprintf(stderr, "HPP: try DTD\n");break;
3809 case XML_PARSER_EPILOG:
3810 fprintf(stderr, "HPP: try EPILOG\n");break;
3811 case XML_PARSER_PI:
3812 fprintf(stderr, "HPP: try PI\n");break;
Daniel Veillard7eda8452000-10-14 23:38:43 +00003813 case XML_PARSER_SYSTEM_LITERAL:
3814 fprintf(stderr, "HPP: try SYSTEM_LITERAL\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003815 }
3816#endif
3817
3818 while (1) {
3819
3820 in = ctxt->input;
3821 if (in == NULL) break;
3822 if (in->buf == NULL)
3823 avail = in->length - (in->cur - in->base);
3824 else
3825 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard365e13b2000-07-02 07:56:37 +00003826 if ((avail == 0) && (terminate)) {
3827 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00003828 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3829 /*
3830 * SAX: end of the document processing.
3831 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00003832 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00003833 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3834 ctxt->sax->endDocument(ctxt->userData);
3835 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00003836 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003837 if (avail < 1)
3838 goto done;
3839 switch (ctxt->instate) {
3840 case XML_PARSER_EOF:
3841 /*
3842 * Document parsing is done !
3843 */
3844 goto done;
3845 case XML_PARSER_START:
3846 /*
3847 * Very first chars read from the document flow.
3848 */
3849 cur = in->cur[0];
3850 if (IS_BLANK(cur)) {
3851 SKIP_BLANKS;
3852 if (in->buf == NULL)
3853 avail = in->length - (in->cur - in->base);
3854 else
3855 avail = in->buf->buffer->use - (in->cur - in->base);
3856 }
3857 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3858 ctxt->sax->setDocumentLocator(ctxt->userData,
3859 &xmlDefaultSAXLocator);
Daniel Veillardd83eb822000-06-30 18:39:56 +00003860 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3861 (!ctxt->disableSAX))
3862 ctxt->sax->startDocument(ctxt->userData);
3863
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003864 cur = in->cur[0];
3865 next = in->cur[1];
3866 if ((cur == '<') && (next == '!') &&
3867 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3868 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3869 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3870 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003871 if ((!terminate) &&
3872 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003873 goto done;
3874#ifdef DEBUG_PUSH
3875 fprintf(stderr, "HPP: Parsing internal subset\n");
3876#endif
3877 htmlParseDocTypeDecl(ctxt);
3878 ctxt->instate = XML_PARSER_PROLOG;
3879#ifdef DEBUG_PUSH
3880 fprintf(stderr, "HPP: entering PROLOG\n");
3881#endif
3882 } else {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003883 ctxt->instate = XML_PARSER_MISC;
3884 }
3885#ifdef DEBUG_PUSH
3886 fprintf(stderr, "HPP: entering MISC\n");
3887#endif
3888 break;
3889 case XML_PARSER_MISC:
3890 SKIP_BLANKS;
3891 if (in->buf == NULL)
3892 avail = in->length - (in->cur - in->base);
3893 else
3894 avail = in->buf->buffer->use - (in->cur - in->base);
3895 if (avail < 2)
3896 goto done;
3897 cur = in->cur[0];
3898 next = in->cur[1];
3899 if ((cur == '<') && (next == '!') &&
3900 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003901 if ((!terminate) &&
3902 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003903 goto done;
3904#ifdef DEBUG_PUSH
3905 fprintf(stderr, "HPP: Parsing Comment\n");
3906#endif
3907 htmlParseComment(ctxt);
3908 ctxt->instate = XML_PARSER_MISC;
3909 } else if ((cur == '<') && (next == '!') &&
3910 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3911 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3912 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3913 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003914 if ((!terminate) &&
3915 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003916 goto done;
3917#ifdef DEBUG_PUSH
3918 fprintf(stderr, "HPP: Parsing internal subset\n");
3919#endif
3920 htmlParseDocTypeDecl(ctxt);
3921 ctxt->instate = XML_PARSER_PROLOG;
3922#ifdef DEBUG_PUSH
3923 fprintf(stderr, "HPP: entering PROLOG\n");
3924#endif
3925 } else if ((cur == '<') && (next == '!') &&
3926 (avail < 9)) {
3927 goto done;
3928 } else {
3929 ctxt->instate = XML_PARSER_START_TAG;
3930#ifdef DEBUG_PUSH
3931 fprintf(stderr, "HPP: entering START_TAG\n");
3932#endif
3933 }
3934 break;
3935 case XML_PARSER_PROLOG:
3936 SKIP_BLANKS;
3937 if (in->buf == NULL)
3938 avail = in->length - (in->cur - in->base);
3939 else
3940 avail = in->buf->buffer->use - (in->cur - in->base);
3941 if (avail < 2)
3942 goto done;
3943 cur = in->cur[0];
3944 next = in->cur[1];
3945 if ((cur == '<') && (next == '!') &&
3946 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003947 if ((!terminate) &&
3948 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003949 goto done;
3950#ifdef DEBUG_PUSH
3951 fprintf(stderr, "HPP: Parsing Comment\n");
3952#endif
3953 htmlParseComment(ctxt);
3954 ctxt->instate = XML_PARSER_PROLOG;
3955 } else if ((cur == '<') && (next == '!') &&
3956 (avail < 4)) {
3957 goto done;
3958 } else {
3959 ctxt->instate = XML_PARSER_START_TAG;
3960#ifdef DEBUG_PUSH
3961 fprintf(stderr, "HPP: entering START_TAG\n");
3962#endif
3963 }
3964 break;
3965 case XML_PARSER_EPILOG:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003966 if (in->buf == NULL)
3967 avail = in->length - (in->cur - in->base);
3968 else
3969 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard87b95392000-08-12 21:12:04 +00003970 if (avail < 1)
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003971 goto done;
3972 cur = in->cur[0];
Daniel Veillard87b95392000-08-12 21:12:04 +00003973 if (IS_BLANK(cur)) {
3974 htmlParseCharData(ctxt, 0);
3975 goto done;
3976 }
3977 if (avail < 2)
3978 goto done;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003979 next = in->cur[1];
3980 if ((cur == '<') && (next == '!') &&
3981 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003982 if ((!terminate) &&
3983 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003984 goto done;
3985#ifdef DEBUG_PUSH
3986 fprintf(stderr, "HPP: Parsing Comment\n");
3987#endif
3988 htmlParseComment(ctxt);
3989 ctxt->instate = XML_PARSER_EPILOG;
3990 } else if ((cur == '<') && (next == '!') &&
3991 (avail < 4)) {
3992 goto done;
3993 } else {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00003994 ctxt->errNo = XML_ERR_DOCUMENT_END;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003995 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3996 ctxt->sax->error(ctxt->userData,
3997 "Extra content at the end of the document\n");
3998 ctxt->wellFormed = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003999 ctxt->instate = XML_PARSER_EOF;
4000#ifdef DEBUG_PUSH
4001 fprintf(stderr, "HPP: entering EOF\n");
4002#endif
4003 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4004 ctxt->sax->endDocument(ctxt->userData);
4005 goto done;
4006 }
4007 break;
4008 case XML_PARSER_START_TAG: {
4009 xmlChar *name, *oldname;
4010 int depth = ctxt->nameNr;
4011 htmlElemDescPtr info;
4012
4013 if (avail < 2)
4014 goto done;
4015 cur = in->cur[0];
4016 if (cur != '<') {
4017 ctxt->instate = XML_PARSER_CONTENT;
4018#ifdef DEBUG_PUSH
4019 fprintf(stderr, "HPP: entering CONTENT\n");
4020#endif
4021 break;
4022 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00004023 if ((!terminate) &&
4024 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004025 goto done;
4026
4027 oldname = xmlStrdup(ctxt->name);
4028 htmlParseStartTag(ctxt);
4029 name = ctxt->name;
4030#ifdef DEBUG
4031 if (oldname == NULL)
4032 fprintf(stderr, "Start of element %s\n", name);
4033 else if (name == NULL)
4034 fprintf(stderr, "Start of element failed, was %s\n",
4035 oldname);
4036 else
4037 fprintf(stderr, "Start of element %s, was %s\n",
4038 name, oldname);
4039#endif
4040 if (((depth == ctxt->nameNr) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +00004041 (xmlStrEqual(oldname, ctxt->name))) ||
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004042 (name == NULL)) {
4043 if (CUR == '>')
4044 NEXT;
4045 if (oldname != NULL)
4046 xmlFree(oldname);
4047 break;
4048 }
4049 if (oldname != NULL)
4050 xmlFree(oldname);
4051
4052 /*
4053 * Lookup the info for that element.
4054 */
4055 info = htmlTagLookup(name);
4056 if (info == NULL) {
4057 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4058 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4059 name);
4060 ctxt->wellFormed = 0;
4061 } else if (info->depr) {
4062 /***************************
4063 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4064 ctxt->sax->warning(ctxt->userData,
4065 "Tag %s is deprecated\n",
4066 name);
4067 ***************************/
4068 }
4069
4070 /*
4071 * Check for an Empty Element labelled the XML/SGML way
4072 */
4073 if ((CUR == '/') && (NXT(1) == '>')) {
4074 SKIP(2);
4075 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4076 ctxt->sax->endElement(ctxt->userData, name);
4077 oldname = htmlnamePop(ctxt);
4078#ifdef DEBUG
4079 fprintf(stderr,"End of tag the XML way: popping out %s\n",
4080 oldname);
4081#endif
4082 if (oldname != NULL)
4083 xmlFree(oldname);
4084 ctxt->instate = XML_PARSER_CONTENT;
4085#ifdef DEBUG_PUSH
4086 fprintf(stderr, "HPP: entering CONTENT\n");
4087#endif
4088 break;
4089 }
4090
4091 if (CUR == '>') {
4092 NEXT;
4093 } else {
4094 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4095 ctxt->sax->error(ctxt->userData,
4096 "Couldn't find end of Start Tag %s\n",
4097 name);
4098 ctxt->wellFormed = 0;
4099
4100 /*
4101 * end of parsing of this node.
4102 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00004103 if (xmlStrEqual(name, ctxt->name)) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004104 nodePop(ctxt);
4105 oldname = htmlnamePop(ctxt);
4106#ifdef DEBUG
4107 fprintf(stderr,
4108 "End of start tag problem: popping out %s\n", oldname);
4109#endif
4110 if (oldname != NULL)
4111 xmlFree(oldname);
4112 }
4113
4114 ctxt->instate = XML_PARSER_CONTENT;
4115#ifdef DEBUG_PUSH
4116 fprintf(stderr, "HPP: entering CONTENT\n");
4117#endif
4118 break;
4119 }
4120
4121 /*
4122 * Check for an Empty Element from DTD definition
4123 */
4124 if ((info != NULL) && (info->empty)) {
4125 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4126 ctxt->sax->endElement(ctxt->userData, name);
4127 oldname = htmlnamePop(ctxt);
4128#ifdef DEBUG
4129 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
4130#endif
4131 if (oldname != NULL)
4132 xmlFree(oldname);
4133 }
4134 ctxt->instate = XML_PARSER_CONTENT;
4135#ifdef DEBUG_PUSH
4136 fprintf(stderr, "HPP: entering CONTENT\n");
4137#endif
4138 break;
4139 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004140 case XML_PARSER_CONTENT: {
4141 long cons;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004142 /*
4143 * Handle preparsed entities and charRef
4144 */
4145 if (ctxt->token != 0) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00004146 xmlChar chr[2] = { 0 , 0 } ;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004147
Daniel Veillard365e13b2000-07-02 07:56:37 +00004148 chr[0] = (xmlChar) ctxt->token;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004149 htmlCheckParagraph(ctxt);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004150 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard365e13b2000-07-02 07:56:37 +00004151 ctxt->sax->characters(ctxt->userData, chr, 1);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004152 ctxt->token = 0;
4153 ctxt->checkIndex = 0;
4154 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004155 if ((avail == 1) && (terminate)) {
4156 cur = in->cur[0];
4157 if ((cur != '<') && (cur != '&')) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004158 if (ctxt->sax != NULL) {
4159 if (IS_BLANK(cur)) {
4160 if (ctxt->sax->ignorableWhitespace != NULL)
4161 ctxt->sax->ignorableWhitespace(
4162 ctxt->userData, &cur, 1);
4163 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004164 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004165 if (ctxt->sax->characters != NULL)
4166 ctxt->sax->characters(
4167 ctxt->userData, &cur, 1);
4168 }
4169 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004170 ctxt->token = 0;
4171 ctxt->checkIndex = 0;
4172 NEXT;
4173 }
4174 break;
4175 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004176 if (avail < 2)
4177 goto done;
4178 cur = in->cur[0];
4179 next = in->cur[1];
Daniel Veillard87b95392000-08-12 21:12:04 +00004180 cons = ctxt->nbChars;
Daniel Veillard7eda8452000-10-14 23:38:43 +00004181 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4182 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004183 /*
Daniel Veillard7eda8452000-10-14 23:38:43 +00004184 * Handle SCRIPT/STYLE separately
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004185 */
Daniel Veillard7eda8452000-10-14 23:38:43 +00004186 if ((!terminate) &&
4187 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4188 goto done;
4189 htmlParseScript(ctxt);
4190 if ((cur == '<') && (next == '/')) {
4191 ctxt->instate = XML_PARSER_END_TAG;
4192 ctxt->checkIndex = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004193#ifdef DEBUG_PUSH
Daniel Veillard7eda8452000-10-14 23:38:43 +00004194 fprintf(stderr, "HPP: entering END_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004195#endif
Daniel Veillard7eda8452000-10-14 23:38:43 +00004196 break;
4197 }
4198 } else {
4199 /*
4200 * Sometimes DOCTYPE arrives in the middle of the document
4201 */
4202 if ((cur == '<') && (next == '!') &&
4203 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4204 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4205 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4206 (UPP(8) == 'E')) {
4207 if ((!terminate) &&
4208 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4209 goto done;
4210 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4211 ctxt->sax->error(ctxt->userData,
4212 "Misplaced DOCTYPE declaration\n");
4213 ctxt->wellFormed = 0;
4214 htmlParseDocTypeDecl(ctxt);
4215 } else if ((cur == '<') && (next == '!') &&
4216 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4217 if ((!terminate) &&
4218 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4219 goto done;
4220#ifdef DEBUG_PUSH
4221 fprintf(stderr, "HPP: Parsing Comment\n");
4222#endif
4223 htmlParseComment(ctxt);
4224 ctxt->instate = XML_PARSER_CONTENT;
4225 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4226 goto done;
4227 } else if ((cur == '<') && (next == '/')) {
4228 ctxt->instate = XML_PARSER_END_TAG;
4229 ctxt->checkIndex = 0;
4230#ifdef DEBUG_PUSH
4231 fprintf(stderr, "HPP: entering END_TAG\n");
4232#endif
4233 break;
4234 } else if (cur == '<') {
4235 ctxt->instate = XML_PARSER_START_TAG;
4236 ctxt->checkIndex = 0;
4237#ifdef DEBUG_PUSH
4238 fprintf(stderr, "HPP: entering START_TAG\n");
4239#endif
4240 break;
4241 } else if (cur == '&') {
4242 if ((!terminate) &&
4243 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4244 goto done;
4245#ifdef DEBUG_PUSH
4246 fprintf(stderr, "HPP: Parsing Reference\n");
4247#endif
4248 /* TODO: check generation of subtrees if noent !!! */
4249 htmlParseReference(ctxt);
4250 } else {
4251 /* TODO Avoid the extra copy, handle directly !!!!!! */
4252 /*
4253 * Goal of the following test is :
4254 * - minimize calls to the SAX 'character' callback
4255 * when they are mergeable
4256 */
4257 if ((ctxt->inputNr == 1) &&
4258 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4259 if ((!terminate) &&
4260 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4261 goto done;
4262 }
4263 ctxt->checkIndex = 0;
4264#ifdef DEBUG_PUSH
4265 fprintf(stderr, "HPP: Parsing char data\n");
4266#endif
4267 htmlParseCharData(ctxt, 0);
4268 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004269 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004270 if (cons == ctxt->nbChars) {
4271 if (ctxt->node != NULL) {
4272 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4273 ctxt->sax->error(ctxt->userData,
4274 "detected an error in element content\n");
4275 ctxt->wellFormed = 0;
Daniel Veillard87b95392000-08-12 21:12:04 +00004276 }
Daniel Veillard8ddb5a72000-09-23 10:28:52 +00004277 NEXT;
Daniel Veillard87b95392000-08-12 21:12:04 +00004278 break;
4279 }
4280
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004281 break;
Daniel Veillard87b95392000-08-12 21:12:04 +00004282 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004283 case XML_PARSER_END_TAG:
4284 if (avail < 2)
4285 goto done;
Daniel Veillard71b656e2000-01-05 14:46:17 +00004286 if ((!terminate) &&
4287 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004288 goto done;
4289 htmlParseEndTag(ctxt);
4290 if (ctxt->nameNr == 0) {
4291 ctxt->instate = XML_PARSER_EPILOG;
4292 } else {
4293 ctxt->instate = XML_PARSER_CONTENT;
4294 }
4295 ctxt->checkIndex = 0;
4296#ifdef DEBUG_PUSH
4297 fprintf(stderr, "HPP: entering CONTENT\n");
4298#endif
4299 break;
4300 case XML_PARSER_CDATA_SECTION:
4301 fprintf(stderr, "HPP: internal error, state == CDATA\n");
4302 ctxt->instate = XML_PARSER_CONTENT;
4303 ctxt->checkIndex = 0;
4304#ifdef DEBUG_PUSH
4305 fprintf(stderr, "HPP: entering CONTENT\n");
4306#endif
4307 break;
4308 case XML_PARSER_DTD:
4309 fprintf(stderr, "HPP: internal error, state == DTD\n");
4310 ctxt->instate = XML_PARSER_CONTENT;
4311 ctxt->checkIndex = 0;
4312#ifdef DEBUG_PUSH
4313 fprintf(stderr, "HPP: entering CONTENT\n");
4314#endif
4315 break;
4316 case XML_PARSER_COMMENT:
4317 fprintf(stderr, "HPP: internal error, state == COMMENT\n");
4318 ctxt->instate = XML_PARSER_CONTENT;
4319 ctxt->checkIndex = 0;
4320#ifdef DEBUG_PUSH
4321 fprintf(stderr, "HPP: entering CONTENT\n");
4322#endif
4323 break;
4324 case XML_PARSER_PI:
4325 fprintf(stderr, "HPP: internal error, state == PI\n");
4326 ctxt->instate = XML_PARSER_CONTENT;
4327 ctxt->checkIndex = 0;
4328#ifdef DEBUG_PUSH
4329 fprintf(stderr, "HPP: entering CONTENT\n");
4330#endif
4331 break;
4332 case XML_PARSER_ENTITY_DECL:
4333 fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
4334 ctxt->instate = XML_PARSER_CONTENT;
4335 ctxt->checkIndex = 0;
4336#ifdef DEBUG_PUSH
4337 fprintf(stderr, "HPP: entering CONTENT\n");
4338#endif
4339 break;
4340 case XML_PARSER_ENTITY_VALUE:
4341 fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
4342 ctxt->instate = XML_PARSER_CONTENT;
4343 ctxt->checkIndex = 0;
4344#ifdef DEBUG_PUSH
4345 fprintf(stderr, "HPP: entering DTD\n");
4346#endif
4347 break;
4348 case XML_PARSER_ATTRIBUTE_VALUE:
4349 fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4350 ctxt->instate = XML_PARSER_START_TAG;
4351 ctxt->checkIndex = 0;
4352#ifdef DEBUG_PUSH
4353 fprintf(stderr, "HPP: entering START_TAG\n");
4354#endif
4355 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004356 case XML_PARSER_SYSTEM_LITERAL:
4357 fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4358 ctxt->instate = XML_PARSER_CONTENT;
4359 ctxt->checkIndex = 0;
4360#ifdef DEBUG_PUSH
4361 fprintf(stderr, "HPP: entering CONTENT\n");
4362#endif
4363 break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004364 }
4365 }
4366done:
Daniel Veillard365e13b2000-07-02 07:56:37 +00004367 if ((avail == 0) && (terminate)) {
4368 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00004369 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4370 /*
4371 * SAX: end of the document processing.
4372 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00004373 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00004374 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4375 ctxt->sax->endDocument(ctxt->userData);
4376 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004377 }
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004378 if ((ctxt->myDoc != NULL) &&
4379 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4380 (ctxt->instate == XML_PARSER_EPILOG))) {
4381 xmlDtdPtr dtd;
4382 dtd = xmlGetIntSubset(ctxt->myDoc);
4383 if (dtd == NULL)
4384 ctxt->myDoc->intSubset =
4385 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4386 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4387 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4388 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004389#ifdef DEBUG_PUSH
4390 fprintf(stderr, "HPP: done %d\n", ret);
4391#endif
4392 return(ret);
4393}
4394
4395/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00004396 * htmlParseTry:
4397 * @ctxt: an HTML parser context
4398 *
4399 * Try to progress on parsing
4400 *
4401 * Returns zero if no parsing was possible
4402 */
4403int
4404htmlParseTry(htmlParserCtxtPtr ctxt) {
4405 return(htmlParseTryOrFinish(ctxt, 0));
4406}
4407
4408/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004409 * htmlParseChunk:
4410 * @ctxt: an XML parser context
4411 * @chunk: an char array
4412 * @size: the size in byte of the chunk
4413 * @terminate: last chunk indicator
4414 *
4415 * Parse a Chunk of memory
4416 *
4417 * Returns zero if no error, the xmlParserErrors otherwise.
4418 */
4419int
4420htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4421 int terminate) {
4422 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4423 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4424 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4425 int cur = ctxt->input->cur - ctxt->input->base;
4426
4427 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4428 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4429 ctxt->input->cur = ctxt->input->base + cur;
4430#ifdef DEBUG_PUSH
4431 fprintf(stderr, "HPP: pushed %d\n", size);
4432#endif
4433
Daniel Veillardd0f7f742000-02-02 17:42:48 +00004434 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4435 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004436 } else if (ctxt->instate != XML_PARSER_EOF) {
4437 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
Daniel Veillard71b656e2000-01-05 14:46:17 +00004438 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004439 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004440 if (terminate) {
4441 if ((ctxt->instate != XML_PARSER_EOF) &&
4442 (ctxt->instate != XML_PARSER_EPILOG) &&
4443 (ctxt->instate != XML_PARSER_MISC)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00004444 ctxt->errNo = XML_ERR_DOCUMENT_END;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004445 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4446 ctxt->sax->error(ctxt->userData,
4447 "Extra content at the end of the document\n");
4448 ctxt->wellFormed = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004449 }
4450 if (ctxt->instate != XML_PARSER_EOF) {
4451 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4452 ctxt->sax->endDocument(ctxt->userData);
4453 }
4454 ctxt->instate = XML_PARSER_EOF;
4455 }
4456 return((xmlParserErrors) ctxt->errNo);
4457}
4458
4459/************************************************************************
4460 * *
4461 * User entry points *
4462 * *
4463 ************************************************************************/
4464
4465/**
4466 * htmlCreatePushParserCtxt :
4467 * @sax: a SAX handler
4468 * @user_data: The user data returned on SAX callbacks
4469 * @chunk: a pointer to an array of chars
4470 * @size: number of chars in the array
4471 * @filename: an optional file name or URI
4472 * @enc: an optional encoding
4473 *
4474 * Create a parser context for using the HTML parser in push mode
4475 * To allow content encoding detection, @size should be >= 4
4476 * The value of @filename is used for fetching external entities
4477 * and error/warning reports.
4478 *
4479 * Returns the new parser context or NULL
4480 */
4481htmlParserCtxtPtr
4482htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4483 const char *chunk, int size, const char *filename,
4484 xmlCharEncoding enc) {
4485 htmlParserCtxtPtr ctxt;
4486 htmlParserInputPtr inputStream;
4487 xmlParserInputBufferPtr buf;
4488
4489 buf = xmlAllocParserInputBuffer(enc);
4490 if (buf == NULL) return(NULL);
4491
4492 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4493 if (ctxt == NULL) {
4494 xmlFree(buf);
4495 return(NULL);
4496 }
4497 memset(ctxt, 0, sizeof(htmlParserCtxt));
4498 htmlInitParserCtxt(ctxt);
4499 if (sax != NULL) {
4500 if (ctxt->sax != &htmlDefaultSAXHandler)
4501 xmlFree(ctxt->sax);
4502 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4503 if (ctxt->sax == NULL) {
4504 xmlFree(buf);
4505 xmlFree(ctxt);
4506 return(NULL);
4507 }
4508 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4509 if (user_data != NULL)
4510 ctxt->userData = user_data;
4511 }
4512 if (filename == NULL) {
4513 ctxt->directory = NULL;
4514 } else {
4515 ctxt->directory = xmlParserGetDirectory(filename);
4516 }
4517
4518 inputStream = htmlNewInputStream(ctxt);
4519 if (inputStream == NULL) {
4520 xmlFreeParserCtxt(ctxt);
4521 return(NULL);
4522 }
4523
4524 if (filename == NULL)
4525 inputStream->filename = NULL;
4526 else
4527 inputStream->filename = xmlMemStrdup(filename);
4528 inputStream->buf = buf;
4529 inputStream->base = inputStream->buf->buffer->content;
4530 inputStream->cur = inputStream->buf->buffer->content;
4531
4532 inputPush(ctxt, inputStream);
4533
4534 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4535 (ctxt->input->buf != NULL)) {
4536 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4537#ifdef DEBUG_PUSH
4538 fprintf(stderr, "HPP: pushed %d\n", size);
4539#endif
4540 }
4541
4542 return(ctxt);
4543}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004544
4545/**
4546 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004547 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004548 * @encoding: a free form C string describing the HTML document encoding, or NULL
4549 * @sax: the SAX handler block
4550 * @userData: if using SAX, this pointer will be provided on callbacks.
4551 *
4552 * parse an HTML in-memory document and build a tree.
4553 * It use the given SAX function block to handle the parsing callback.
4554 * If sax is NULL, fallback to the default DOM tree building routines.
4555 *
4556 * Returns the resulting document tree
4557 */
4558
4559htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004560htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004561 htmlDocPtr ret;
4562 htmlParserCtxtPtr ctxt;
4563
4564 if (cur == NULL) return(NULL);
4565
4566
4567 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4568 if (ctxt == NULL) return(NULL);
4569 if (sax != NULL) {
4570 ctxt->sax = sax;
4571 ctxt->userData = userData;
4572 }
4573
4574 htmlParseDocument(ctxt);
4575 ret = ctxt->myDoc;
4576 if (sax != NULL) {
4577 ctxt->sax = NULL;
4578 ctxt->userData = NULL;
4579 }
4580 htmlFreeParserCtxt(ctxt);
4581
4582 return(ret);
4583}
4584
4585/**
4586 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004587 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004588 * @encoding: a free form C string describing the HTML document encoding, or NULL
4589 *
4590 * parse an HTML in-memory document and build a tree.
4591 *
4592 * Returns the resulting document tree
4593 */
4594
4595htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004596htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004597 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4598}
4599
4600
4601/**
4602 * htmlCreateFileParserCtxt :
4603 * @filename: the filename
4604 * @encoding: a free form C string describing the HTML document encoding, or NULL
4605 *
4606 * Create a parser context for a file content.
4607 * Automatic support for ZLIB/Compress compressed document is provided
4608 * by default if found at compile-time.
4609 *
4610 * Returns the new parser context or NULL
4611 */
4612htmlParserCtxtPtr
4613htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4614{
4615 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004616 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004617 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004618 /* htmlCharEncoding enc; */
4619
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004620 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4621 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004622
Daniel Veillard6454aec1999-09-02 22:04:43 +00004623 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004624 if (ctxt == NULL) {
4625 perror("malloc");
4626 return(NULL);
4627 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004628 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004629 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00004630 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004631 if (inputStream == NULL) {
4632 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00004633 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004634 return(NULL);
4635 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004636 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004637
Daniel Veillard6454aec1999-09-02 22:04:43 +00004638 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004639 inputStream->line = 1;
4640 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004641 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00004642 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004643
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004644 inputStream->base = inputStream->buf->buffer->content;
4645 inputStream->cur = inputStream->buf->buffer->content;
4646 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004647
4648 inputPush(ctxt, inputStream);
4649 return(ctxt);
4650}
4651
4652/**
4653 * htmlSAXParseFile :
4654 * @filename: the filename
4655 * @encoding: a free form C string describing the HTML document encoding, or NULL
4656 * @sax: the SAX handler block
4657 * @userData: if using SAX, this pointer will be provided on callbacks.
4658 *
4659 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4660 * compressed document is provided by default if found at compile-time.
4661 * It use the given SAX function block to handle the parsing callback.
4662 * If sax is NULL, fallback to the default DOM tree building routines.
4663 *
4664 * Returns the resulting document tree
4665 */
4666
4667htmlDocPtr
4668htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4669 void *userData) {
4670 htmlDocPtr ret;
4671 htmlParserCtxtPtr ctxt;
Daniel Veillard87b95392000-08-12 21:12:04 +00004672 htmlSAXHandlerPtr oldsax = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004673
4674 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4675 if (ctxt == NULL) return(NULL);
4676 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004677 oldsax = ctxt->sax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004678 ctxt->sax = sax;
4679 ctxt->userData = userData;
4680 }
4681
4682 htmlParseDocument(ctxt);
4683
4684 ret = ctxt->myDoc;
4685 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004686 ctxt->sax = oldsax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004687 ctxt->userData = NULL;
4688 }
4689 htmlFreeParserCtxt(ctxt);
4690
4691 return(ret);
4692}
4693
4694/**
4695 * htmlParseFile :
4696 * @filename: the filename
4697 * @encoding: a free form C string describing the HTML document encoding, or NULL
4698 *
4699 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4700 * compressed document is provided by default if found at compile-time.
4701 *
4702 * Returns the resulting document tree
4703 */
4704
4705htmlDocPtr
4706htmlParseFile(const char *filename, const char *encoding) {
4707 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4708}
Daniel Veillard361d8452000-04-03 19:48:13 +00004709
4710#endif /* LIBXML_HTML_ENABLED */