blob: 7e3f239c51572a0e333489fe098bc4e7b3bfd33e [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillardb71379b2000-10-09 12:30:39 +000015#include <libxml/xmlversion.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000016#ifdef LIBXML_HTML_ENABLED
Daniel Veillardbe70ff71999-07-05 16:50:46 +000017#include <stdio.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000018#include <string.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000019#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000023#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000024#endif
25#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000026#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000027#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000028#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
Daniel Veillard361d8452000-04-03 19:48:13 +000038#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
Daniel Veillardaaf58b92000-10-06 14:07:26 +000040#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
Daniel Veillardb71379b2000-10-09 12:30:39 +000042#include <libxml/xmlerror.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000043#include <libxml/HTMLparser.h>
44#include <libxml/entities.h>
45#include <libxml/encoding.h>
46#include <libxml/valid.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000047#include <libxml/xmlIO.h>
Daniel Veillarde2d034d1999-07-27 19:52:06 +000048
49#define HTML_MAX_NAMELEN 1000
Daniel Veillard32bc74e2000-07-14 14:49:25 +000050#define HTML_PARSER_BIG_BUFFER_SIZE 1000
Daniel Veillard5e5c6231999-12-29 12:49:06 +000051#define HTML_PARSER_BUFFER_SIZE 100
Daniel Veillardbe70ff71999-07-05 16:50:46 +000052
Daniel Veillard82150d81999-07-07 07:32:15 +000053/* #define DEBUG */
Daniel Veillard5e5c6231999-12-29 12:49:06 +000054/* #define DEBUG_PUSH */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000055
56/************************************************************************
57 * *
58 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
Daniel Veillarddbfd6411999-12-28 16:35:14 +000066#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000068 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
Daniel Veillard32bc74e2000-07-14 14:49:25 +000070 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000071 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +000073 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
Daniel Veillard0142b842000-01-14 14:45:24 +000075 return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000076 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
Daniel Veillarddbfd6411999-12-28 16:35:14 +000082scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000083 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000084 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000085 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000086 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000087 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillarddbfd6411999-12-28 16:35:14 +000096PUSH_AND_POP(extern, xmlNodePtr, node)
97PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000098
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. They all expect a variable named ctxt, pointing to the
 * parser context, to be in scope.
 *
 * Byte-level macros; they look at raw bytes of the input and should only be
 * used to compare against or skip over ASCII content:
 *
 *   CUR_PTR    the current pointer in the input (usable as an lvalue).
 *   CUR        the byte at the current position, as an int.
 *   CURRENT    same value as CUR.
 *   RAW        -1 if ctxt->token is set, else the byte at the current
 *              position.
 *   NXT(n)     the byte n positions after the current one.
 *   UPPER      the current byte passed through toupper().
 *   UPP(n)     the byte n positions after the current one, passed
 *              through toupper().
 *   SKIP(n)    advance the input pointer and ctxt->nbChars by n.
 *
 * Input management:
 *
 *   SHRINK       calls xmlParserInputShrink() on the current input.
 *   GROW         calls xmlParserInputGrow() on the current input.
 *   SKIP_BLANKS  calls htmlSkipBlankChars().
 *   NEXT         calls xmlNextChar() and increments ctxt->nbChars.
 *
 * Character-level macros, imported from the XML parser:
 *
 *   CUR_CHAR(l)        htmlCurrentChar(); the length is stored in l.
 *   CUR_SCHAR(s, l)    xmlStringCurrentChar() on string s; the length is
 *                      stored in l.
 *   NEXTL(l)           update line/column for the current byte, clear
 *                      ctxt->token, advance the input pointer by l and
 *                      increment ctxt->nbChars.
 *   COPY_BUF(l,b,i,v)  append char v of length l to buffer b at index i,
 *                      advancing i (xmlCopyChar() is used when l != 1).
 *
 * Every expansion is either a single parenthesized expression or a
 * do { } while (0) statement so the macros can be used safely inside
 * larger expressions and unbraced if/else bodies.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
187 * Returns the current char value and its lenght
188 */
189
190int
191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +0000253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
269 * Assume it's a fixed lenght encoding (1) with
270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
Daniel Veillarda2c6da92000-09-16 18:15:00 +0000292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
Daniel Veillardcf461992000-03-14 18:30:20 +0000306/**
307 * htmlNextChar:
308 * @ctxt: the HTML parser context
309 *
310 * Skip to the next char input char.
311 */
312
313void
314htmlNextChar(htmlParserCtxtPtr ctxt) {
Daniel Veillard3f6f7f62000-06-30 17:58:25 +0000315 if (ctxt->instate == XML_PARSER_EOF)
316 return;
Daniel Veillardcf461992000-03-14 18:30:20 +0000317 if ((*ctxt->input->cur == 0) &&
318 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
319 xmlPopInput(ctxt);
320 } else {
321 if (*(ctxt->input->cur) == '\n') {
322 ctxt->input->line++; ctxt->input->col = 1;
323 } else ctxt->input->col++;
324 ctxt->input->cur++;
325 ctxt->nbChars++;
326 if (*ctxt->input->cur == 0)
327 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
328 }
329}
330
331/**
332 * htmlSkipBlankChars:
333 * @ctxt: the HTML parser context
334 *
335 * skip all blanks character found at that point in the input streams.
336 *
337 * Returns the number of space chars skipped
338 */
339
340int
341htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
342 int res = 0;
343
344 while (IS_BLANK(*(ctxt->input->cur))) {
345 if ((*ctxt->input->cur == 0) &&
346 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
347 xmlPopInput(ctxt);
348 } else {
349 if (*(ctxt->input->cur) == '\n') {
350 ctxt->input->line++; ctxt->input->col = 1;
351 } else ctxt->input->col++;
352 ctxt->input->cur++;
353 ctxt->nbChars++;
354 if (*ctxt->input->cur == 0)
355 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
356 }
357 res++;
358 }
359 return(res);
360}
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000361
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000362
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000363
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000364/************************************************************************
365 * *
366 * The list of HTML elements and their properties *
367 * *
368 ************************************************************************/
369
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000370/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000371 * Start Tag: 1 means the start tag can be ommited
372 * End Tag: 1 means the end tag can be ommited
373 * 2 means it's forbidden (empty elements)
374 * Depr: this element is deprecated
375 * DTD: 1 means that this element is valid only in the Loose DTD
376 * 2 means that this element is valid only in the Frameset DTD
377 *
378 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000379 */
380htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000381{ "a", 0, 0, 0, 0, 0, "anchor " },
382{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
383{ "acronym", 0, 0, 0, 0, 0, "" },
384{ "address", 0, 0, 0, 0, 0, "information on author " },
385{ "applet", 0, 0, 0, 1, 1, "java applet " },
386{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
387{ "b", 0, 0, 0, 0, 0, "bold text style" },
388{ "base", 0, 2, 1, 0, 0, "document base uri " },
389{ "basefont", 0, 2, 1, 1, 1, "base font size " },
390{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
391{ "big", 0, 0, 0, 0, 0, "large text style" },
392{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
393{ "body", 1, 1, 0, 0, 0, "document body " },
394{ "br", 0, 2, 1, 0, 0, "forced line break " },
395{ "button", 0, 0, 0, 0, 0, "push button " },
396{ "caption", 0, 0, 0, 0, 0, "table caption " },
397{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
398{ "cite", 0, 0, 0, 0, 0, "citation" },
399{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
400{ "col", 0, 2, 1, 0, 0, "table column " },
401{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
402{ "dd", 0, 1, 0, 0, 0, "definition description " },
403{ "del", 0, 0, 0, 0, 0, "deleted text " },
404{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
405{ "dir", 0, 0, 0, 1, 1, "directory list" },
406{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
407{ "dl", 0, 0, 0, 0, 0, "definition list " },
408{ "dt", 0, 1, 0, 0, 0, "definition term " },
409{ "em", 0, 0, 0, 0, 0, "emphasis" },
410{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
411{ "font", 0, 0, 0, 1, 1, "local change to font " },
412{ "form", 0, 0, 0, 0, 0, "interactive form " },
413{ "frame", 0, 2, 1, 0, 2, "subwindow " },
414{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
415{ "h1", 0, 0, 0, 0, 0, "heading " },
416{ "h2", 0, 0, 0, 0, 0, "heading " },
417{ "h3", 0, 0, 0, 0, 0, "heading " },
418{ "h4", 0, 0, 0, 0, 0, "heading " },
419{ "h5", 0, 0, 0, 0, 0, "heading " },
420{ "h6", 0, 0, 0, 0, 0, "heading " },
421{ "head", 1, 1, 0, 0, 0, "document head " },
422{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
423{ "html", 1, 1, 0, 0, 0, "document root element " },
424{ "i", 0, 0, 0, 0, 0, "italic text style" },
425{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
426{ "img", 0, 2, 1, 0, 0, "embedded image " },
427{ "input", 0, 2, 1, 0, 0, "form control " },
428{ "ins", 0, 0, 0, 0, 0, "inserted text" },
429{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
430{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
431{ "label", 0, 0, 0, 0, 0, "form field label text " },
432{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
433{ "li", 0, 1, 0, 0, 0, "list item " },
434{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
435{ "map", 0, 0, 0, 0, 0, "client-side image map " },
436{ "menu", 0, 0, 0, 1, 1, "menu list " },
437{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
438{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
439{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
440{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
441{ "ol", 0, 0, 0, 0, 0, "ordered list " },
442{ "optgroup", 0, 0, 0, 0, 0, "option group " },
443{ "option", 0, 1, 0, 0, 0, "selectable choice " },
444{ "p", 0, 1, 0, 0, 0, "paragraph " },
445{ "param", 0, 2, 1, 0, 0, "named property value " },
446{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
447{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
448{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
449{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
450{ "script", 0, 0, 0, 0, 0, "script statements " },
451{ "select", 0, 0, 0, 0, 0, "option selector " },
452{ "small", 0, 0, 0, 0, 0, "small text style" },
453{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
454{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
455{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
456{ "style", 0, 0, 0, 0, 0, "style info " },
457{ "sub", 0, 0, 0, 0, 0, "subscript" },
458{ "sup", 0, 0, 0, 0, 0, "superscript " },
459{ "table", 0, 0, 0, 0, 0, "&#160;" },
460{ "tbody", 1, 1, 0, 0, 0, "table body " },
461{ "td", 0, 1, 0, 0, 0, "table data cell" },
462{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
463{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
464{ "th", 0, 1, 0, 0, 0, "table header cell" },
465{ "thead", 0, 1, 0, 0, 0, "table header " },
466{ "title", 0, 0, 0, 0, 0, "document title " },
467{ "tr", 0, 1, 0, 0, 0, "table row " },
468{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
469{ "u", 0, 0, 0, 1, 1, "underlined text style" },
470{ "ul", 0, 0, 0, 0, 0, "unordered list " },
471{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000472};
473
/*
 * Start tags that imply the end of a current element.
 * The array is a sequence of NULL terminated groups, itself ended by an
 * extra NULL: any tag of a group implies the end of the current element
 * if the type of that element is in the same group.
 */
char *htmlEquEnd[] = {
    "dt", "dd", "li", "option", NULL,
    "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
    NULL
};
/*
 * According to the HTML DTD, HR should be added to the 2nd group above, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
490
/*
 * Start tags that imply the end of the current element.
 * The array is a sequence of NULL terminated groups, itself ended by an
 * extra NULL. The first name of a group is the tag being opened, the
 * following names are the elements which get closed by that opening.
 */
char *htmlStartClose[] = {
    "form",     "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
    "head",     "p", NULL,
    "title",    "p", NULL,
    "body",     "head", "style", "link", "title", "p", NULL,
    "li",       "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
    "hr",       "p", "head", NULL,
    "h1",       "p", "head", NULL,
    "h2",       "p", "head", NULL,
    "h3",       "p", "head", NULL,
    "h4",       "p", "head", NULL,
    "h5",       "p", "head", NULL,
    "h6",       "p", "head", NULL,
    "dir",      "p", "head", NULL,
    "address",  "p", "head", "ul", NULL,
    "pre",      "p", "head", "ul", NULL,
    "listing",  "p", "head", NULL,
    "xmp",      "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",       "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
    "dt",       "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
    "dd",       "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
    "ul",       "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
    "ol",       "p", "head", "ul", NULL,
    "menu",     "p", "head", "ul", NULL,
    "p",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",      "p", "head", NULL,
    "noscript", "p", "head", NULL,
    "center",   "font", "b", "i", "p", "head", NULL,
    "a",        "a", NULL,
    "caption",  "p", NULL,
    "colgroup", "caption", "colgroup", "col", "p", NULL,
    "col",      "caption", "col", "p", NULL,
    "table",    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
    "th",       "th", "td", NULL,
    "td",       "th", "td", "p", NULL,
    "tr",       "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",    "caption", "col", "colgroup", NULL,
    "tfoot",    "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
    "tbody",    "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
    "optgroup", "option", NULL,
    "option",   "option", NULL,
    "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
    NULL
};
550
/*
 * NULL terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
564
/*
 * The list of HTML attributes which are of content %Script;
 * These are the HTML 4 intrinsic event attributes; the entry for the
 * form reset event is spelled "onreset".
 * The array is not NULL terminated, its size must be computed with sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
590
591
Daniel Veillardb96e6431999-08-29 21:02:19 +0000592static char** htmlStartCloseIndex[100];
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000593static int htmlStartCloseIndexinitialized = 0;
594
595/************************************************************************
596 * *
597 * functions to handle HTML specific data *
598 * *
599 ************************************************************************/
600
601/**
602 * htmlInitAutoClose:
603 *
604 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
Daniel Veillardbc765302000-10-01 18:23:35 +0000605 * This is not reentrant. Call xmlInitParser() once before processing in
606 * case of use in multithreaded programs.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000607 */
608void
609htmlInitAutoClose(void) {
610 int index, i = 0;
611
612 if (htmlStartCloseIndexinitialized) return;
613
614 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
615 index = 0;
616 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
617 htmlStartCloseIndex[index++] = &htmlStartClose[i];
618 while (htmlStartClose[i] != NULL) i++;
619 i++;
620 }
Daniel Veillardbc765302000-10-01 18:23:35 +0000621 htmlStartCloseIndexinitialized = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000622}
623
624/**
625 * htmlTagLookup:
Daniel Veillardb656ebe2000-09-22 13:51:48 +0000626 * @tag: The tag name in lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000627 *
628 * Lookup the HTML tag in the ElementTable
629 *
630 * Returns the related htmlElemDescPtr or NULL if not found.
631 */
632htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000633htmlTagLookup(const xmlChar *tag) {
Daniel Veillard47f3f312000-08-27 22:40:15 +0000634 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000635
636 for (i = 0; i < (sizeof(html40ElementTable) /
637 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000638 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000639 return(&html40ElementTable[i]);
640 }
641 return(NULL);
642}
643
644/**
645 * htmlCheckAutoClose:
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000646 * @newtag: The new tag name
647 * @oldtag: The old tag name
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000648 *
649 * Checks wether the new tag is one of the registered valid tags for closing old.
650 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
651 *
652 * Returns 0 if no, 1 if yes.
653 */
654int
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000655htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000656 int i, index;
Daniel Veillard39c7d712000-09-10 16:14:55 +0000657 char **close = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000658
659 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
660
661 /* inefficient, but not a big deal */
662 for (index = 0; index < 100;index++) {
663 close = htmlStartCloseIndex[index];
664 if (close == NULL) return(0);
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000665 if (xmlStrEqual(BAD_CAST *close, newtag)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000666 }
667
668 i = close - htmlStartClose;
669 i++;
670 while (htmlStartClose[i] != NULL) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000671 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000672 return(1);
673 }
674 i++;
675 }
676 return(0);
677}
678
679/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000680 * htmlAutoCloseOnClose:
681 * @ctxt: an HTML parser context
682 * @newtag: The new tag name
683 *
684 * The HTmL DtD allows an ending tag to implicitely close other tags.
685 */
686void
687htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
688 htmlElemDescPtr info;
689 xmlChar *oldname;
690 int i;
691
692#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000693 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000694 for (i = 0;i < ctxt->nameNr;i++)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000695 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000696#endif
697
698 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000699 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000700 }
701 if (i < 0) return;
702
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000703 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000704 info = htmlTagLookup(ctxt->name);
705 if ((info == NULL) || (info->endTag == 1)) {
706#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000707 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000708#endif
709 } else {
710 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
711 ctxt->sax->error(ctxt->userData,
712 "Opening and ending tag mismatch: %s and %s\n",
713 newtag, ctxt->name);
714 ctxt->wellFormed = 0;
715 }
716 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
717 ctxt->sax->endElement(ctxt->userData, ctxt->name);
718 oldname = htmlnamePop(ctxt);
719 if (oldname != NULL) {
720#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000721 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000722#endif
723 xmlFree(oldname);
724 }
725 }
726}
727
728/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000729 * htmlAutoClose:
730 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000731 * @newtag: The new tag name or NULL
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000732 *
733 * The HTmL DtD allows a tag to implicitely close other tags.
734 * The list is kept in htmlStartClose array. This function is
735 * called when a new tag has been detected and generates the
736 * appropriates closes if possible/needed.
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000737 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillard365e13b2000-07-02 07:56:37 +0000738 * and we should check
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000739 */
740void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000741htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000742 xmlChar *oldname;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000743 while ((newtag != NULL) && (ctxt->name != NULL) &&
744 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000745#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000746 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000747#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000748 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000749 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000750 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000751 if (oldname != NULL) {
752#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000753 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000754#endif
755 xmlFree(oldname);
756 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000757 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000758 if (newtag == NULL) {
759 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
760 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
761 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
762 }
763 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000764 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
765 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
766 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
Daniel Veillard365e13b2000-07-02 07:56:37 +0000767#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000768 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
Daniel Veillard365e13b2000-07-02 07:56:37 +0000769#endif
770 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
771 ctxt->sax->endElement(ctxt->userData, ctxt->name);
772 oldname = htmlnamePop(ctxt);
773 if (oldname != NULL) {
774#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000775 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
Daniel Veillard365e13b2000-07-02 07:56:37 +0000776#endif
777 xmlFree(oldname);
778 }
779 }
780
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000781}
782
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000783/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000784 * htmlAutoCloseTag:
785 * @doc: the HTML document
786 * @name: The tag name
787 * @elem: the HTML element
788 *
789 * The HTmL DtD allows a tag to implicitely close other tags.
790 * The list is kept in htmlStartClose array. This function checks
791 * if the element or one of it's children would autoclose the
792 * given tag.
793 *
794 * Returns 1 if autoclose, 0 otherwise
795 */
796int
797htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
798 htmlNodePtr child;
799
800 if (elem == NULL) return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000801 if (xmlStrEqual(name, elem->name)) return(0);
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000802 if (htmlCheckAutoClose(elem->name, name)) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000803 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000804 while (child != NULL) {
805 if (htmlAutoCloseTag(doc, name, child)) return(1);
806 child = child->next;
807 }
808 return(0);
809}
810
811/**
812 * htmlIsAutoClosed:
813 * @doc: the HTML document
814 * @elem: the HTML element
815 *
816 * The HTmL DtD allows a tag to implicitely close other tags.
817 * The list is kept in htmlStartClose array. This function checks
818 * if a tag is autoclosed by one of it's child
819 *
820 * Returns 1 if autoclosed, 0 otherwise
821 */
822int
823htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
824 htmlNodePtr child;
825
826 if (elem == NULL) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000827 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000828 while (child != NULL) {
829 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
830 child = child->next;
831 }
832 return(0);
833}
834
835/**
Daniel Veillardbe803962000-06-28 23:40:59 +0000836 * htmlCheckImplied:
837 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000838 * @newtag: The new tag name
Daniel Veillardbe803962000-06-28 23:40:59 +0000839 *
840 * The HTmL DtD allows a tag to exists only implicitely
841 * called when a new tag has been detected and generates the
842 * appropriates implicit tags if missing
843 */
844void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000845htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000846 if (xmlStrEqual(newtag, BAD_CAST"html"))
Daniel Veillardbe803962000-06-28 23:40:59 +0000847 return;
848 if (ctxt->nameNr <= 0) {
849#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000850 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
Daniel Veillardbe803962000-06-28 23:40:59 +0000851#endif
852 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
853 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
854 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
855 }
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000856 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
Daniel Veillardbe803962000-06-28 23:40:59 +0000857 return;
858 if (ctxt->nameNr <= 1) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000859 if ((xmlStrEqual(newtag, BAD_CAST"script")) ||
860 (xmlStrEqual(newtag, BAD_CAST"style")) ||
861 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
862 (xmlStrEqual(newtag, BAD_CAST"link")) ||
863 (xmlStrEqual(newtag, BAD_CAST"title")) ||
864 (xmlStrEqual(newtag, BAD_CAST"base"))) {
Daniel Veillardbe803962000-06-28 23:40:59 +0000865 /*
866 * dropped OBJECT ... i you put it first BODY will be
867 * assumed !
868 */
869#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000870 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
Daniel Veillardbe803962000-06-28 23:40:59 +0000871#endif
872 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
873 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
874 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
875 } else {
876#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000877 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
Daniel Veillardbe803962000-06-28 23:40:59 +0000878#endif
879 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
880 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
881 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
882 }
883 }
884}
885
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000886/**
887 * htmlCheckParagraph
888 * @ctxt: an HTML parser context
889 *
890 * Check whether a p element need to be implied before inserting
891 * characters in the current element.
892 *
893 * Returns 1 if a paragraph has been inserted, 0 if not and -1
894 * in case of error.
895 */
896
897int
898htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
899 const xmlChar *tag;
900 int i;
901
902 if (ctxt == NULL)
903 return(-1);
904 tag = ctxt->name;
905 if (tag == NULL) {
906 htmlAutoClose(ctxt, BAD_CAST"p");
907 htmlCheckImplied(ctxt, BAD_CAST"p");
908 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
909 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
910 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
911 return(1);
912 }
913 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000914 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000915#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000916 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000917#endif
918 htmlAutoClose(ctxt, BAD_CAST"p");
919 htmlCheckImplied(ctxt, BAD_CAST"p");
920 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
921 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
922 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
923 return(1);
924 }
925 }
926 return(0);
927}
928
Daniel Veillard47e12f22000-10-15 14:24:25 +0000929/**
930 * htmlIsScriptAttribute:
931 * @name: an attribute name
932 *
933 * Check if an attribute is of content type Script
934 *
935 * Returns 1 is the attribute is a script 0 otherwise
936 */
937int
938htmlIsScriptAttribute(const xmlChar *name) {
939 int i;
940
941 if (name == NULL)
942 return(0);
943 /*
944 * all script attributes start with 'on'
945 */
946 if ((name[0] != 'o') || (name[1] != 'n'))
947 return(0);
948 for (i = 0;
949 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
950 i++) {
951 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
952 return(1);
953 }
954 return(0);
955}
956
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000957/************************************************************************
958 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000959 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000960 * *
961 ************************************************************************/
962
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000963
964htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000965/*
Daniel Veillard47f3f312000-08-27 22:40:15 +0000966 * the 4 absolute ones, plus apostrophe.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000967 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000968{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
969{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard47f3f312000-08-27 22:40:15 +0000970{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000971{ 60, "lt", "less-than sign, U+003C ISOnum" },
972{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000973
974/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000975 * A bunch still in the 128-255 range
976 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000977 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000978{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
979{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
980{ 162, "cent", "cent sign, U+00A2 ISOnum" },
981{ 163, "pound","pound sign, U+00A3 ISOnum" },
982{ 164, "curren","currency sign, U+00A4 ISOnum" },
983{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
984{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
985{ 167, "sect", "section sign, U+00A7 ISOnum" },
986{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
987{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
988{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
989{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
990{ 172, "not", "not sign, U+00AC ISOnum" },
991{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
992{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
993{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
994{ 176, "deg", "degree sign, U+00B0 ISOnum" },
995{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
996{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
997{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
998{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
999{ 181, "micro","micro sign, U+00B5 ISOnum" },
1000{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001001{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001002{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1003{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1004{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001005{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001006{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1007{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1008{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1009{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1010{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1011{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1012{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1013{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1014{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1015{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1016{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1017{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1018{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1019{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1020{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1021{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1022{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1023{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1024{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1025{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1026{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1027{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1028{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1029{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1030{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1031{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1032{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1033{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001034{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001035{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1036{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1037{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1038{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1039{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1040{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1041{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1042{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1043{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1044{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1045{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1046{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1047{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1048{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1049{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1050{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1051{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1052{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1053{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1054{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1055{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1056{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1057{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1058{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1059{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1060{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1061{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1062{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1063{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1064{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1065{ 247, "divide","division sign, U+00F7 ISOnum" },
1066{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1067{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1068{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1069{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1070{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1071{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1072{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1073{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001074
Daniel Veillard47f3f312000-08-27 22:40:15 +00001075{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1076{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1077{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1078{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1079{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1080
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001081/*
1082 * Anything below should really be kept as entities references
1083 */
1084{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001085
Daniel Veillard47f3f312000-08-27 22:40:15 +00001086{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1087{ 732, "tilde","small tilde, U+02DC ISOdia" },
1088
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001089{ 913, "Alpha","greek capital letter alpha, U+0391" },
1090{ 914, "Beta", "greek capital letter beta, U+0392" },
1091{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1092{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1093{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1094{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1095{ 919, "Eta", "greek capital letter eta, U+0397" },
1096{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1097{ 921, "Iota", "greek capital letter iota, U+0399" },
1098{ 922, "Kappa","greek capital letter kappa, U+039A" },
1099{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1100{ 924, "Mu", "greek capital letter mu, U+039C" },
1101{ 925, "Nu", "greek capital letter nu, U+039D" },
1102{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1103{ 927, "Omicron","greek capital letter omicron, U+039F" },
1104{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1105{ 929, "Rho", "greek capital letter rho, U+03A1" },
1106{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1107{ 932, "Tau", "greek capital letter tau, U+03A4" },
1108{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1109{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1110{ 935, "Chi", "greek capital letter chi, U+03A7" },
1111{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1112{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001113
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001114{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1115{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1116{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1117{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1118{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1119{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1120{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1121{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1122{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1123{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1124{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1125{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1126{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1127{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1128{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1129{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1130{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1131{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1132{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1133{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1134{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1135{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1136{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1137{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1138{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1139{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1140{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1141{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001142
Daniel Veillard47f3f312000-08-27 22:40:15 +00001143{ 8194, "ensp", "en space, U+2002 ISOpub" },
1144{ 8195, "emsp", "em space, U+2003 ISOpub" },
1145{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1146{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1147{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1148{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1149{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1150{ 8211, "ndash","en dash, U+2013 ISOpub" },
1151{ 8212, "mdash","em dash, U+2014 ISOpub" },
1152{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1153{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1154{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1155{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1156{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1157{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1158{ 8224, "dagger","dagger, U+2020 ISOpub" },
1159{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1160
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001161{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1162{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001163
1164{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1165
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001166{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1167{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001168
1169{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1170{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1171
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001172{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1173{ 8260, "frasl","fraction slash, U+2044 NEW" },
1174
Daniel Veillard47f3f312000-08-27 22:40:15 +00001175{ 8364, "euro", "euro sign, U+20AC NEW" },
1176
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001177{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001178{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001179{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1180{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1181{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1182{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1183{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1184{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1185{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1186{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1187{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1188{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1189{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1190{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1191{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1192{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1193
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001194{ 8704, "forall","for all, U+2200 ISOtech" },
1195{ 8706, "part", "partial differential, U+2202 ISOtech" },
1196{ 8707, "exist","there exists, U+2203 ISOtech" },
1197{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1198{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1199{ 8712, "isin", "element of, U+2208 ISOtech" },
1200{ 8713, "notin","not an element of, U+2209 ISOtech" },
1201{ 8715, "ni", "contains as member, U+220B ISOtech" },
1202{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1203{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1204{ 8722, "minus","minus sign, U+2212 ISOtech" },
1205{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1206{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1207{ 8733, "prop", "proportional to, U+221D ISOtech" },
1208{ 8734, "infin","infinity, U+221E ISOtech" },
1209{ 8736, "ang", "angle, U+2220 ISOamso" },
1210{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1211{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1212{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1213{ 8746, "cup", "union = cup, U+222A ISOtech" },
1214{ 8747, "int", "integral, U+222B ISOtech" },
1215{ 8756, "there4","therefore, U+2234 ISOtech" },
1216{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1217{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1218{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1219{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1220{ 8801, "equiv","identical to, U+2261 ISOtech" },
1221{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1222{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1223{ 8834, "sub", "subset of, U+2282 ISOtech" },
1224{ 8835, "sup", "superset of, U+2283 ISOtech" },
1225{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1226{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1227{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1228{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1229{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1230{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1231{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1232{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1233{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1234{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1235{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1236{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1237{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1238{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1239
1240{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1241{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1242{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1243{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1244
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001245};
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001246
1247/************************************************************************
1248 * *
1249 * Commodity functions to handle entities *
1250 * *
1251 ************************************************************************/
1252
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size.
 * The macro expands to a block which executes return(NULL) when the
 * reallocation fails, so it is only usable inside functions returning
 * a pointer.
 *
 * The result of xmlRealloc() goes through a temporary: on failure the
 * previous allocation is still valid, so it is released here instead
 * of being leaked by overwriting the only pointer to it.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                buffer##_size * sizeof(xmlChar));	\
    if (growBuffer_tmp == NULL) {					\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        buffer = NULL;							\
        return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1264
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001265/**
1266 * htmlEntityLookup:
1267 * @name: the entity name
1268 *
1269 * Lookup the given entity in EntitiesTable
1270 *
1271 * TODO: the linear scan is really ugly, an hash table is really needed.
1272 *
1273 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1274 */
1275htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001276htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001277 int i;
1278
1279 for (i = 0;i < (sizeof(html40EntitiesTable)/
1280 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001281 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001282#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001283 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001284#endif
1285 return(&html40EntitiesTable[i]);
1286 }
1287 }
1288 return(NULL);
1289}
1290
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001291/**
Daniel Veillard47f3f312000-08-27 22:40:15 +00001292 * htmlEntityValueLookup:
1293 * @value: the entity's unicode value
1294 *
1295 * Lookup the given entity in EntitiesTable
1296 *
1297 * TODO: the linear scan is really ugly, an hash table is really needed.
1298 *
1299 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1300 */
1301htmlEntityDescPtr
1302htmlEntityValueLookup(int value) {
1303 int i;
1304#ifdef DEBUG
1305 int lv = 0;
1306#endif
1307
1308 for (i = 0;i < (sizeof(html40EntitiesTable)/
1309 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard4b0755c2000-09-25 14:26:28 +00001310 if ((unsigned int) html40EntitiesTable[i].value >= value) {
1311 if ((unsigned int) html40EntitiesTable[i].value > value)
Daniel Veillard47f3f312000-08-27 22:40:15 +00001312 break;
1313#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001314 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
Daniel Veillard47f3f312000-08-27 22:40:15 +00001315#endif
1316 return(&html40EntitiesTable[i]);
1317 }
1318#ifdef DEBUG
1319 if (lv > html40EntitiesTable[i].value) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001320 xmlGenericError(xmlGenericErrorContext,
1321 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
Daniel Veillard47f3f312000-08-27 22:40:15 +00001322 lv, html40EntitiesTable[i].value);
1323 }
1324 lv = html40EntitiesTable[i].value;
1325#endif
1326 }
1327 return(NULL);
1328}
1329
1330/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001331 * UTF8ToHtml:
1332 * @out: a pointer to an array of bytes to store the result
1333 * @outlen: the length of @out
1334 * @in: a pointer to an array of UTF-8 chars
1335 * @inlen: the length of @in
1336 *
1337 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1338 * plus HTML entities block of chars out.
1339 *
1340 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is non-negative, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
1344 */
1345int
1346UTF8ToHtml(unsigned char* out, int *outlen,
1347 const unsigned char* in, int *inlen) {
1348 const unsigned char* processed = in;
1349 const unsigned char* outend;
1350 const unsigned char* outstart = out;
1351 const unsigned char* instart = in;
1352 const unsigned char* inend;
1353 unsigned int c, d;
1354 int trailing;
1355
1356 if (in == NULL) {
1357 /*
1358 * initialization nothing to do
1359 */
1360 *outlen = 0;
1361 *inlen = 0;
1362 return(0);
1363 }
1364 inend = in + (*inlen);
1365 outend = out + (*outlen);
1366 while (in < inend) {
1367 d = *in++;
1368 if (d < 0x80) { c= d; trailing= 0; }
1369 else if (d < 0xC0) {
1370 /* trailing byte in leading position */
1371 *outlen = out - outstart;
1372 *inlen = processed - instart;
1373 return(-2);
1374 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1375 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1376 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1377 else {
1378 /* no chance for this in Ascii */
1379 *outlen = out - outstart;
1380 *inlen = processed - instart;
1381 return(-2);
1382 }
1383
1384 if (inend - in < trailing) {
1385 break;
1386 }
1387
1388 for ( ; trailing; trailing--) {
1389 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1390 break;
1391 c <<= 6;
1392 c |= d & 0x3F;
1393 }
1394
1395 /* assertion: c is a single UTF-4 value */
1396 if (c < 0x80) {
Daniel Veillarde010c172000-08-28 10:04:51 +00001397 if (out + 1 >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001398 break;
1399 *out++ = c;
1400 } else {
Daniel Veillard47f3f312000-08-27 22:40:15 +00001401 int len;
1402 htmlEntityDescPtr ent;
1403
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001404 /*
1405 * Try to lookup a predefined HTML entity for it
1406 */
1407
Daniel Veillard47f3f312000-08-27 22:40:15 +00001408 ent = htmlEntityValueLookup(c);
1409 if (ent == NULL) {
1410 /* no chance for this in Ascii */
1411 *outlen = out - outstart;
1412 *inlen = processed - instart;
1413 return(-2);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001414 }
Daniel Veillard47f3f312000-08-27 22:40:15 +00001415 len = strlen(ent->name);
Daniel Veillarde010c172000-08-28 10:04:51 +00001416 if (out + 2 + len >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001417 break;
1418 *out++ = '&';
Daniel Veillard47f3f312000-08-27 22:40:15 +00001419 memcpy(out, ent->name, len);
1420 out += len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001421 *out++ = ';';
1422 }
1423 processed = in;
1424 }
1425 *outlen = out - outstart;
1426 *inlen = processed - instart;
1427 return(0);
1428}
1429
Daniel Veillarde010c172000-08-28 10:04:51 +00001430/**
1431 * htmlEncodeEntities:
1432 * @out: a pointer to an array of bytes to store the result
1433 * @outlen: the length of @out
1434 * @in: a pointer to an array of UTF-8 chars
1435 * @inlen: the length of @in
1436 * @quoteChar: the quote character to escape (' or ") or zero.
1437 *
1438 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1439 * plus HTML entities block of chars out.
1440 *
1441 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1442 * The value of @inlen after return is the number of octets consumed
1443 * as the return value is positive, else unpredictiable.
1444 * The value of @outlen after return is the number of octets consumed.
1445 */
1446int
1447htmlEncodeEntities(unsigned char* out, int *outlen,
1448 const unsigned char* in, int *inlen, int quoteChar) {
1449 const unsigned char* processed = in;
1450 const unsigned char* outend = out + (*outlen);
1451 const unsigned char* outstart = out;
1452 const unsigned char* instart = in;
1453 const unsigned char* inend = in + (*inlen);
1454 unsigned int c, d;
1455 int trailing;
1456
1457 while (in < inend) {
1458 d = *in++;
1459 if (d < 0x80) { c= d; trailing= 0; }
1460 else if (d < 0xC0) {
1461 /* trailing byte in leading position */
1462 *outlen = out - outstart;
1463 *inlen = processed - instart;
1464 return(-2);
1465 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1466 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1467 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1468 else {
1469 /* no chance for this in Ascii */
1470 *outlen = out - outstart;
1471 *inlen = processed - instart;
1472 return(-2);
1473 }
1474
1475 if (inend - in < trailing)
1476 break;
1477
1478 while (trailing--) {
1479 if (((d= *in++) & 0xC0) != 0x80) {
1480 *outlen = out - outstart;
1481 *inlen = processed - instart;
1482 return(-2);
1483 }
1484 c <<= 6;
1485 c |= d & 0x3F;
1486 }
1487
1488 /* assertion: c is a single UTF-4 value */
1489 if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1490 if (out >= outend)
1491 break;
1492 *out++ = c;
1493 } else {
1494 htmlEntityDescPtr ent;
1495 const char *cp;
1496 char nbuf[16];
1497 int len;
1498
1499 /*
1500 * Try to lookup a predefined HTML entity for it
1501 */
1502 ent = htmlEntityValueLookup(c);
1503 if (ent == NULL) {
1504 sprintf(nbuf, "#%u", c);
1505 cp = nbuf;
1506 }
1507 else
1508 cp = ent->name;
1509 len = strlen(cp);
1510 if (out + 2 + len > outend)
1511 break;
1512 *out++ = '&';
1513 memcpy(out, cp, len);
1514 out += len;
1515 *out++ = ';';
1516 }
1517 processed = in;
1518 }
1519 *outlen = out - outstart;
1520 *inlen = processed - instart;
1521 return(0);
1522}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001523
1524/**
1525 * htmlDecodeEntities:
1526 * @ctxt: the parser context
1527 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001528 * @end: an end marker xmlChar, 0 if none
1529 * @end2: an end marker xmlChar, 0 if none
1530 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001531 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001532 * Subtitute the HTML entities by their value
1533 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001534 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001535 *
1536 * Returns A newly allocated string with the substitution done. The caller
1537 * must deallocate it !
1538 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001539xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001540htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001541 xmlChar end, xmlChar end2, xmlChar end3) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001542 xmlChar *name = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001543 xmlChar *buffer = NULL;
1544 unsigned int buffer_size = 0;
1545 unsigned int nbchars = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001546 htmlEntityDescPtr ent;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001547 unsigned int max = (unsigned int) len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001548 int c,l;
1549
1550 if (ctxt->depth > 40) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00001551 ctxt->errNo = XML_ERR_ENTITY_LOOP;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001552 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1553 ctxt->sax->error(ctxt->userData,
1554 "Detected entity reference loop\n");
1555 ctxt->wellFormed = 0;
1556 ctxt->disableSAX = 1;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001557 return(NULL);
1558 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001559
1560 /*
1561 * allocate a translation buffer.
1562 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001563 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001564 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001565 if (buffer == NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001566 perror("xmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001567 return(NULL);
1568 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001569
1570 /*
1571 * Ok loop until we reach one of the ending char or a size limit.
1572 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001573 c = CUR_CHAR(l);
1574 while ((nbchars < max) && (c != end) &&
1575 (c != end2) && (c != end3)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001576
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001577 if (c == 0) break;
1578 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1579 int val = htmlParseCharRef(ctxt);
1580 COPY_BUF(0,buffer,nbchars,val);
1581 NEXTL(l);
1582 } else if ((c == '&') && (ctxt->token != '&')) {
1583 ent = htmlParseEntityRef(ctxt, &name);
1584 if (name != NULL) {
1585 if (ent != NULL) {
1586 int val = ent->value;
1587 COPY_BUF(0,buffer,nbchars,val);
1588 NEXTL(l);
1589 } else {
1590 const xmlChar *cur = name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001591
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001592 buffer[nbchars++] = '&';
1593 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1594 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001595 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001596 while (*cur != 0) {
1597 buffer[nbchars++] = *cur++;
1598 }
1599 buffer[nbchars++] = ';';
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001600 }
1601 }
1602 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001603 COPY_BUF(l,buffer,nbchars,c);
1604 NEXTL(l);
1605 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001606 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001607 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001608 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001609 c = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001610 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001611 buffer[nbchars++] = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001612 return(buffer);
1613}
1614
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001615/************************************************************************
1616 * *
1617 * Commodity functions to handle streams *
1618 * *
1619 ************************************************************************/
1620
1621/**
1622 * htmlFreeInputStream:
1623 * @input: an htmlParserInputPtr
1624 *
1625 * Free up an input stream.
1626 */
1627void
1628htmlFreeInputStream(htmlParserInputPtr input) {
1629 if (input == NULL) return;
1630
1631 if (input->filename != NULL) xmlFree((char *) input->filename);
1632 if (input->directory != NULL) xmlFree((char *) input->directory);
1633 if ((input->free != NULL) && (input->base != NULL))
1634 input->free((xmlChar *) input->base);
1635 if (input->buf != NULL)
1636 xmlFreeParserInputBuffer(input->buf);
1637 memset(input, -1, sizeof(htmlParserInput));
1638 xmlFree(input);
1639}
1640
1641/**
1642 * htmlNewInputStream:
1643 * @ctxt: an HTML parser context
1644 *
1645 * Create a new input stream structure
1646 * Returns the new input stream or NULL
1647 */
1648htmlParserInputPtr
1649htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1650 htmlParserInputPtr input;
1651
1652 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1653 if (input == NULL) {
1654 ctxt->errNo = XML_ERR_NO_MEMORY;
1655 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1656 ctxt->sax->error(ctxt->userData,
1657 "malloc: couldn't allocate a new input stream\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001658 return(NULL);
1659 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001660 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001661 input->filename = NULL;
1662 input->directory = NULL;
1663 input->base = NULL;
1664 input->cur = NULL;
1665 input->buf = NULL;
1666 input->line = 1;
1667 input->col = 1;
1668 input->buf = NULL;
1669 input->free = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001670 input->version = NULL;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001671 input->consumed = 0;
1672 input->length = 0;
1673 return(input);
1674}
1675
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001676
1677/************************************************************************
1678 * *
1679 * Commodity functions, cleanup needed ? *
1680 * *
1681 ************************************************************************/
1682
1683/**
1684 * areBlanks:
1685 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001686 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001687 * @len: the size of @str
1688 *
1689 * Is this a sequence of blank chars that one can ignore ?
1690 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001691 * Returns 1 if ignorable 0 otherwise.
1692 */
1693
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001694static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001695 int i;
1696 xmlNodePtr lastChild;
1697
1698 for (i = 0;i < len;i++)
1699 if (!(IS_BLANK(str[i]))) return(0);
1700
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001701 if (CUR == 0) return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001702 if (CUR != '<') return(0);
Daniel Veillarde010c172000-08-28 10:04:51 +00001703 if (ctxt->name == NULL)
1704 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001705 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
Daniel Veillard4948eb42000-08-29 09:41:15 +00001706 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001707 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
Daniel Veillarde010c172000-08-28 10:04:51 +00001708 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001709 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
Daniel Veillarde010c172000-08-28 10:04:51 +00001710 return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001711 if (ctxt->node == NULL) return(0);
1712 lastChild = xmlGetLastChild(ctxt->node);
1713 if (lastChild == NULL) {
1714 if (ctxt->node->content != NULL) return(0);
Daniel Veillardc4f4f0b2000-10-29 17:46:30 +00001715 } else if (xmlNodeIsText(lastChild)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001716 return(0);
Daniel Veillardc4f4f0b2000-10-29 17:46:30 +00001717 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1718 return(0);
1719 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1720 return(0);
1721 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1722 return(0);
1723 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001724 return(1);
1725}
1726
1727/**
1728 * htmlHandleEntity:
1729 * @ctxt: an HTML parser context
1730 * @entity: an XML entity pointer.
1731 *
1732 * Default handling of an HTML entity, call the parser with the
1733 * substitution string
1734 */
1735
1736void
1737htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1738 int len;
1739
1740 if (entity->content == NULL) {
1741 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1742 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1743 entity->name);
1744 ctxt->wellFormed = 0;
1745 return;
1746 }
1747 len = xmlStrlen(entity->content);
1748
1749 /*
1750 * Just handle the content as a set of chars.
1751 */
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001752 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001753 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1754 ctxt->sax->characters(ctxt->userData, entity->content, len);
1755
1756}
1757
1758/**
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001759 * htmlNewDocNoDtD:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001760 * @URI: URI for the dtd, or NULL
1761 * @ExternalID: the external ID of the DTD, or NULL
1762 *
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001763 * Returns a new document, do not intialize the DTD if not provided
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001764 */
1765htmlDocPtr
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001766htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001767 xmlDocPtr cur;
1768
1769 /*
1770 * Allocate a new document and fill the fields.
1771 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001772 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001773 if (cur == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001774 xmlGenericError(xmlGenericErrorContext,
1775 "xmlNewDoc : malloc failed\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001776 return(NULL);
1777 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001778 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001779
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001780 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001781 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001782 cur->intSubset = NULL;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001783 if ((ExternalID != NULL) ||
1784 (URI != NULL))
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001785 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe803962000-06-28 23:40:59 +00001786 cur->doc = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001787 cur->name = NULL;
Daniel Veillardcf461992000-03-14 18:30:20 +00001788 cur->children = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001789 cur->extSubset = NULL;
1790 cur->oldNs = NULL;
1791 cur->encoding = NULL;
1792 cur->standalone = 1;
1793 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001794 cur->ids = NULL;
1795 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001796#ifndef XML_WITHOUT_CORBA
1797 cur->_private = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001798#endif
1799 return(cur);
1800}
1801
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001802/**
1803 * htmlNewDoc:
1804 * @URI: URI for the dtd, or NULL
1805 * @ExternalID: the external ID of the DTD, or NULL
1806 *
1807 * Returns a new document
1808 */
1809htmlDocPtr
1810htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1811 if ((URI == NULL) && (ExternalID == NULL))
1812 return(htmlNewDocNoDtD(
1813 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1814 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1815
1816 return(htmlNewDocNoDtD(URI, ExternalID));
1817}
1818
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001819
1820/************************************************************************
1821 * *
1822 * The parser itself *
1823 * Relates to http://www.w3.org/TR/html40 *
1824 * *
1825 ************************************************************************/
1826
1827/************************************************************************
1828 * *
1829 * The parser itself *
1830 * *
1831 ************************************************************************/
1832
1833/**
1834 * htmlParseHTMLName:
1835 * @ctxt: an HTML parser context
1836 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001837 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001838 * since HTML names are not case-sensitive.
1839 *
1840 * Returns the Tag Name parsed or NULL
1841 */
1842
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001843xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001844htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001845 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001846 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001847 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001848
1849 if (!IS_LETTER(CUR) && (CUR != '_') &&
1850 (CUR != ':')) return(NULL);
1851
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001852 while ((i < HTML_PARSER_BUFFER_SIZE) &&
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001853 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
Daniel Veillarde8282ed2000-10-10 23:01:31 +00001854 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001855 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001856 else loc[i] = CUR;
1857 i++;
1858
1859 NEXT;
1860 }
1861
1862 ret = xmlStrndup(loc, i);
1863
1864 return(ret);
1865}
1866
1867/**
1868 * htmlParseName:
1869 * @ctxt: an HTML parser context
1870 *
1871 * parse an HTML name, this routine is case sensistive.
1872 *
1873 * Returns the Name parsed or NULL
1874 */
1875
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001876xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001877htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001878 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001879 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001880
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001881 GROW;
1882 if (!IS_LETTER(CUR) && (CUR != '_')) {
1883 return(NULL);
1884 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001885
1886 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1887 (CUR == '.') || (CUR == '-') ||
1888 (CUR == '_') || (CUR == ':') ||
1889 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001890 (IS_EXTENDER(CUR))) {
1891 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001892 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001893 if (len >= HTML_MAX_NAMELEN) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001894 xmlGenericError(xmlGenericErrorContext,
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001895 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1896 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1897 (CUR == '.') || (CUR == '-') ||
1898 (CUR == '_') || (CUR == ':') ||
1899 (IS_COMBINING(CUR)) ||
1900 (IS_EXTENDER(CUR)))
1901 NEXT;
1902 break;
1903 }
1904 }
1905 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001906}
1907
1908/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001909 * htmlParseHTMLAttribute:
1910 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001911 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001912 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001913 * parse an HTML attribute value till the stop (quote), if
1914 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001915 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001916 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001917 */
1918
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001919xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001920htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001921 xmlChar *buffer = NULL;
1922 int buffer_size = 0;
1923 xmlChar *out = NULL;
1924 xmlChar *name = NULL;
1925
1926 xmlChar *cur = NULL;
1927 htmlEntityDescPtr ent;
1928
1929 /*
1930 * allocate a translation buffer.
1931 */
Daniel Veillard7eda8452000-10-14 23:38:43 +00001932 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard71b656e2000-01-05 14:46:17 +00001933 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1934 if (buffer == NULL) {
1935 perror("htmlParseHTMLAttribute: malloc failed");
1936 return(NULL);
1937 }
1938 out = buffer;
1939
1940 /*
1941 * Ok loop until we reach one of the ending chars
1942 */
1943 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1944 if ((stop == 0) && (IS_BLANK(CUR))) break;
1945 if (CUR == '&') {
1946 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001947 unsigned int c;
1948 int bits;
1949
1950 c = htmlParseCharRef(ctxt);
1951 if (c < 0x80)
1952 { *out++ = c; bits= -6; }
1953 else if (c < 0x800)
1954 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1955 else if (c < 0x10000)
1956 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1957 else
1958 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1959
1960 for ( ; bits >= 0; bits-= 6) {
1961 *out++ = ((c >> bits) & 0x3F) | 0x80;
1962 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001963 } else {
1964 ent = htmlParseEntityRef(ctxt, &name);
1965 if (name == NULL) {
1966 *out++ = '&';
1967 if (out - buffer > buffer_size - 100) {
1968 int index = out - buffer;
1969
1970 growBuffer(buffer);
1971 out = &buffer[index];
1972 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001973 } else if (ent == NULL) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001974 *out++ = '&';
1975 cur = name;
1976 while (*cur != 0) {
1977 if (out - buffer > buffer_size - 100) {
1978 int index = out - buffer;
1979
1980 growBuffer(buffer);
1981 out = &buffer[index];
1982 }
1983 *out++ = *cur++;
1984 }
1985 xmlFree(name);
1986 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001987 unsigned int c;
1988 int bits;
1989
Daniel Veillard71b656e2000-01-05 14:46:17 +00001990 if (out - buffer > buffer_size - 100) {
1991 int index = out - buffer;
1992
1993 growBuffer(buffer);
1994 out = &buffer[index];
1995 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001996 c = (xmlChar)ent->value;
1997 if (c < 0x80)
1998 { *out++ = c; bits= -6; }
1999 else if (c < 0x800)
2000 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2001 else if (c < 0x10000)
2002 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2003 else
2004 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2005
2006 for ( ; bits >= 0; bits-= 6) {
2007 *out++ = ((c >> bits) & 0x3F) | 0x80;
2008 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00002009 xmlFree(name);
2010 }
2011 }
2012 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002013 unsigned int c;
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00002014 int bits, l;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002015
Daniel Veillard71b656e2000-01-05 14:46:17 +00002016 if (out - buffer > buffer_size - 100) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002017 int index = out - buffer;
2018
2019 growBuffer(buffer);
2020 out = &buffer[index];
2021 }
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00002022 c = CUR_CHAR(l);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002023 if (c < 0x80)
2024 { *out++ = c; bits= -6; }
2025 else if (c < 0x800)
2026 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2027 else if (c < 0x10000)
2028 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2029 else
2030 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2031
2032 for ( ; bits >= 0; bits-= 6) {
2033 *out++ = ((c >> bits) & 0x3F) | 0x80;
Daniel Veillard71b656e2000-01-05 14:46:17 +00002034 }
2035 NEXT;
2036 }
2037 }
2038 *out++ = 0;
2039 return(buffer);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002040}
2041
2042/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002043 * htmlParseNmtoken:
2044 * @ctxt: an HTML parser context
2045 *
2046 * parse an HTML Nmtoken.
2047 *
2048 * Returns the Nmtoken parsed or NULL
2049 */
2050
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002051xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002052htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002053 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002054 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002055
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002056 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002057 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2058 (CUR == '.') || (CUR == '-') ||
2059 (CUR == '_') || (CUR == ':') ||
2060 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002061 (IS_EXTENDER(CUR))) {
2062 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002063 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002064 if (len >= HTML_MAX_NAMELEN) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002065 xmlGenericError(xmlGenericErrorContext,
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002066 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2067 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2068 (CUR == '.') || (CUR == '-') ||
2069 (CUR == '_') || (CUR == ':') ||
2070 (IS_COMBINING(CUR)) ||
2071 (IS_EXTENDER(CUR)))
2072 NEXT;
2073 break;
2074 }
2075 }
2076 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002077}
2078
2079/**
2080 * htmlParseEntityRef:
2081 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002082 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002083 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002084 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002085 *
2086 * [68] EntityRef ::= '&' Name ';'
2087 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002088 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2089 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002090 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002091htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002092htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2093 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002094 htmlEntityDescPtr ent = NULL;
2095 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002096
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002097 if (CUR == '&') {
2098 NEXT;
2099 name = htmlParseName(ctxt);
2100 if (name == NULL) {
2101 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2102 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2103 ctxt->wellFormed = 0;
2104 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002105 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002106 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002107 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002108
2109 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002110 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002111 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002112 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002113 if (ent != NULL) /* OK that's ugly !!! */
2114 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002115 } else {
2116 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2117 ctxt->sax->error(ctxt->userData,
2118 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00002119 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002120 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002121 }
2122 }
2123 return(ent);
2124}
2125
2126/**
2127 * htmlParseAttValue:
2128 * @ctxt: an HTML parser context
2129 *
2130 * parse a value for an attribute
2131 * Note: the parser won't do substitution of entities here, this
2132 * will be handled later in xmlStringGetNodeList, unless it was
2133 * asked for ctxt->replaceEntities != 0
2134 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002135 * Returns the AttValue parsed or NULL.
2136 */
2137
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002138xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002139htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002140 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002141
2142 if (CUR == '"') {
2143 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002144 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002145 if (CUR != '"') {
2146 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2147 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2148 ctxt->wellFormed = 0;
2149 } else
2150 NEXT;
2151 } else if (CUR == '\'') {
2152 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002153 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002154 if (CUR != '\'') {
2155 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2156 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2157 ctxt->wellFormed = 0;
2158 } else
2159 NEXT;
2160 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002161 /*
2162 * That's an HTMLism, the attribute value may not be quoted
2163 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002164 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002165 if (ret == NULL) {
2166 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2167 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2168 ctxt->wellFormed = 0;
2169 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002170 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002171 return(ret);
2172}
2173
2174/**
2175 * htmlParseSystemLiteral:
2176 * @ctxt: an HTML parser context
2177 *
2178 * parse an HTML Literal
2179 *
2180 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2181 *
2182 * Returns the SystemLiteral parsed or NULL
2183 */
2184
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002185xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002186htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002187 const xmlChar *q;
2188 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002189
2190 if (CUR == '"') {
2191 NEXT;
2192 q = CUR_PTR;
2193 while ((IS_CHAR(CUR)) && (CUR != '"'))
2194 NEXT;
2195 if (!IS_CHAR(CUR)) {
2196 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2197 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2198 ctxt->wellFormed = 0;
2199 } else {
2200 ret = xmlStrndup(q, CUR_PTR - q);
2201 NEXT;
2202 }
2203 } else if (CUR == '\'') {
2204 NEXT;
2205 q = CUR_PTR;
2206 while ((IS_CHAR(CUR)) && (CUR != '\''))
2207 NEXT;
2208 if (!IS_CHAR(CUR)) {
2209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2210 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2211 ctxt->wellFormed = 0;
2212 } else {
2213 ret = xmlStrndup(q, CUR_PTR - q);
2214 NEXT;
2215 }
2216 } else {
2217 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardcf461992000-03-14 18:30:20 +00002218 ctxt->sax->error(ctxt->userData,
2219 "SystemLiteral \" or ' expected\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002220 ctxt->wellFormed = 0;
2221 }
2222
2223 return(ret);
2224}
2225
2226/**
2227 * htmlParsePubidLiteral:
2228 * @ctxt: an HTML parser context
2229 *
2230 * parse an HTML public literal
2231 *
2232 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2233 *
2234 * Returns the PubidLiteral parsed or NULL.
2235 */
2236
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002237xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002238htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002239 const xmlChar *q;
2240 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002241 /*
2242 * Name ::= (Letter | '_') (NameChar)*
2243 */
2244 if (CUR == '"') {
2245 NEXT;
2246 q = CUR_PTR;
2247 while (IS_PUBIDCHAR(CUR)) NEXT;
2248 if (CUR != '"') {
2249 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2250 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2251 ctxt->wellFormed = 0;
2252 } else {
2253 ret = xmlStrndup(q, CUR_PTR - q);
2254 NEXT;
2255 }
2256 } else if (CUR == '\'') {
2257 NEXT;
2258 q = CUR_PTR;
2259 while ((IS_LETTER(CUR)) && (CUR != '\''))
2260 NEXT;
2261 if (!IS_LETTER(CUR)) {
2262 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2263 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2264 ctxt->wellFormed = 0;
2265 } else {
2266 ret = xmlStrndup(q, CUR_PTR - q);
2267 NEXT;
2268 }
2269 } else {
2270 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2271 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2272 ctxt->wellFormed = 0;
2273 }
2274
2275 return(ret);
2276}
2277
2278/**
Daniel Veillard7eda8452000-10-14 23:38:43 +00002279 * htmlParseScript:
2280 * @ctxt: an HTML parser context
2281 *
2282 * parse the content of an HTML SCRIPT or STYLE element
2283 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2284 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2285 * http://www.w3.org/TR/html4/types.html#type-script
2286 * http://www.w3.org/TR/html4/types.html#h-6.15
2287 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2288 *
2289 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2290 * element and the value of intrinsic event attributes. User agents must
2291 * not evaluate script data as HTML markup but instead must pass it on as
2292 * data to a script engine.
2293 * NOTES:
2294 * - The content is passed like CDATA
2295 * - the attributes for style and scripting "onXXX" are also described
2296 * as CDATA but SGML allows entities references in attributes so their
2297 * processing is identical as other attributes
2298 */
2299void
2300htmlParseScript(htmlParserCtxtPtr ctxt) {
2301 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2302 int nbchar = 0;
2303 xmlChar cur;
2304
2305 SHRINK;
2306 cur = CUR;
2307 while (IS_CHAR(cur)) {
2308 if ((cur == '<') && (NXT(1) == '/')) {
2309 /*
2310 * One should break here, the specification is clear:
2311 * Authors should therefore escape "</" within the content.
2312 * Escape mechanisms are specific to each scripting or
2313 * style sheet language.
2314 */
2315 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2316 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2317 break; /* while */
2318 }
2319 buf[nbchar++] = cur;
2320 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2321 if (ctxt->sax->cdataBlock!= NULL) {
2322 /*
2323 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2324 */
2325 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2326 }
2327 nbchar = 0;
2328 }
2329 NEXT;
2330 cur = CUR;
2331 }
Daniel Veillarda4964b72000-10-31 18:23:44 +00002332 if (!(IS_CHAR(cur))) {
2333 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2334 ctxt->sax->error(ctxt->userData,
2335 "Invalid char in CDATA 0x%X\n", cur);
2336 ctxt->wellFormed = 0;
2337 NEXT;
2338 }
2339
Daniel Veillard7eda8452000-10-14 23:38:43 +00002340 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2341 if (ctxt->sax->cdataBlock!= NULL) {
2342 /*
2343 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2344 */
2345 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2346 }
2347 }
2348}
2349
2350
2351/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002352 * htmlParseCharData:
2353 * @ctxt: an HTML parser context
2354 * @cdata: int indicating whether we are within a CDATA section
2355 *
2356 * parse a CharData section.
2357 * if we are within a CDATA section ']]>' marks an end of section.
2358 *
2359 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2360 */
2361
2362void
2363htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002364 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2365 int nbchar = 0;
2366 int cur, l;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002367
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002368 SHRINK;
2369 cur = CUR_CHAR(l);
2370 while (((cur != '<') || (ctxt->token == '<')) &&
2371 ((cur != '&') || (ctxt->token == '&')) &&
2372 (IS_CHAR(cur))) {
2373 COPY_BUF(l,buf,nbchar,cur);
2374 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2375 /*
2376 * Ok the segment is to be consumed as chars.
2377 */
2378 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2379 if (areBlanks(ctxt, buf, nbchar)) {
2380 if (ctxt->sax->ignorableWhitespace != NULL)
2381 ctxt->sax->ignorableWhitespace(ctxt->userData,
2382 buf, nbchar);
2383 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002384 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002385 if (ctxt->sax->characters != NULL)
2386 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2387 }
2388 }
2389 nbchar = 0;
2390 }
2391 NEXTL(l);
2392 cur = CUR_CHAR(l);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002393 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002394 if (nbchar != 0) {
2395 /*
2396 * Ok the segment is to be consumed as chars.
2397 */
2398 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2399 if (areBlanks(ctxt, buf, nbchar)) {
2400 if (ctxt->sax->ignorableWhitespace != NULL)
2401 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2402 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002403 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002404 if (ctxt->sax->characters != NULL)
2405 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002406 }
2407 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002408 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002409}
2410
2411/**
2412 * htmlParseExternalID:
2413 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002414 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002415 * @strict: indicate whether we should restrict parsing to only
2416 * production [75], see NOTE below
2417 *
2418 * Parse an External ID or a Public ID
2419 *
2420 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2421 * 'PUBLIC' S PubidLiteral S SystemLiteral
2422 *
2423 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2424 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2425 *
2426 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2427 *
2428 * Returns the function returns SystemLiteral and in the second
2429 * case publicID receives PubidLiteral, is strict is off
2430 * it is possible to return NULL and have publicID set.
2431 */
2432
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002433xmlChar *
2434htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2435 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002436
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002437 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2438 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2439 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002440 SKIP(6);
2441 if (!IS_BLANK(CUR)) {
2442 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2443 ctxt->sax->error(ctxt->userData,
2444 "Space required after 'SYSTEM'\n");
2445 ctxt->wellFormed = 0;
2446 }
2447 SKIP_BLANKS;
2448 URI = htmlParseSystemLiteral(ctxt);
2449 if (URI == NULL) {
2450 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2451 ctxt->sax->error(ctxt->userData,
2452 "htmlParseExternalID: SYSTEM, no URI\n");
2453 ctxt->wellFormed = 0;
2454 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002455 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2456 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2457 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002458 SKIP(6);
2459 if (!IS_BLANK(CUR)) {
2460 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2461 ctxt->sax->error(ctxt->userData,
2462 "Space required after 'PUBLIC'\n");
2463 ctxt->wellFormed = 0;
2464 }
2465 SKIP_BLANKS;
2466 *publicID = htmlParsePubidLiteral(ctxt);
2467 if (*publicID == NULL) {
2468 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2469 ctxt->sax->error(ctxt->userData,
2470 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2471 ctxt->wellFormed = 0;
2472 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002473 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002474 if ((CUR == '"') || (CUR == '\'')) {
2475 URI = htmlParseSystemLiteral(ctxt);
2476 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002477 }
2478 return(URI);
2479}
2480
2481/**
2482 * htmlParseComment:
2483 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002484 *
2485 * Parse an XML (SGML) comment <!-- .... -->
2486 *
2487 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2488 */
2489void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002490htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002491 xmlChar *buf = NULL;
Daniel Veillard87b95392000-08-12 21:12:04 +00002492 int len;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002493 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard87b95392000-08-12 21:12:04 +00002494 int q, ql;
2495 int r, rl;
2496 int cur, l;
2497 xmlParserInputState state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002498
2499 /*
2500 * Check that there is a comment right here.
2501 */
Daniel Veillard87b95392000-08-12 21:12:04 +00002502 if ((RAW != '<') || (NXT(1) != '!') ||
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002503 (NXT(2) != '-') || (NXT(3) != '-')) return;
2504
Daniel Veillard87b95392000-08-12 21:12:04 +00002505 state = ctxt->instate;
2506 ctxt->instate = XML_PARSER_COMMENT;
2507 SHRINK;
2508 SKIP(4);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002509 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2510 if (buf == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002511 xmlGenericError(xmlGenericErrorContext,
2512 "malloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002513 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002514 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002515 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002516 q = CUR_CHAR(ql);
2517 NEXTL(ql);
2518 r = CUR_CHAR(rl);
2519 NEXTL(rl);
2520 cur = CUR_CHAR(l);
2521 len = 0;
2522 while (IS_CHAR(cur) &&
2523 ((cur != '>') ||
2524 (r != '-') || (q != '-'))) {
2525 if (len + 5 >= size) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002526 size *= 2;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002527 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002528 if (buf == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002529 xmlGenericError(xmlGenericErrorContext,
2530 "realloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002531 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002532 return;
2533 }
2534 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002535 COPY_BUF(ql,buf,len,q);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002536 q = r;
Daniel Veillard87b95392000-08-12 21:12:04 +00002537 ql = rl;
2538 r = cur;
2539 rl = l;
2540 NEXTL(l);
2541 cur = CUR_CHAR(l);
2542 if (cur == 0) {
2543 SHRINK;
2544 GROW;
2545 cur = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002546 }
2547 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002548 buf[len] = 0;
2549 if (!IS_CHAR(cur)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00002550 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
Daniel Veillard87b95392000-08-12 21:12:04 +00002551 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2552 ctxt->sax->error(ctxt->userData,
2553 "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillard87b95392000-08-12 21:12:04 +00002554 ctxt->wellFormed = 0;
2555 xmlFree(buf);
2556 } else {
2557 NEXT;
2558 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2559 (!ctxt->disableSAX))
2560 ctxt->sax->comment(ctxt->userData, buf);
2561 xmlFree(buf);
2562 }
2563 ctxt->instate = state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002564}
2565
2566/**
2567 * htmlParseCharRef:
2568 * @ctxt: an HTML parser context
2569 *
2570 * parse Reference declarations
2571 *
2572 * [66] CharRef ::= '&#' [0-9]+ ';' |
2573 * '&#x' [0-9a-fA-F]+ ';'
2574 *
2575 * Returns the value parsed (as an int)
2576 */
2577int
2578htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2579 int val = 0;
2580
2581 if ((CUR == '&') && (NXT(1) == '#') &&
2582 (NXT(2) == 'x')) {
2583 SKIP(3);
2584 while (CUR != ';') {
2585 if ((CUR >= '0') && (CUR <= '9'))
2586 val = val * 16 + (CUR - '0');
2587 else if ((CUR >= 'a') && (CUR <= 'f'))
2588 val = val * 16 + (CUR - 'a') + 10;
2589 else if ((CUR >= 'A') && (CUR <= 'F'))
2590 val = val * 16 + (CUR - 'A') + 10;
2591 else {
2592 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2593 ctxt->sax->error(ctxt->userData,
2594 "htmlParseCharRef: invalid hexadecimal value\n");
2595 ctxt->wellFormed = 0;
Daniel Veillard748e45d2000-11-17 16:36:08 +00002596 return(0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002597 }
2598 NEXT;
2599 }
2600 if (CUR == ';')
2601 NEXT;
2602 } else if ((CUR == '&') && (NXT(1) == '#')) {
2603 SKIP(2);
2604 while (CUR != ';') {
2605 if ((CUR >= '0') && (CUR <= '9'))
2606 val = val * 10 + (CUR - '0');
2607 else {
2608 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2609 ctxt->sax->error(ctxt->userData,
2610 "htmlParseCharRef: invalid decimal value\n");
2611 ctxt->wellFormed = 0;
Daniel Veillard748e45d2000-11-17 16:36:08 +00002612 return(0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002613 }
2614 NEXT;
2615 }
2616 if (CUR == ';')
2617 NEXT;
2618 } else {
2619 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2620 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2621 ctxt->wellFormed = 0;
2622 }
2623 /*
2624 * Check the value IS_CHAR ...
2625 */
2626 if (IS_CHAR(val)) {
2627 return(val);
2628 } else {
2629 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002630 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002631 val);
2632 ctxt->wellFormed = 0;
2633 }
2634 return(0);
2635}
2636
2637
2638/**
2639 * htmlParseDocTypeDecl :
2640 * @ctxt: an HTML parser context
2641 *
2642 * parse a DOCTYPE declaration
2643 *
2644 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2645 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2646 */
2647
2648void
2649htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002650 xmlChar *name;
2651 xmlChar *ExternalID = NULL;
2652 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002653
2654 /*
2655 * We know that '<!DOCTYPE' has been detected.
2656 */
2657 SKIP(9);
2658
2659 SKIP_BLANKS;
2660
2661 /*
2662 * Parse the DOCTYPE name.
2663 */
2664 name = htmlParseName(ctxt);
2665 if (name == NULL) {
2666 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2667 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2668 ctxt->wellFormed = 0;
2669 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002670 /*
2671 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2672 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002673
2674 SKIP_BLANKS;
2675
2676 /*
2677 * Check for SystemID and ExternalID
2678 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002679 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002680 SKIP_BLANKS;
2681
2682 /*
2683 * We should be at the end of the DOCTYPE declaration.
2684 */
2685 if (CUR != '>') {
2686 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2687 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2688 ctxt->wellFormed = 0;
2689 /* We shouldn't try to resynchronize ... */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002690 }
2691 NEXT;
2692
2693 /*
Daniel Veillardd83eb822000-06-30 18:39:56 +00002694 * Create or update the document accordingly to the DOCTYPE
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002695 */
Daniel Veillardd83eb822000-06-30 18:39:56 +00002696 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2697 (!ctxt->disableSAX))
2698 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002699
2700 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002701 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002702 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002703 if (URI != NULL) xmlFree(URI);
2704 if (ExternalID != NULL) xmlFree(ExternalID);
2705 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002706}
2707
2708/**
2709 * htmlParseAttribute:
2710 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002711 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002712 *
2713 * parse an attribute
2714 *
2715 * [41] Attribute ::= Name Eq AttValue
2716 *
2717 * [25] Eq ::= S? '=' S?
2718 *
2719 * With namespace:
2720 *
2721 * [NS 11] Attribute ::= QName Eq AttValue
2722 *
2723 * Also the case QName == xmlns:??? is handled independently as a namespace
2724 * definition.
2725 *
2726 * Returns the attribute name, and the value in *value.
2727 */
2728
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002729xmlChar *
2730htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002731 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002732
2733 *value = NULL;
Daniel Veillard970112a2000-10-03 09:33:21 +00002734 name = htmlParseHTMLName(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002735 if (name == NULL) {
2736 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2737 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2738 ctxt->wellFormed = 0;
2739 return(NULL);
2740 }
2741
2742 /*
2743 * read the value
2744 */
2745 SKIP_BLANKS;
2746 if (CUR == '=') {
2747 NEXT;
2748 SKIP_BLANKS;
2749 val = htmlParseAttValue(ctxt);
Daniel Veillardbe803962000-06-28 23:40:59 +00002750 /******
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002751 } else {
Daniel Veillardbe803962000-06-28 23:40:59 +00002752 * TODO : some attribute must have values, some may not
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002753 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002754 ctxt->sax->warning(ctxt->userData,
Daniel Veillardbe803962000-06-28 23:40:59 +00002755 "No value for attribute %s\n", name); */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002756 }
2757
2758 *value = val;
2759 return(name);
2760}
2761
2762/**
Daniel Veillard365e13b2000-07-02 07:56:37 +00002763 * htmlCheckEncoding:
2764 * @ctxt: an HTML parser context
2765 * @attvalue: the attribute value
2766 *
2767 * Checks an http-equiv attribute from a Meta tag to detect
2768 * the encoding
2769 * If a new encoding is detected the parser is switched to decode
2770 * it and pass UTF8
2771 */
2772void
2773htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2774 const xmlChar *encoding;
2775
2776 if ((ctxt == NULL) || (attvalue == NULL))
2777 return;
2778
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002779 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
Daniel Veillard365e13b2000-07-02 07:56:37 +00002780 if (encoding != NULL) {
2781 encoding += 8;
2782 } else {
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002783 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
Daniel Veillard365e13b2000-07-02 07:56:37 +00002784 if (encoding != NULL)
2785 encoding += 9;
2786 }
2787 if (encoding != NULL) {
2788 xmlCharEncoding enc;
2789 xmlCharEncodingHandlerPtr handler;
2790
2791 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2792
2793 if (ctxt->input->encoding != NULL)
2794 xmlFree((xmlChar *) ctxt->input->encoding);
2795 ctxt->input->encoding = xmlStrdup(encoding);
2796
2797 enc = xmlParseCharEncoding((const char *) encoding);
2798 /*
2799 * registered set of known encodings
2800 */
2801 if (enc != XML_CHAR_ENCODING_ERROR) {
2802 xmlSwitchEncoding(ctxt, enc);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002803 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002804 } else {
2805 /*
2806 * fallback for unknown encodings
2807 */
2808 handler = xmlFindCharEncodingHandler((const char *) encoding);
2809 if (handler != NULL) {
2810 xmlSwitchToEncoding(ctxt, handler);
Daniel Veillard87b95392000-08-12 21:12:04 +00002811 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002812 } else {
2813 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2814 }
2815 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002816
2817 if ((ctxt->input->buf != NULL) &&
2818 (ctxt->input->buf->encoder != NULL) &&
2819 (ctxt->input->buf->raw != NULL) &&
2820 (ctxt->input->buf->buffer != NULL)) {
2821 int nbchars;
2822 int processed;
2823
2824 /*
2825 * convert as much as possible to the parser reading buffer.
2826 */
2827 processed = ctxt->input->cur - ctxt->input->base;
2828 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2829 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2830 ctxt->input->buf->buffer,
2831 ctxt->input->buf->raw);
2832 if (nbchars < 0) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00002833 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard87b95392000-08-12 21:12:04 +00002834 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2835 ctxt->sax->error(ctxt->userData,
2836 "htmlCheckEncoding: encoder error\n");
Daniel Veillard87b95392000-08-12 21:12:04 +00002837 }
2838 ctxt->input->base =
2839 ctxt->input->cur = ctxt->input->buf->buffer->content;
2840 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002841 }
2842}
2843
2844/**
2845 * htmlCheckMeta:
2846 * @ctxt: an HTML parser context
2847 * @atts: the attributes values
2848 *
2849 * Checks an attributes from a Meta tag
2850 */
2851void
2852htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2853 int i;
2854 const xmlChar *att, *value;
2855 int http = 0;
2856 const xmlChar *content = NULL;
2857
2858 if ((ctxt == NULL) || (atts == NULL))
2859 return;
2860
2861 i = 0;
2862 att = atts[i++];
2863 while (att != NULL) {
2864 value = atts[i++];
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002865 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2866 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002867 http = 1;
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002868 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002869 content = value;
2870 att = atts[i++];
2871 }
2872 if ((http) && (content != NULL))
2873 htmlCheckEncoding(ctxt, content);
2874
2875}
2876
2877/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002878 * htmlParseStartTag:
2879 * @ctxt: an HTML parser context
2880 *
2881 * parse a start of tag either for rule element or
2882 * EmptyElement. In both case we don't parse the tag closing chars.
2883 *
2884 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2885 *
2886 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2887 *
2888 * With namespace:
2889 *
2890 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2891 *
2892 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2893 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002894 */
2895
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002896void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002897htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002898 xmlChar *name;
2899 xmlChar *attname;
2900 xmlChar *attvalue;
2901 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002902 int nbatts = 0;
2903 int maxatts = 0;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002904 int meta = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002905 int i;
2906
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002907 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002908 NEXT;
2909
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002910 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002911 name = htmlParseHTMLName(ctxt);
2912 if (name == NULL) {
2913 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2914 ctxt->sax->error(ctxt->userData,
2915 "htmlParseStartTag: invalid element name\n");
2916 ctxt->wellFormed = 0;
Daniel Veillard126f2792000-10-24 17:10:12 +00002917 /* Dump the bogus tag like browsers do */
2918 while ((IS_CHAR(CUR)) && (CUR != '>'))
2919 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002920 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002921 }
Daniel Veillard8b5dd832000-10-01 20:28:44 +00002922 if (xmlStrEqual(name, BAD_CAST"meta"))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002923 meta = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002924
2925 /*
2926 * Check for auto-closure of HTML elements.
2927 */
2928 htmlAutoClose(ctxt, name);
2929
2930 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00002931 * Check for implied HTML elements.
2932 */
2933 htmlCheckImplied(ctxt, name);
2934
2935 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002936 * Now parse the attributes, it ends up with the ending
2937 *
2938 * (S Attribute)* S?
2939 */
2940 SKIP_BLANKS;
2941 while ((IS_CHAR(CUR)) &&
2942 (CUR != '>') &&
2943 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002944 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002945
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002946 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002947 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002948 if (attname != NULL) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00002949
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002950 /*
2951 * Well formedness requires at most one declaration of an attribute
2952 */
2953 for (i = 0; i < nbatts;i += 2) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00002954 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002955 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002956 ctxt->sax->error(ctxt->userData,
2957 "Attribute %s redefined\n",
2958 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002959 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002960 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002961 if (attvalue != NULL)
2962 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002963 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002964 }
2965 }
2966
2967 /*
2968 * Add the pair to atts
2969 */
2970 if (atts == NULL) {
2971 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002972 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002973 if (atts == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002974 xmlGenericError(xmlGenericErrorContext,
2975 "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002976 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002977 if (name != NULL) xmlFree(name);
2978 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002979 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002980 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002981 maxatts *= 2;
Daniel Veillard4b0755c2000-09-25 14:26:28 +00002982 atts = (const xmlChar **) xmlRealloc((void *) atts,
2983 maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002984 if (atts == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002985 xmlGenericError(xmlGenericErrorContext,
2986 "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002987 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002988 if (name != NULL) xmlFree(name);
2989 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002990 }
2991 }
2992 atts[nbatts++] = attname;
2993 atts[nbatts++] = attvalue;
2994 atts[nbatts] = NULL;
2995 atts[nbatts + 1] = NULL;
2996 }
Daniel Veillard126f2792000-10-24 17:10:12 +00002997 else {
2998 /* Dump the bogus attribute string up to the next blank or
2999 * the end of the tag. */
3000 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3001 && ((CUR != '/') || (NXT(1) != '>')))
3002 NEXT;
3003 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003004
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003005failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003006 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003007 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003008 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3009 ctxt->sax->error(ctxt->userData,
3010 "htmlParseStartTag: problem parsing attributes\n");
3011 ctxt->wellFormed = 0;
3012 break;
3013 }
3014 }
3015
3016 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003017 * Handle specific association to the META tag
3018 */
3019 if (meta)
3020 htmlCheckMeta(ctxt, atts);
3021
3022 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003023 * SAX: Start of Element !
3024 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003025 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003026#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003027 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003028#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003029 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3030 ctxt->sax->startElement(ctxt->userData, name, atts);
3031
3032 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003033 for (i = 0;i < nbatts;i++) {
3034 if (atts[i] != NULL)
3035 xmlFree((xmlChar *) atts[i]);
3036 }
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00003037 xmlFree((void *) atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003038 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003039 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003040}
3041
3042/**
3043 * htmlParseEndTag:
3044 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003045 *
3046 * parse an end of tag
3047 *
3048 * [42] ETag ::= '</' Name S? '>'
3049 *
3050 * With namespace
3051 *
3052 * [NS 9] ETag ::= '</' QName S? '>'
3053 */
3054
3055void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003056htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003057 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003058 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003059 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003060
3061 if ((CUR != '<') || (NXT(1) != '/')) {
3062 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3063 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3064 ctxt->wellFormed = 0;
3065 return;
3066 }
3067 SKIP(2);
3068
3069 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003070 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003071
3072 /*
3073 * We should definitely be at the ending "S? '>'" part
3074 */
3075 SKIP_BLANKS;
3076 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3077 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3078 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3079 ctxt->wellFormed = 0;
3080 } else
3081 NEXT;
3082
3083 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003084 * If the name read is not one of the element in the parsing stack
3085 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003086 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003087 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003088 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003089 }
3090 if (i < 0) {
3091 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003092 ctxt->sax->error(ctxt->userData,
3093 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003094 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003095 ctxt->wellFormed = 0;
3096 return;
3097 }
3098
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003099
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003100 /*
3101 * Check for auto-closure of HTML elements.
3102 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003103
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003104 htmlAutoCloseOnClose(ctxt, name);
3105
3106 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003107 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003108 * With the exception that the autoclose may have popped stuff out
3109 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003110 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003111 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003112#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003113 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003114#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003115 if ((ctxt->name != NULL) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003116 (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003117 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3118 ctxt->sax->error(ctxt->userData,
3119 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003120 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003121 ctxt->wellFormed = 0;
3122 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003123 }
3124
3125 /*
3126 * SAX: End of Tag
3127 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003128 oldname = ctxt->name;
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003129 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003130 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3131 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003132 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003133 if (oldname != NULL) {
3134#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003135 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003136#endif
3137 xmlFree(oldname);
3138#ifdef DEBUG
3139 } else {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003140 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003141#endif
3142 }
3143 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003144
3145 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00003146 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003147
3148 return;
3149}
3150
3151
3152/**
3153 * htmlParseReference:
3154 * @ctxt: an HTML parser context
3155 *
3156 * parse and handle entity references in content,
3157 * this will end-up in a call to character() since this is either a
3158 * CharRef, or a predefined entity.
3159 */
3160void
3161htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003162 htmlEntityDescPtr ent;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003163 xmlChar out[6];
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003164 xmlChar *name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003165 if (CUR != '&') return;
3166
3167 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003168 unsigned int c;
3169 int bits, i = 0;
3170
3171 c = htmlParseCharRef(ctxt);
Daniel Veillard748e45d2000-11-17 16:36:08 +00003172 if (c == 0)
3173 return;
3174
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003175 if (c < 0x80) { out[i++]= c; bits= -6; }
3176 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3177 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3178 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3179
3180 for ( ; bits >= 0; bits-= 6) {
3181 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3182 }
3183 out[i] = 0;
3184
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003185 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003186 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003187 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003188 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003189 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003190 if (name == NULL) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003191 htmlCheckParagraph(ctxt);
Daniel Veillard1255ab72000-08-14 15:13:33 +00003192 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3193 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003194 return;
3195 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003196 if ((ent == NULL) || (ent->value <= 0)) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003197 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003198 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00003199 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003200 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00003201 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003202 }
3203 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003204 unsigned int c;
3205 int bits, i = 0;
3206
3207 c = ent->value;
3208 if (c < 0x80)
3209 { out[i++]= c; bits= -6; }
3210 else if (c < 0x800)
3211 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3212 else if (c < 0x10000)
3213 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3214 else
3215 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3216
3217 for ( ; bits >= 0; bits-= 6) {
3218 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3219 }
3220 out[i] = 0;
3221
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003222 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003223 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003224 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003225 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00003226 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003227 }
3228}
3229
3230/**
3231 * htmlParseContent:
3232 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003233 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003234 *
3235 * Parse a content: comment, sub-element, reference or text.
3236 *
3237 */
3238
3239void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003240htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003241 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003242 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003243
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003244 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003245 depth = ctxt->nameNr;
3246 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003247 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003248
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003249 GROW;
3250 /*
3251 * Our tag or one of it's parent or children is ending.
3252 */
3253 if ((CUR == '<') && (NXT(1) == '/')) {
3254 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003255 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003256 return;
3257 }
3258
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003259 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003260 * Has this node been popped out during parsing of
3261 * the next element
3262 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003263 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003264 (depth >= ctxt->nameNr)) {
3265 if (currentNode != NULL) xmlFree(currentNode);
3266 return;
3267 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003268
Daniel Veillard7eda8452000-10-14 23:38:43 +00003269 if ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3270 (xmlStrEqual(currentNode, BAD_CAST"style"))) {
3271 /*
3272 * Handle SCRIPT/STYLE separately
3273 */
3274 htmlParseScript(ctxt);
3275 } else {
3276 /*
3277 * Sometimes DOCTYPE arrives in the middle of the document
3278 */
3279 if ((CUR == '<') && (NXT(1) == '!') &&
3280 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3281 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3282 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3283 (UPP(8) == 'E')) {
Daniel Veillard35008381999-10-25 13:15:52 +00003284 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3285 ctxt->sax->error(ctxt->userData,
Daniel Veillard7eda8452000-10-14 23:38:43 +00003286 "Misplaced DOCTYPE declaration\n");
Daniel Veillard35008381999-10-25 13:15:52 +00003287 ctxt->wellFormed = 0;
Daniel Veillard7eda8452000-10-14 23:38:43 +00003288 htmlParseDocTypeDecl(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003289 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003290
Daniel Veillard7eda8452000-10-14 23:38:43 +00003291 /*
3292 * First case : a comment
3293 */
3294 if ((CUR == '<') && (NXT(1) == '!') &&
3295 (NXT(2) == '-') && (NXT(3) == '-')) {
3296 htmlParseComment(ctxt);
3297 }
3298
3299 /*
3300 * Second case : a sub-element.
3301 */
3302 else if (CUR == '<') {
3303 htmlParseElement(ctxt);
3304 }
3305
3306 /*
3307 * Third case : a reference. If if has not been resolved,
3308 * parsing returns it's Name, create the node
3309 */
3310 else if (CUR == '&') {
3311 htmlParseReference(ctxt);
3312 }
3313
3314 /*
3315 * Fourth : end of the resource
3316 */
3317 else if (CUR == 0) {
3318 htmlAutoClose(ctxt, NULL);
3319 }
3320
3321 /*
3322 * Last case, text. Note that References are handled directly.
3323 */
3324 else {
3325 htmlParseCharData(ctxt, 0);
3326 }
3327
3328 if (cons == ctxt->nbChars) {
3329 if (ctxt->node != NULL) {
3330 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3331 ctxt->sax->error(ctxt->userData,
3332 "detected an error in element content\n");
3333 ctxt->wellFormed = 0;
3334 }
3335 break;
3336 }
3337 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003338 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003339 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003340 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003341}
3342
3343/**
3344 * htmlParseElement:
3345 * @ctxt: an HTML parser context
3346 *
3347 * parse an HTML element, this is highly recursive
3348 *
3349 * [39] element ::= EmptyElemTag | STag content ETag
3350 *
3351 * [41] Attribute ::= Name Eq AttValue
3352 */
3353
3354void
3355htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003356 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00003357 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003358 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003359 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003360 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003361 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003362
3363 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003364 if (ctxt->record_info) {
3365 node_info.begin_pos = ctxt->input->consumed +
3366 (CUR_PTR - ctxt->input->base);
3367 node_info.begin_line = ctxt->input->line;
3368 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003369
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003370 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003371 htmlParseStartTag(ctxt);
3372 name = ctxt->name;
3373#ifdef DEBUG
3374 if (oldname == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003375 xmlGenericError(xmlGenericErrorContext,
3376 "Start of element %s\n", name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003377 else if (name == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003378 xmlGenericError(xmlGenericErrorContext,
3379 "Start of element failed, was %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003380 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003381 xmlGenericError(xmlGenericErrorContext,
3382 "Start of element %s, was %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003383#endif
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003384 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003385 (name == NULL)) {
3386 if (CUR == '>')
3387 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003388 if (oldname != NULL)
3389 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003390 return;
3391 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003392 if (oldname != NULL)
3393 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003394
3395 /*
3396 * Lookup the info for that element.
3397 */
3398 info = htmlTagLookup(name);
3399 if (info == NULL) {
3400 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3401 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3402 name);
3403 ctxt->wellFormed = 0;
3404 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003405/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003406 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3407 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3408 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003409 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003410 }
3411
3412 /*
3413 * Check for an Empty Element labelled the XML/SGML way
3414 */
3415 if ((CUR == '/') && (NXT(1) == '>')) {
3416 SKIP(2);
3417 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3418 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003419 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003420#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003421 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003422#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003423 if (oldname != NULL)
3424 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003425 return;
3426 }
3427
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003428 if (CUR == '>') {
3429 NEXT;
3430 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003431 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard87b95392000-08-12 21:12:04 +00003432 ctxt->sax->error(ctxt->userData,
3433 "Couldn't find end of Start Tag %s\n",
3434 name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003435 ctxt->wellFormed = 0;
3436
3437 /*
3438 * end of parsing of this node.
3439 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003440 if (xmlStrEqual(name, ctxt->name)) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003441 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003442 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003443#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003444 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003445#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003446 if (oldname != NULL)
3447 xmlFree(oldname);
3448 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003449
3450 /*
3451 * Capture end position and add node
3452 */
3453 if ( currentNode != NULL && ctxt->record_info ) {
3454 node_info.end_pos = ctxt->input->consumed +
3455 (CUR_PTR - ctxt->input->base);
3456 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003457 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003458 xmlParserAddNodeInfo(ctxt, &node_info);
3459 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003460 return;
3461 }
3462
3463 /*
3464 * Check for an Empty Element from DTD definition
3465 */
3466 if ((info != NULL) && (info->empty)) {
3467 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3468 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003469 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003470#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003471 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003472#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003473 if (oldname != NULL)
3474 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003475 return;
3476 }
3477
3478 /*
3479 * Parse the content of the element:
3480 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003481 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003482 depth = ctxt->nameNr;
3483 while (IS_CHAR(CUR)) {
3484 htmlParseContent(ctxt);
3485 if (ctxt->nameNr < depth) break;
3486 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003487
3488 if (!IS_CHAR(CUR)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003489 /************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003490 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3491 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003492 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003493 ctxt->wellFormed = 0;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003494 *************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003495
3496 /*
3497 * end of parsing of this node.
3498 */
3499 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003500 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003501#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003502 xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003503#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003504 if (oldname != NULL)
3505 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003506 if (currentNode != NULL)
3507 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003508 return;
3509 }
3510
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003511 /*
3512 * Capture end position and add node
3513 */
3514 if ( currentNode != NULL && ctxt->record_info ) {
3515 node_info.end_pos = ctxt->input->consumed +
3516 (CUR_PTR - ctxt->input->base);
3517 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003518 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003519 xmlParserAddNodeInfo(ctxt, &node_info);
3520 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003521 if (currentNode != NULL)
3522 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003523}
3524
3525/**
3526 * htmlParseDocument :
3527 * @ctxt: an HTML parser context
3528 *
3529 * parse an HTML document (and build a tree if using the standard SAX
3530 * interface).
3531 *
3532 * Returns 0, -1 in case of error. the parser context is augmented
3533 * as a result of the parsing.
3534 */
3535
3536int
3537htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003538 xmlDtdPtr dtd;
3539
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003540 htmlDefaultSAXHandlerInit();
3541 ctxt->html = 1;
3542
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003543 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003544 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00003545 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003546 */
3547 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3548 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3549
3550 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003551 * Wipe out everything which is before the first '<'
3552 */
Daniel Veillard35008381999-10-25 13:15:52 +00003553 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003554 if (CUR == 0) {
3555 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3556 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3557 ctxt->wellFormed = 0;
3558 }
3559
Daniel Veillardbe803962000-06-28 23:40:59 +00003560 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3561 ctxt->sax->startDocument(ctxt->userData);
3562
3563
Daniel Veillard35008381999-10-25 13:15:52 +00003564 /*
3565 * Parse possible comments before any content
3566 */
3567 while ((CUR == '<') && (NXT(1) == '!') &&
3568 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003569 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003570 SKIP_BLANKS;
3571 }
3572
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003573
3574 /*
3575 * Then possibly doc type declaration(s) and more Misc
3576 * (doctypedecl Misc*)?
3577 */
3578 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003579 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3580 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3581 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3582 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003583 htmlParseDocTypeDecl(ctxt);
3584 }
3585 SKIP_BLANKS;
3586
3587 /*
Daniel Veillard87b95392000-08-12 21:12:04 +00003588 * Parse possible comments before any content
3589 */
3590 while ((CUR == '<') && (NXT(1) == '!') &&
3591 (NXT(2) == '-') && (NXT(3) == '-')) {
3592 htmlParseComment(ctxt);
3593 SKIP_BLANKS;
3594 }
3595
3596 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003597 * Time to start parsing the tree itself
3598 */
Daniel Veillard35008381999-10-25 13:15:52 +00003599 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003600
3601 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003602 * autoclose
3603 */
3604 if (CUR == 0)
3605 htmlAutoClose(ctxt, NULL);
3606
3607
3608 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003609 * SAX: end of the document processing.
3610 */
3611 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3612 ctxt->sax->endDocument(ctxt->userData);
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003613
3614 if (ctxt->myDoc != NULL) {
3615 dtd = xmlGetIntSubset(ctxt->myDoc);
3616 if (dtd == NULL)
3617 ctxt->myDoc->intSubset =
3618 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3619 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3620 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3621 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003622 if (! ctxt->wellFormed) return(-1);
3623 return(0);
3624}
3625
3626
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003627/************************************************************************
3628 * *
3629 * Parser contexts handling *
3630 * *
3631 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003632
3633/**
3634 * xmlInitParserCtxt:
3635 * @ctxt: an HTML parser context
3636 *
3637 * Initialize a parser context
3638 */
3639
3640void
3641htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3642{
3643 htmlSAXHandler *sax;
3644
Daniel Veillard35008381999-10-25 13:15:52 +00003645 if (ctxt == NULL) return;
3646 memset(ctxt, 0, sizeof(htmlParserCtxt));
3647
Daniel Veillard6454aec1999-09-02 22:04:43 +00003648 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003649 if (sax == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003650 xmlGenericError(xmlGenericErrorContext,
3651 "htmlInitParserCtxt: out of memory\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003652 }
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00003653 else
3654 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003655
3656 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003657 ctxt->inputTab = (htmlParserInputPtr *)
3658 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3659 if (ctxt->inputTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003660 xmlGenericError(xmlGenericErrorContext,
3661 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003662 ctxt->inputNr = 0;
3663 ctxt->inputMax = 0;
3664 ctxt->input = NULL;
3665 return;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003666 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003667 ctxt->inputNr = 0;
3668 ctxt->inputMax = 5;
3669 ctxt->input = NULL;
3670 ctxt->version = NULL;
3671 ctxt->encoding = NULL;
3672 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003673 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003674
3675 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00003676 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillard39c7d712000-09-10 16:14:55 +00003677 if (ctxt->nodeTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003678 xmlGenericError(xmlGenericErrorContext,
3679 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003680 ctxt->nodeNr = 0;
3681 ctxt->nodeMax = 0;
3682 ctxt->node = NULL;
3683 ctxt->inputNr = 0;
3684 ctxt->inputMax = 0;
3685 ctxt->input = NULL;
3686 return;
3687 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003688 ctxt->nodeNr = 0;
3689 ctxt->nodeMax = 10;
3690 ctxt->node = NULL;
3691
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003692 /* Allocate the Name stack */
3693 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Daniel Veillard39c7d712000-09-10 16:14:55 +00003694 if (ctxt->nameTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003695 xmlGenericError(xmlGenericErrorContext,
3696 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003697 ctxt->nameNr = 0;
3698 ctxt->nameMax = 10;
3699 ctxt->name = NULL;
3700 ctxt->nodeNr = 0;
3701 ctxt->nodeMax = 0;
3702 ctxt->node = NULL;
3703 ctxt->inputNr = 0;
3704 ctxt->inputMax = 0;
3705 ctxt->input = NULL;
3706 return;
3707 }
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003708 ctxt->nameNr = 0;
3709 ctxt->nameMax = 10;
3710 ctxt->name = NULL;
3711
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003712 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3713 else {
3714 ctxt->sax = sax;
3715 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3716 }
3717 ctxt->userData = ctxt;
3718 ctxt->myDoc = NULL;
3719 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003720 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003721 ctxt->html = 1;
3722 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00003723 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003724 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003725 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003726 xmlInitNodeInfoSeq(&ctxt->node_seq);
3727}
3728
3729/**
3730 * htmlFreeParserCtxt:
3731 * @ctxt: an HTML parser context
3732 *
3733 * Free all the memory used by a parser context. However the parsed
3734 * document in ctxt->myDoc is not freed.
3735 */
3736
3737void
3738htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3739{
Daniel Veillard365e13b2000-07-02 07:56:37 +00003740 xmlFreeParserCtxt(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003741}
3742
3743/**
3744 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003745 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003746 * @encoding: a free form C string describing the HTML document encoding, or NULL
3747 *
3748 * Create a parser context for an HTML document.
3749 *
3750 * Returns the new parser context or NULL
3751 */
3752htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003753htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003754 htmlParserCtxtPtr ctxt;
3755 htmlParserInputPtr input;
3756 /* htmlCharEncoding enc; */
3757
Daniel Veillard6454aec1999-09-02 22:04:43 +00003758 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003759 if (ctxt == NULL) {
3760 perror("malloc");
3761 return(NULL);
3762 }
3763 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003764 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003765 if (input == NULL) {
3766 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003767 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003768 return(NULL);
3769 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003770 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003771
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003772 input->line = 1;
3773 input->col = 1;
3774 input->base = cur;
3775 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003776
3777 inputPush(ctxt, input);
3778 return(ctxt);
3779}
3780
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003781/************************************************************************
3782 * *
3783 * Progressive parsing interfaces *
3784 * *
3785 ************************************************************************/
3786
3787/**
3788 * htmlParseLookupSequence:
3789 * @ctxt: an HTML parser context
3790 * @first: the first char to lookup
3791 * @next: the next char to lookup or zero
3792 * @third: the next char to lookup or zero
3793 *
3794 * Try to find if a sequence (first, next, third) or just (first next) or
3795 * (first) is available in the input stream.
3796 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3797 * to avoid rescanning sequences of bytes, it DOES change the state of the
3798 * parser, do not use liberally.
3799 * This is basically similar to xmlParseLookupSequence()
3800 *
3801 * Returns the index to the current parsing point if the full sequence
3802 * is available, -1 otherwise.
3803 */
3804int
3805htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3806 xmlChar next, xmlChar third) {
3807 int base, len;
3808 htmlParserInputPtr in;
3809 const xmlChar *buf;
3810
3811 in = ctxt->input;
3812 if (in == NULL) return(-1);
3813 base = in->cur - in->base;
3814 if (base < 0) return(-1);
3815 if (ctxt->checkIndex > base)
3816 base = ctxt->checkIndex;
3817 if (in->buf == NULL) {
3818 buf = in->base;
3819 len = in->length;
3820 } else {
3821 buf = in->buf->buffer->content;
3822 len = in->buf->buffer->use;
3823 }
3824 /* take into account the sequence length */
3825 if (third) len -= 2;
3826 else if (next) len --;
3827 for (;base < len;base++) {
3828 if (buf[base] == first) {
3829 if (third != 0) {
3830 if ((buf[base + 1] != next) ||
3831 (buf[base + 2] != third)) continue;
3832 } else if (next != 0) {
3833 if (buf[base + 1] != next) continue;
3834 }
3835 ctxt->checkIndex = 0;
3836#ifdef DEBUG_PUSH
3837 if (next == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003838 xmlGenericError(xmlGenericErrorContext,
3839 "HPP: lookup '%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003840 first, base);
3841 else if (third == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003842 xmlGenericError(xmlGenericErrorContext,
3843 "HPP: lookup '%c%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003844 first, next, base);
3845 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003846 xmlGenericError(xmlGenericErrorContext,
3847 "HPP: lookup '%c%c%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003848 first, next, third, base);
3849#endif
3850 return(base - (in->cur - in->base));
3851 }
3852 }
3853 ctxt->checkIndex = base;
3854#ifdef DEBUG_PUSH
3855 if (next == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003856 xmlGenericError(xmlGenericErrorContext,
3857 "HPP: lookup '%c' failed\n", first);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003858 else if (third == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003859 xmlGenericError(xmlGenericErrorContext,
3860 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003861 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003862 xmlGenericError(xmlGenericErrorContext,
3863 "HPP: lookup '%c%c%c' failed\n", first, next, third);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003864#endif
3865 return(-1);
3866}
3867
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing.
 * This is a state machine driven by ctxt->instate. On each iteration it
 * computes how many bytes are available after the current position and
 * dispatches on the state. When @terminate is zero, most constructs are
 * only parsed once htmlParseLookupSequence() finds their terminator in
 * the data already received; otherwise the function jumps to "done" and
 * waits for more data. When @terminate is non-zero those lookups are
 * skipped, and once no data is left the open elements are auto-closed
 * and the SAX endDocument callback is fired.
 * Before returning, if a document exists and parsing is terminating or
 * in the EOF/EPILOG state, a default HTML 4.0 Transitional internal
 * subset is created when the document has none.
 *
 * Returns zero if no parsing was possible
 *
 * NOTE(review): the local "ret" is initialised to 0 and never assigned
 * afterwards in this function, so the value returned is always 0.
 */
int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    /* Trace the state the parser is in when entering the function. */
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /*
         * Number of bytes between the current position and the end of
         * the data: in->length when there is no I/O buffer, otherwise
         * the amount used in the buffer.
         */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        /* Last chunk and nothing left: close what is open and finish. */
        if ((avail == 0) && (terminate)) {
            htmlAutoClose(ctxt, NULL);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    SKIP_BLANKS;
                    /* avail is recomputed after SKIP_BLANKS
                     * (presumably it advances in->cur -- confirm
                     * against the macro definition). */
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                /* A leading "<!DOCTYPE" (compared through UPP()) is
                 * parsed here; anything else goes to the MISC state. */
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
                }
                /* NOTE(review): this trace is emitted on both branches,
                 * including the one that selected PROLOG. */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering MISC\n");
#endif
                break;
            case XML_PARSER_MISC:
                /* Before the root element: comments or a DOCTYPE are
                 * handled here, anything else starts a tag. */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but fewer than 9 bytes available:
                     * wait for more data before deciding. */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /* After the DOCTYPE: only comments are handled here,
                 * anything else starts a tag. */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /* Reached from END_TAG when the name stack is empty:
                 * blanks and comments are accepted, anything else is
                 * reported as extra content and ends the document. */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    htmlParseCharData(ctxt, 0);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData,
                            "Extra content at the end of the document\n");
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                xmlChar *name, *oldname;
                /* Name-stack depth before parsing, used below to tell
                 * whether htmlParseStartTag() changed the stack. */
                int depth = ctxt->nameNr;
                htmlElemDescPtr info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                /* Unless terminating, wait until a '>' is available. */
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;

                /* Copy of the current element name, compared after the
                 * start tag has been parsed. */
                oldname = xmlStrdup(ctxt->name);
                htmlParseStartTag(ctxt);
                name = ctxt->name;
#ifdef DEBUG
                if (oldname == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s\n", name);
                else if (name == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element failed, was %s\n",
                            oldname);
                else
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s, was %s\n",
                            name, oldname);
#endif
                /* Same depth and same name as before, or no current
                 * name: treated as "no element was opened"; consume a
                 * trailing '>' if present and stay in this state. */
                if (((depth == ctxt->nameNr) &&
                     (xmlStrEqual(oldname, ctxt->name))) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    if (oldname != NULL)
                        xmlFree(oldname);
                    break;
                }
                if (oldname != NULL)
                    xmlFree(oldname);

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
                                         name);
                    ctxt->wellFormed = 0;
                } else if (info->depr) {
                    /***************************
                    if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
                        ctxt->sax->warning(ctxt->userData,
                                           "Tag %s is deprecated\n",
                                           name);
                     ***************************/
                }

                /*
                 * Check for an Empty Element labelled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
                            oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData,
                                         "Couldn't find end of Start Tag %s\n",
                                         name);
                    ctxt->wellFormed = 0;

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                        xmlGenericError(xmlGenericErrorContext,
                         "End of start tag problem: popping out %s\n", oldname);
#endif
                        if (oldname != NULL)
                            xmlFree(oldname);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                /* Snapshot of ctxt->nbChars, compared at the end of the
                 * case to detect that nothing was consumed. */
                long cons;
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /* Final chunk with exactly one byte left: deliver it
                 * directly unless it is '<' or '&'. */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        NEXT;
                    }
                    break;
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
                        goto done;
                    htmlParseScript(ctxt);
                    /* NOTE(review): cur and next were read before the
                     * htmlParseScript() call and are not refreshed, so
                     * this test looks at the bytes that preceded it. */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                            goto done;
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "Misplaced DOCTYPE declaration\n");
                        ctxt->wellFormed = 0;
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /* TODO Avoid the extra copy, handle directly !!!!!! */
                        /*
                         * Goal of the following test is :
                         *  - minimize calls to the SAX 'character' callback
                         *    when they are mergeable
                         */
                        if ((ctxt->inputNr == 1) &&
                            (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
                            if ((!terminate) &&
                                (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
                                goto done;
                        }
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt, 0);
                    }
                }
                /* ctxt->nbChars unchanged: nothing was consumed. Report
                 * an error when inside a node, then step forward with
                 * NEXT. */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "detected an error in element content\n");
                        ctxt->wellFormed = 0;
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* Empty name stack: go to EPILOG, otherwise back to
                 * CONTENT. */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
                /* NOTE(review): the trace below says CONTENT even when
                 * EPILOG was selected above. */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The remaining states are reported as internal errors.
             * Each one logs a message and falls back to CONTENT
             * (START_TAG for ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == CDATA\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == DTD\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == COMMENT\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == PI\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_DECL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_VALUE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
                /* NOTE(review): the trace says DTD but the state set
                 * above is CONTENT. */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n");
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
        }
    }
done:
    /* Same end-of-document handling as at the top of the loop, applied
     * to the last avail value computed before leaving the loop. */
    if ((avail == 0) && (terminate)) {
        htmlAutoClose(ctxt, NULL);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /* When finishing (or already in EOF/EPILOG) make sure the document
     * has an internal subset, creating the HTML 4.0 Transitional one
     * when none exists. */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}
4554
4555/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00004556 * htmlParseTry:
4557 * @ctxt: an HTML parser context
4558 *
4559 * Try to progress on parsing
4560 *
4561 * Returns zero if no parsing was possible
4562 */
4563int
4564htmlParseTry(htmlParserCtxtPtr ctxt) {
4565 return(htmlParseTryOrFinish(ctxt, 0));
4566}
4567
4568/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004569 * htmlParseChunk:
4570 * @ctxt: an XML parser context
4571 * @chunk: an char array
4572 * @size: the size in byte of the chunk
4573 * @terminate: last chunk indicator
4574 *
4575 * Parse a Chunk of memory
4576 *
4577 * Returns zero if no error, the xmlParserErrors otherwise.
4578 */
4579int
4580htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4581 int terminate) {
4582 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4583 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4584 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4585 int cur = ctxt->input->cur - ctxt->input->base;
4586
4587 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4588 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4589 ctxt->input->cur = ctxt->input->base + cur;
4590#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004591 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004592#endif
4593
Daniel Veillardd0f7f742000-02-02 17:42:48 +00004594 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4595 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004596 } else if (ctxt->instate != XML_PARSER_EOF) {
4597 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
Daniel Veillard71b656e2000-01-05 14:46:17 +00004598 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004599 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004600 if (terminate) {
4601 if ((ctxt->instate != XML_PARSER_EOF) &&
4602 (ctxt->instate != XML_PARSER_EPILOG) &&
4603 (ctxt->instate != XML_PARSER_MISC)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00004604 ctxt->errNo = XML_ERR_DOCUMENT_END;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004605 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4606 ctxt->sax->error(ctxt->userData,
4607 "Extra content at the end of the document\n");
4608 ctxt->wellFormed = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004609 }
4610 if (ctxt->instate != XML_PARSER_EOF) {
4611 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4612 ctxt->sax->endDocument(ctxt->userData);
4613 }
4614 ctxt->instate = XML_PARSER_EOF;
4615 }
4616 return((xmlParserErrors) ctxt->errNo);
4617}
4618
4619/************************************************************************
4620 * *
4621 * User entry points *
4622 * *
4623 ************************************************************************/
4624
4625/**
4626 * htmlCreatePushParserCtxt :
4627 * @sax: a SAX handler
4628 * @user_data: The user data returned on SAX callbacks
4629 * @chunk: a pointer to an array of chars
4630 * @size: number of chars in the array
4631 * @filename: an optional file name or URI
4632 * @enc: an optional encoding
4633 *
4634 * Create a parser context for using the HTML parser in push mode
4635 * To allow content encoding detection, @size should be >= 4
4636 * The value of @filename is used for fetching external entities
4637 * and error/warning reports.
4638 *
4639 * Returns the new parser context or NULL
4640 */
4641htmlParserCtxtPtr
4642htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4643 const char *chunk, int size, const char *filename,
4644 xmlCharEncoding enc) {
4645 htmlParserCtxtPtr ctxt;
4646 htmlParserInputPtr inputStream;
4647 xmlParserInputBufferPtr buf;
4648
4649 buf = xmlAllocParserInputBuffer(enc);
4650 if (buf == NULL) return(NULL);
4651
4652 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4653 if (ctxt == NULL) {
4654 xmlFree(buf);
4655 return(NULL);
4656 }
4657 memset(ctxt, 0, sizeof(htmlParserCtxt));
4658 htmlInitParserCtxt(ctxt);
4659 if (sax != NULL) {
4660 if (ctxt->sax != &htmlDefaultSAXHandler)
4661 xmlFree(ctxt->sax);
4662 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4663 if (ctxt->sax == NULL) {
4664 xmlFree(buf);
4665 xmlFree(ctxt);
4666 return(NULL);
4667 }
4668 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4669 if (user_data != NULL)
4670 ctxt->userData = user_data;
4671 }
4672 if (filename == NULL) {
4673 ctxt->directory = NULL;
4674 } else {
4675 ctxt->directory = xmlParserGetDirectory(filename);
4676 }
4677
4678 inputStream = htmlNewInputStream(ctxt);
4679 if (inputStream == NULL) {
4680 xmlFreeParserCtxt(ctxt);
4681 return(NULL);
4682 }
4683
4684 if (filename == NULL)
4685 inputStream->filename = NULL;
4686 else
4687 inputStream->filename = xmlMemStrdup(filename);
4688 inputStream->buf = buf;
4689 inputStream->base = inputStream->buf->buffer->content;
4690 inputStream->cur = inputStream->buf->buffer->content;
4691
4692 inputPush(ctxt, inputStream);
4693
4694 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4695 (ctxt->input->buf != NULL)) {
4696 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4697#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004698 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004699#endif
4700 }
4701
4702 return(ctxt);
4703}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004704
4705/**
4706 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004707 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004708 * @encoding: a free form C string describing the HTML document encoding, or NULL
4709 * @sax: the SAX handler block
4710 * @userData: if using SAX, this pointer will be provided on callbacks.
4711 *
4712 * parse an HTML in-memory document and build a tree.
4713 * It use the given SAX function block to handle the parsing callback.
4714 * If sax is NULL, fallback to the default DOM tree building routines.
4715 *
4716 * Returns the resulting document tree
4717 */
4718
4719htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004720htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004721 htmlDocPtr ret;
4722 htmlParserCtxtPtr ctxt;
4723
4724 if (cur == NULL) return(NULL);
4725
4726
4727 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4728 if (ctxt == NULL) return(NULL);
4729 if (sax != NULL) {
4730 ctxt->sax = sax;
4731 ctxt->userData = userData;
4732 }
4733
4734 htmlParseDocument(ctxt);
4735 ret = ctxt->myDoc;
4736 if (sax != NULL) {
4737 ctxt->sax = NULL;
4738 ctxt->userData = NULL;
4739 }
4740 htmlFreeParserCtxt(ctxt);
4741
4742 return(ret);
4743}
4744
4745/**
4746 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004747 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004748 * @encoding: a free form C string describing the HTML document encoding, or NULL
4749 *
4750 * parse an HTML in-memory document and build a tree.
4751 *
4752 * Returns the resulting document tree
4753 */
4754
4755htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004756htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004757 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4758}
4759
4760
4761/**
4762 * htmlCreateFileParserCtxt :
4763 * @filename: the filename
4764 * @encoding: a free form C string describing the HTML document encoding, or NULL
4765 *
4766 * Create a parser context for a file content.
4767 * Automatic support for ZLIB/Compress compressed document is provided
4768 * by default if found at compile-time.
4769 *
4770 * Returns the new parser context or NULL
4771 */
4772htmlParserCtxtPtr
4773htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4774{
4775 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004776 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004777 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004778 /* htmlCharEncoding enc; */
4779
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004780 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4781 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004782
Daniel Veillard6454aec1999-09-02 22:04:43 +00004783 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004784 if (ctxt == NULL) {
4785 perror("malloc");
4786 return(NULL);
4787 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004788 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004789 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00004790 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004791 if (inputStream == NULL) {
4792 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00004793 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004794 return(NULL);
4795 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004796 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004797
Daniel Veillard6454aec1999-09-02 22:04:43 +00004798 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004799 inputStream->line = 1;
4800 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004801 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00004802 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004803
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004804 inputStream->base = inputStream->buf->buffer->content;
4805 inputStream->cur = inputStream->buf->buffer->content;
4806 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004807
4808 inputPush(ctxt, inputStream);
4809 return(ctxt);
4810}
4811
4812/**
4813 * htmlSAXParseFile :
4814 * @filename: the filename
4815 * @encoding: a free form C string describing the HTML document encoding, or NULL
4816 * @sax: the SAX handler block
4817 * @userData: if using SAX, this pointer will be provided on callbacks.
4818 *
4819 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4820 * compressed document is provided by default if found at compile-time.
4821 * It use the given SAX function block to handle the parsing callback.
4822 * If sax is NULL, fallback to the default DOM tree building routines.
4823 *
4824 * Returns the resulting document tree
4825 */
4826
4827htmlDocPtr
4828htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4829 void *userData) {
4830 htmlDocPtr ret;
4831 htmlParserCtxtPtr ctxt;
Daniel Veillard87b95392000-08-12 21:12:04 +00004832 htmlSAXHandlerPtr oldsax = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004833
4834 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4835 if (ctxt == NULL) return(NULL);
4836 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004837 oldsax = ctxt->sax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004838 ctxt->sax = sax;
4839 ctxt->userData = userData;
4840 }
4841
4842 htmlParseDocument(ctxt);
4843
4844 ret = ctxt->myDoc;
4845 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004846 ctxt->sax = oldsax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004847 ctxt->userData = NULL;
4848 }
4849 htmlFreeParserCtxt(ctxt);
4850
4851 return(ret);
4852}
4853
4854/**
4855 * htmlParseFile :
4856 * @filename: the filename
4857 * @encoding: a free form C string describing the HTML document encoding, or NULL
4858 *
4859 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4860 * compressed document is provided by default if found at compile-time.
4861 *
4862 * Returns the resulting document tree
4863 */
4864
4865htmlDocPtr
4866htmlParseFile(const char *filename, const char *encoding) {
4867 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4868}
Daniel Veillard361d8452000-04-03 19:48:13 +00004869
4870#endif /* LIBXML_HTML_ENABLED */