blob: a395c7cde9f87702a469e12e203e15debb39c4ef [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillardb71379b2000-10-09 12:30:39 +000015#include <libxml/xmlversion.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000016#ifdef LIBXML_HTML_ENABLED
Daniel Veillardbe70ff71999-07-05 16:50:46 +000017#include <stdio.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000018#include <string.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000019#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000023#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000024#endif
25#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000026#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000027#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000028#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
Daniel Veillard361d8452000-04-03 19:48:13 +000038#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
Daniel Veillardaaf58b92000-10-06 14:07:26 +000040#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
Daniel Veillardb71379b2000-10-09 12:30:39 +000042#include <libxml/xmlerror.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000043#include <libxml/HTMLparser.h>
44#include <libxml/entities.h>
45#include <libxml/encoding.h>
46#include <libxml/valid.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000047#include <libxml/xmlIO.h>
Daniel Veillarde2d034d1999-07-27 19:52:06 +000048
/* Size limits used by the HTML parser. */
#define HTML_MAX_NAMELEN		1000
#define HTML_PARSER_BIG_BUFFER_SIZE	1000
#define HTML_PARSER_BUFFER_SIZE		100

/* Compile-time tracing switches, left disabled. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * Global flag, 1 by default.
 * NOTE(review): its readers are outside this part of the file.
 */
int htmlOmittedDefaultValue = 1;
57
Daniel Veillard5233ffc1999-07-06 22:25:25 +000058/************************************************************************
59 * *
60 * Parser stacks related functions and macros *
61 * *
62 ************************************************************************/
63
64/*
65 * Generic function for accessing stacks in the Parser Context
66 */
67
Daniel Veillarddbfd6411999-12-28 16:35:14 +000068#define PUSH_AND_POP(scope, type, name) \
69scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000070 if (ctxt->name##Nr >= ctxt->name##Max) { \
71 ctxt->name##Max *= 2; \
Daniel Veillard32bc74e2000-07-14 14:49:25 +000072 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000073 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74 if (ctxt->name##Tab == NULL) { \
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +000075 xmlGenericError(xmlGenericErrorContext, \
76 "realloc failed !\n"); \
Daniel Veillard0142b842000-01-14 14:45:24 +000077 return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000078 } \
79 } \
80 ctxt->name##Tab[ctxt->name##Nr] = value; \
81 ctxt->name = value; \
82 return(ctxt->name##Nr++); \
83} \
Daniel Veillarddbfd6411999-12-28 16:35:14 +000084scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000085 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000086 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000087 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000088 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000089 if (ctxt->name##Nr > 0) \
90 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
91 else \
92 ctxt->name = NULL; \
93 ret = ctxt->name##Tab[ctxt->name##Nr]; \
94 ctxt->name##Tab[ctxt->name##Nr] = 0; \
95 return(ret); \
96} \
97
Daniel Veillarddbfd6411999-12-28 16:35:14 +000098PUSH_AND_POP(extern, xmlNodePtr, node)
99PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000100
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them:
 *
 *   CUR_PTR  returns the current pointer to the xmlChar to be parsed.
 *   CUR      returns the current xmlChar value, i.e. an 8 bit value if
 *            compiled in ISO-Latin or UTF-8, and the current 16 bit value if
 *            compiled in UNICODE mode. This should be used internally by the
 *            parser only to compare to ASCII values, otherwise it would break
 *            when running with UTF-8 encoding.
 *   NXT(n)   returns the n'th next xmlChar. Same as CUR, it should be used
 *            only to compare on ASCII based substrings.
 *   UPP(n)   returns the n'th next xmlChar converted to uppercase. Same as
 *            CUR, it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skips n xmlChar, and must also be used only to skip ASCII
 *            defined strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding:
 *
 *   CURRENT  returns the current char value, with the full decoding of
 *            UTF-8 if we are using this mode. It returns an int.
 *   NEXT     skips to the next character, this does the proper decoding
 *            in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copies one char to *to, incrementing CUR_PTR and to accordingly.
 */
128
/*
 * All the macros below expect a variable named `ctxt` (the parser context)
 * to be in scope at the point of use.
 */

/* Current input byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes and count them in ctxt->nbChars. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* The val'th byte after the current position. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself (usable as an lvalue). */
#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

/* Step to the next character through xmlNextChar() and count it. */
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* Current raw byte, or -1 while a token is pending in ctxt->token. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Consume one character that is l bytes long: keep line/col up to date,
 * clear any pending token, and count a single character in nbChars.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Decode the current character; its byte length is stored into l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v (l bytes long once encoded) to buffer b at index i,
 * advancing i. Wrapped in do/while(0) so that it behaves as one statement
 * inside an unbraced if/else at the call site.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000177
178/**
179 * htmlCurrentChar:
180 * @ctxt: the HTML parser context
181 * @len: pointer to the length of the char read
182 *
183 * The current char value, if using UTF-8 this may actaully span multiple
184 * bytes in the input buffer. Implement the end of line normalization:
185 * 2.11 End-of-Line Handling
186 * If the encoding is unspecified, in the case we find an ISO-Latin-1
187 * char, then the encoding converter is plugged in automatically.
188 *
189 * Returns the current char value and its lenght
190 */
191
192int
193htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
194 if (ctxt->instate == XML_PARSER_EOF)
195 return(0);
196
197 if (ctxt->token != 0) {
198 *len = 0;
199 return(ctxt->token);
200 }
201 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
202 /*
203 * We are supposed to handle UTF8, check it's valid
204 * From rfc2044: encoding of the Unicode values on UTF-8:
205 *
206 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
207 * 0000 0000-0000 007F 0xxxxxxx
208 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
209 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
210 *
211 * Check for the 0x110000 limit too
212 */
213 const unsigned char *cur = ctxt->input->cur;
214 unsigned char c;
215 unsigned int val;
216
217 c = *cur;
218 if (c & 0x80) {
219 if (cur[1] == 0)
220 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
221 if ((cur[1] & 0xc0) != 0x80)
222 goto encoding_error;
223 if ((c & 0xe0) == 0xe0) {
224
225 if (cur[2] == 0)
226 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
227 if ((cur[2] & 0xc0) != 0x80)
228 goto encoding_error;
229 if ((c & 0xf0) == 0xf0) {
230 if (cur[3] == 0)
231 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
232 if (((c & 0xf8) != 0xf0) ||
233 ((cur[3] & 0xc0) != 0x80))
234 goto encoding_error;
235 /* 4-byte code */
236 *len = 4;
237 val = (cur[0] & 0x7) << 18;
238 val |= (cur[1] & 0x3f) << 12;
239 val |= (cur[2] & 0x3f) << 6;
240 val |= cur[3] & 0x3f;
241 } else {
242 /* 3-byte code */
243 *len = 3;
244 val = (cur[0] & 0xf) << 12;
245 val |= (cur[1] & 0x3f) << 6;
246 val |= cur[2] & 0x3f;
247 }
248 } else {
249 /* 2-byte code */
250 *len = 2;
251 val = (cur[0] & 0x1f) << 6;
252 val |= cur[1] & 0x3f;
253 }
254 if (!IS_CHAR(val)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +0000255 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000256 if ((ctxt->sax != NULL) &&
257 (ctxt->sax->error != NULL))
258 ctxt->sax->error(ctxt->userData,
259 "Char 0x%X out of allowed range\n", val);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000260 ctxt->wellFormed = 0;
261 ctxt->disableSAX = 1;
262 }
263 return(val);
264 } else {
265 /* 1-byte code */
266 *len = 1;
267 return((int) *ctxt->input->cur);
268 }
269 }
270 /*
271 * Assume it's a fixed lenght encoding (1) with
272 * a compatibke encoding for the ASCII set, since
273 * XML constructs only use < 128 chars
274 */
275 *len = 1;
276 if ((int) *ctxt->input->cur < 0x80)
277 return((int) *ctxt->input->cur);
278
279 /*
280 * Humm this is bad, do an automatic flow conversion
281 */
282 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
283 ctxt->charset = XML_CHAR_ENCODING_UTF8;
284 return(xmlCurrentChar(ctxt, len));
285
286encoding_error:
287 /*
288 * If we detect an UTF8 error that probably mean that the
289 * input encoding didn't get properly advertized in the
290 * declaration header. Report the error and switch the encoding
291 * to ISO-Latin-1 (if you don't like this policy, just declare the
292 * encoding !)
293 */
Daniel Veillarda2c6da92000-09-16 18:15:00 +0000294 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000295 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
296 ctxt->sax->error(ctxt->userData,
297 "Input is not proper UTF-8, indicate encoding !\n");
298 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
299 ctxt->input->cur[0], ctxt->input->cur[1],
300 ctxt->input->cur[2], ctxt->input->cur[3]);
301 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000302
303 ctxt->charset = XML_CHAR_ENCODING_8859_1;
304 *len = 1;
305 return((int) *ctxt->input->cur);
306}
307
Daniel Veillardcf461992000-03-14 18:30:20 +0000308/**
309 * htmlNextChar:
310 * @ctxt: the HTML parser context
311 *
312 * Skip to the next char input char.
313 */
314
315void
316htmlNextChar(htmlParserCtxtPtr ctxt) {
Daniel Veillard3f6f7f62000-06-30 17:58:25 +0000317 if (ctxt->instate == XML_PARSER_EOF)
318 return;
Daniel Veillardcf461992000-03-14 18:30:20 +0000319 if ((*ctxt->input->cur == 0) &&
320 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
321 xmlPopInput(ctxt);
322 } else {
323 if (*(ctxt->input->cur) == '\n') {
324 ctxt->input->line++; ctxt->input->col = 1;
325 } else ctxt->input->col++;
326 ctxt->input->cur++;
327 ctxt->nbChars++;
328 if (*ctxt->input->cur == 0)
329 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
330 }
331}
332
333/**
334 * htmlSkipBlankChars:
335 * @ctxt: the HTML parser context
336 *
337 * skip all blanks character found at that point in the input streams.
338 *
339 * Returns the number of space chars skipped
340 */
341
342int
343htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
344 int res = 0;
345
346 while (IS_BLANK(*(ctxt->input->cur))) {
347 if ((*ctxt->input->cur == 0) &&
348 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
349 xmlPopInput(ctxt);
350 } else {
351 if (*(ctxt->input->cur) == '\n') {
352 ctxt->input->line++; ctxt->input->col = 1;
353 } else ctxt->input->col++;
354 ctxt->input->cur++;
355 ctxt->nbChars++;
356 if (*ctxt->input->cur == 0)
357 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
358 }
359 res++;
360 }
361 return(res);
362}
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000363
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000364
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000365
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000366/************************************************************************
367 * *
368 * The list of HTML elements and their properties *
369 * *
370 ************************************************************************/
371
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000372/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000373 * Start Tag: 1 means the start tag can be ommited
374 * End Tag: 1 means the end tag can be ommited
375 * 2 means it's forbidden (empty elements)
376 * Depr: this element is deprecated
377 * DTD: 1 means that this element is valid only in the Loose DTD
378 * 2 means that this element is valid only in the Frameset DTD
379 *
380 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000381 */
382htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000383{ "a", 0, 0, 0, 0, 0, "anchor " },
384{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
385{ "acronym", 0, 0, 0, 0, 0, "" },
386{ "address", 0, 0, 0, 0, 0, "information on author " },
387{ "applet", 0, 0, 0, 1, 1, "java applet " },
388{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
389{ "b", 0, 0, 0, 0, 0, "bold text style" },
390{ "base", 0, 2, 1, 0, 0, "document base uri " },
391{ "basefont", 0, 2, 1, 1, 1, "base font size " },
392{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
393{ "big", 0, 0, 0, 0, 0, "large text style" },
394{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
395{ "body", 1, 1, 0, 0, 0, "document body " },
396{ "br", 0, 2, 1, 0, 0, "forced line break " },
397{ "button", 0, 0, 0, 0, 0, "push button " },
398{ "caption", 0, 0, 0, 0, 0, "table caption " },
399{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
400{ "cite", 0, 0, 0, 0, 0, "citation" },
401{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
402{ "col", 0, 2, 1, 0, 0, "table column " },
403{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
404{ "dd", 0, 1, 0, 0, 0, "definition description " },
405{ "del", 0, 0, 0, 0, 0, "deleted text " },
406{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
407{ "dir", 0, 0, 0, 1, 1, "directory list" },
408{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
409{ "dl", 0, 0, 0, 0, 0, "definition list " },
410{ "dt", 0, 1, 0, 0, 0, "definition term " },
411{ "em", 0, 0, 0, 0, 0, "emphasis" },
412{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
413{ "font", 0, 0, 0, 1, 1, "local change to font " },
414{ "form", 0, 0, 0, 0, 0, "interactive form " },
415{ "frame", 0, 2, 1, 0, 2, "subwindow " },
416{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
417{ "h1", 0, 0, 0, 0, 0, "heading " },
418{ "h2", 0, 0, 0, 0, 0, "heading " },
419{ "h3", 0, 0, 0, 0, 0, "heading " },
420{ "h4", 0, 0, 0, 0, 0, "heading " },
421{ "h5", 0, 0, 0, 0, 0, "heading " },
422{ "h6", 0, 0, 0, 0, 0, "heading " },
423{ "head", 1, 1, 0, 0, 0, "document head " },
424{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
425{ "html", 1, 1, 0, 0, 0, "document root element " },
426{ "i", 0, 0, 0, 0, 0, "italic text style" },
427{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
428{ "img", 0, 2, 1, 0, 0, "embedded image " },
429{ "input", 0, 2, 1, 0, 0, "form control " },
430{ "ins", 0, 0, 0, 0, 0, "inserted text" },
431{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
432{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
433{ "label", 0, 0, 0, 0, 0, "form field label text " },
434{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
435{ "li", 0, 1, 0, 0, 0, "list item " },
436{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
437{ "map", 0, 0, 0, 0, 0, "client-side image map " },
438{ "menu", 0, 0, 0, 1, 1, "menu list " },
439{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
440{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
441{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
442{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
443{ "ol", 0, 0, 0, 0, 0, "ordered list " },
444{ "optgroup", 0, 0, 0, 0, 0, "option group " },
445{ "option", 0, 1, 0, 0, 0, "selectable choice " },
446{ "p", 0, 1, 0, 0, 0, "paragraph " },
447{ "param", 0, 2, 1, 0, 0, "named property value " },
448{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
449{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
450{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
451{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
452{ "script", 0, 0, 0, 0, 0, "script statements " },
453{ "select", 0, 0, 0, 0, 0, "option selector " },
454{ "small", 0, 0, 0, 0, 0, "small text style" },
455{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
456{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
457{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
458{ "style", 0, 0, 0, 0, 0, "style info " },
459{ "sub", 0, 0, 0, 0, 0, "subscript" },
460{ "sup", 0, 0, 0, 0, 0, "superscript " },
461{ "table", 0, 0, 0, 0, 0, "&#160;" },
462{ "tbody", 1, 1, 0, 0, 0, "table body " },
463{ "td", 0, 1, 0, 0, 0, "table data cell" },
464{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
465{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
466{ "th", 0, 1, 0, 0, 0, "table header cell" },
467{ "thead", 0, 1, 0, 0, 0, "table header " },
468{ "title", 0, 0, 0, 0, 0, "document title " },
469{ "tr", 0, 1, 0, 0, 0, "table row " },
470{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
471{ "u", 0, 0, 0, 1, 1, "underlined text style" },
472{ "ul", 0, 0, 0, 0, 0, "unordered list " },
473{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000474};
475
/*
 * Start tags that imply the end of a current element.
 *
 * The array is a sequence of NULL terminated groups, closed by one more
 * NULL: any tag of a group implies the end of the current element if the
 * type of that element belongs to the same group.
 *
 * According to the HTML DTD, HR should be added to the heading group, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
char *htmlEquEnd[] = {
    "dt", "dd", "li", "option",
    NULL,

    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,

    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,

    NULL
};
492
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL terminated groups, closed by one more
 * NULL. The first string of a group is the name of a new start tag, the
 * following ones are the open elements which that start tag closes
 * (see htmlCheckAutoClose()).
 */
char *htmlStartClose[] = {
    "form",
	"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
	"dl", "ul", "ol", "menu", "dir", "address", "pre",
	"listing", "xmp", "head",
	NULL,
    "head",
	"p",
	NULL,
    "title",
	"p",
	NULL,
    "body",
	"head", "style", "link", "title", "p",
	NULL,
    "li",
	"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
	"pre", "listing", "xmp", "head", "li",
	NULL,
    "hr",
	"p", "head",
	NULL,
    "h1",
	"p", "head",
	NULL,
    "h2",
	"p", "head",
	NULL,
    "h3",
	"p", "head",
	NULL,
    "h4",
	"p", "head",
	NULL,
    "h5",
	"p", "head",
	NULL,
    "h6",
	"p", "head",
	NULL,
    "dir",
	"p", "head",
	NULL,
    "address",
	"p", "head", "ul",
	NULL,
    "pre",
	"p", "head", "ul",
	NULL,
    "listing",
	"p", "head",
	NULL,
    "xmp",
	"p", "head",
	NULL,
    "blockquote",
	"p", "head",
	NULL,
    "dl",
	"p", "dt", "menu", "dir", "address", "pre", "listing",
	"xmp", "head",
	NULL,
    "dt",
	"p", "menu", "dir", "address", "pre", "listing", "xmp",
	"head", "dd",
	NULL,
    "dd",
	"p", "menu", "dir", "address", "pre", "listing", "xmp",
	"head", "dt",
	NULL,
    "ul",
	"p", "head", "ol", "menu", "dir", "address", "pre",
	"listing", "xmp",
	NULL,
    "ol",
	"p", "head", "ul",
	NULL,
    "menu",
	"p", "head", "ul",
	NULL,
    "p",
	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
	NULL,
    "div",
	"p", "head",
	NULL,
    "noscript",
	"p", "head",
	NULL,
    "center",
	"font", "b", "i", "p", "head",
	NULL,
    "a",
	"a",
	NULL,
    "caption",
	"p",
	NULL,
    "colgroup",
	"caption", "colgroup", "col", "p",
	NULL,
    "col",
	"caption", "col", "p",
	NULL,
    "table",
	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
	"listing", "xmp", "a",
	NULL,
    "th",
	"th", "td",
	NULL,
    "td",
	"th", "td", "p",
	NULL,
    "tr",
	"th", "td", "tr", "caption", "col", "colgroup", "p",
	NULL,
    "thead",
	"caption", "col", "colgroup",
	NULL,
    "tfoot",
	"th", "td", "tr", "caption", "col", "colgroup", "thead",
	"tbody", "p",
	NULL,
    "tbody",
	"th", "td", "tr", "caption", "col", "colgroup", "thead",
	"tfoot", "tbody", "p",
	NULL,
    "optgroup",
	"option",
	NULL,
    "option",
	"option",
	NULL,
    "fieldset",
	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
	"pre", "listing", "xmp", "a",
	NULL,

    NULL
};
552
/*
 * NULL terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
566
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The list is not NULL terminated, its size has to be computed with
 * sizeof. The form reset event is spelled "onreset"; the former "onrest"
 * entry was a typo which could never match a real attribute name.
 */
static char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
592
593
Daniel Veillardb96e6431999-08-29 21:02:19 +0000594static char** htmlStartCloseIndex[100];
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000595static int htmlStartCloseIndexinitialized = 0;
596
597/************************************************************************
598 * *
599 * functions to handle HTML specific data *
600 * *
601 ************************************************************************/
602
603/**
604 * htmlInitAutoClose:
605 *
606 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
Daniel Veillardbc765302000-10-01 18:23:35 +0000607 * This is not reentrant. Call xmlInitParser() once before processing in
608 * case of use in multithreaded programs.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000609 */
610void
611htmlInitAutoClose(void) {
612 int index, i = 0;
613
614 if (htmlStartCloseIndexinitialized) return;
615
616 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
617 index = 0;
618 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
619 htmlStartCloseIndex[index++] = &htmlStartClose[i];
620 while (htmlStartClose[i] != NULL) i++;
621 i++;
622 }
Daniel Veillardbc765302000-10-01 18:23:35 +0000623 htmlStartCloseIndexinitialized = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000624}
625
626/**
627 * htmlTagLookup:
Daniel Veillardb656ebe2000-09-22 13:51:48 +0000628 * @tag: The tag name in lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000629 *
630 * Lookup the HTML tag in the ElementTable
631 *
632 * Returns the related htmlElemDescPtr or NULL if not found.
633 */
634htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000635htmlTagLookup(const xmlChar *tag) {
Daniel Veillard47f3f312000-08-27 22:40:15 +0000636 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000637
638 for (i = 0; i < (sizeof(html40ElementTable) /
639 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000640 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000641 return(&html40ElementTable[i]);
642 }
643 return(NULL);
644}
645
646/**
647 * htmlCheckAutoClose:
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000648 * @newtag: The new tag name
649 * @oldtag: The old tag name
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000650 *
651 * Checks wether the new tag is one of the registered valid tags for closing old.
652 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
653 *
654 * Returns 0 if no, 1 if yes.
655 */
656int
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000657htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000658 int i, index;
Daniel Veillard39c7d712000-09-10 16:14:55 +0000659 char **close = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000660
661 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
662
663 /* inefficient, but not a big deal */
664 for (index = 0; index < 100;index++) {
665 close = htmlStartCloseIndex[index];
666 if (close == NULL) return(0);
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000667 if (xmlStrEqual(BAD_CAST *close, newtag)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000668 }
669
670 i = close - htmlStartClose;
671 i++;
672 while (htmlStartClose[i] != NULL) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000673 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000674 return(1);
675 }
676 i++;
677 }
678 return(0);
679}
680
681/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000682 * htmlAutoCloseOnClose:
683 * @ctxt: an HTML parser context
684 * @newtag: The new tag name
685 *
686 * The HTmL DtD allows an ending tag to implicitely close other tags.
687 */
688void
689htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
690 htmlElemDescPtr info;
691 xmlChar *oldname;
692 int i;
693
694#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000695 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000696 for (i = 0;i < ctxt->nameNr;i++)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000697 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000698#endif
699
700 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000701 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000702 }
703 if (i < 0) return;
704
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000705 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000706 info = htmlTagLookup(ctxt->name);
707 if ((info == NULL) || (info->endTag == 1)) {
708#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000709 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000710#endif
711 } else {
712 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
713 ctxt->sax->error(ctxt->userData,
714 "Opening and ending tag mismatch: %s and %s\n",
715 newtag, ctxt->name);
716 ctxt->wellFormed = 0;
717 }
718 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
719 ctxt->sax->endElement(ctxt->userData, ctxt->name);
720 oldname = htmlnamePop(ctxt);
721 if (oldname != NULL) {
722#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000723 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000724#endif
725 xmlFree(oldname);
726 }
727 }
728}
729
730/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000731 * htmlAutoClose:
732 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000733 * @newtag: The new tag name or NULL
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000734 *
735 * The HTmL DtD allows a tag to implicitely close other tags.
736 * The list is kept in htmlStartClose array. This function is
737 * called when a new tag has been detected and generates the
738 * appropriates closes if possible/needed.
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000739 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillard365e13b2000-07-02 07:56:37 +0000740 * and we should check
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000741 */
742void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000743htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000744 xmlChar *oldname;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000745 while ((newtag != NULL) && (ctxt->name != NULL) &&
746 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000747#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000748 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000749#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000750 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000751 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000752 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000753 if (oldname != NULL) {
754#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000755 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000756#endif
757 xmlFree(oldname);
758 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000759 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000760 if (newtag == NULL) {
761 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
762 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
763 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
764 }
765 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000766 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
767 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
768 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
Daniel Veillard365e13b2000-07-02 07:56:37 +0000769#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000770 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
Daniel Veillard365e13b2000-07-02 07:56:37 +0000771#endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000777 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
Daniel Veillard365e13b2000-07-02 07:56:37 +0000778#endif
779 xmlFree(oldname);
780 }
781 }
782
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000783}
784
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000785/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000786 * htmlAutoCloseTag:
787 * @doc: the HTML document
788 * @name: The tag name
789 * @elem: the HTML element
790 *
791 * The HTmL DtD allows a tag to implicitely close other tags.
792 * The list is kept in htmlStartClose array. This function checks
793 * if the element or one of it's children would autoclose the
794 * given tag.
795 *
796 * Returns 1 if autoclose, 0 otherwise
797 */
798int
799htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
800 htmlNodePtr child;
801
802 if (elem == NULL) return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000803 if (xmlStrEqual(name, elem->name)) return(0);
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000804 if (htmlCheckAutoClose(elem->name, name)) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000805 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000806 while (child != NULL) {
807 if (htmlAutoCloseTag(doc, name, child)) return(1);
808 child = child->next;
809 }
810 return(0);
811}
812
813/**
814 * htmlIsAutoClosed:
815 * @doc: the HTML document
816 * @elem: the HTML element
817 *
818 * The HTmL DtD allows a tag to implicitely close other tags.
819 * The list is kept in htmlStartClose array. This function checks
820 * if a tag is autoclosed by one of it's child
821 *
822 * Returns 1 if autoclosed, 0 otherwise
823 */
824int
825htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
826 htmlNodePtr child;
827
828 if (elem == NULL) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000829 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000830 while (child != NULL) {
831 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
832 child = child->next;
833 }
834 return(0);
835}
836
837/**
Daniel Veillardbe803962000-06-28 23:40:59 +0000838 * htmlCheckImplied:
839 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000840 * @newtag: The new tag name
Daniel Veillardbe803962000-06-28 23:40:59 +0000841 *
Daniel Veillarda6d8eb62000-12-27 10:46:47 +0000842 * The HTML DtD allows a tag to exists only implicitely
Daniel Veillardbe803962000-06-28 23:40:59 +0000843 * called when a new tag has been detected and generates the
844 * appropriates implicit tags if missing
845 */
846void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000847htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillarda6d8eb62000-12-27 10:46:47 +0000848 if (!htmlOmittedDefaultValue)
849 return;
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000850 if (xmlStrEqual(newtag, BAD_CAST"html"))
Daniel Veillardbe803962000-06-28 23:40:59 +0000851 return;
852 if (ctxt->nameNr <= 0) {
853#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000854 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
Daniel Veillardbe803962000-06-28 23:40:59 +0000855#endif
856 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
857 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
858 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
859 }
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000860 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
Daniel Veillardbe803962000-06-28 23:40:59 +0000861 return;
Daniel Veillardf62ceff2000-11-24 23:36:01 +0000862 if ((ctxt->nameNr <= 1) &&
863 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
864 (xmlStrEqual(newtag, BAD_CAST"style")) ||
865 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
866 (xmlStrEqual(newtag, BAD_CAST"link")) ||
867 (xmlStrEqual(newtag, BAD_CAST"title")) ||
868 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillardbe803962000-06-28 23:40:59 +0000869 /*
870 * dropped OBJECT ... i you put it first BODY will be
871 * assumed !
872 */
873#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000874 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
Daniel Veillardbe803962000-06-28 23:40:59 +0000875#endif
876 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
877 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
878 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Daniel Veillardf62ceff2000-11-24 23:36:01 +0000879 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
880 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
881 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
882 int i;
883 for (i = 0;i < ctxt->nameNr;i++) {
884 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
885 return;
886 }
887 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
888 return;
889 }
Daniel Veillardbe803962000-06-28 23:40:59 +0000890 }
Daniel Veillardf62ceff2000-11-24 23:36:01 +0000891
892#ifdef DEBUG
893 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
894#endif
895 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
896 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
897 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
Daniel Veillardbe803962000-06-28 23:40:59 +0000898 }
899}
900
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000901/**
902 * htmlCheckParagraph
903 * @ctxt: an HTML parser context
904 *
905 * Check whether a p element need to be implied before inserting
906 * characters in the current element.
907 *
908 * Returns 1 if a paragraph has been inserted, 0 if not and -1
909 * in case of error.
910 */
911
912int
913htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
914 const xmlChar *tag;
915 int i;
916
917 if (ctxt == NULL)
918 return(-1);
919 tag = ctxt->name;
920 if (tag == NULL) {
921 htmlAutoClose(ctxt, BAD_CAST"p");
922 htmlCheckImplied(ctxt, BAD_CAST"p");
923 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
924 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
925 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
926 return(1);
927 }
Daniel Veillard45cff692001-01-03 18:02:04 +0000928 if (!htmlOmittedDefaultValue)
929 return;
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000930 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000931 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000932#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000933 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000934#endif
935 htmlAutoClose(ctxt, BAD_CAST"p");
936 htmlCheckImplied(ctxt, BAD_CAST"p");
937 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
938 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
939 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
940 return(1);
941 }
942 }
943 return(0);
944}
945
Daniel Veillard47e12f22000-10-15 14:24:25 +0000946/**
947 * htmlIsScriptAttribute:
948 * @name: an attribute name
949 *
950 * Check if an attribute is of content type Script
951 *
952 * Returns 1 is the attribute is a script 0 otherwise
953 */
954int
955htmlIsScriptAttribute(const xmlChar *name) {
956 int i;
957
958 if (name == NULL)
959 return(0);
960 /*
961 * all script attributes start with 'on'
962 */
963 if ((name[0] != 'o') || (name[1] != 'n'))
964 return(0);
965 for (i = 0;
966 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
967 i++) {
968 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
969 return(1);
970 }
971 return(0);
972}
973
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000974/************************************************************************
975 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000976 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000977 * *
978 ************************************************************************/
979
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000980
981htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000982/*
Daniel Veillard47f3f312000-08-27 22:40:15 +0000983 * the 4 absolute ones, plus apostrophe.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000984 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000985{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
986{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard47f3f312000-08-27 22:40:15 +0000987{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000988{ 60, "lt", "less-than sign, U+003C ISOnum" },
989{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000990
991/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000992 * A bunch still in the 128-255 range
993 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000994 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000995{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
996{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
997{ 162, "cent", "cent sign, U+00A2 ISOnum" },
998{ 163, "pound","pound sign, U+00A3 ISOnum" },
999{ 164, "curren","currency sign, U+00A4 ISOnum" },
1000{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1001{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1002{ 167, "sect", "section sign, U+00A7 ISOnum" },
1003{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1004{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1005{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1006{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1007{ 172, "not", "not sign, U+00AC ISOnum" },
1008{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1009{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1010{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1011{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1012{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1013{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1014{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1015{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1016{ 181, "micro","micro sign, U+00B5 ISOnum" },
1017{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001018{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001019{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1020{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1021{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001022{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001023{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1024{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1025{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1026{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1027{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1028{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1029{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1030{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1031{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1032{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1033{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1034{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1035{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1036{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1037{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1038{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1039{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1040{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1041{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1042{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1043{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1044{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1045{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1046{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1047{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1048{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1049{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1050{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001051{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001052{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1053{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1054{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1055{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1056{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1057{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1058{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1059{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1060{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1061{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1062{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1063{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1064{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1065{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1066{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1067{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1068{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1069{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1070{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1071{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1072{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1073{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1074{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1075{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1076{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1077{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1078{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1079{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1080{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1081{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1082{ 247, "divide","division sign, U+00F7 ISOnum" },
1083{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1084{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1085{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1086{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1087{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1088{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1089{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1090{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001091
Daniel Veillard47f3f312000-08-27 22:40:15 +00001092{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1093{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1094{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1095{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1096{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1097
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001098/*
1099 * Anything below should really be kept as entities references
1100 */
1101{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001102
Daniel Veillard47f3f312000-08-27 22:40:15 +00001103{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1104{ 732, "tilde","small tilde, U+02DC ISOdia" },
1105
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001106{ 913, "Alpha","greek capital letter alpha, U+0391" },
1107{ 914, "Beta", "greek capital letter beta, U+0392" },
1108{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1109{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1110{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1111{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1112{ 919, "Eta", "greek capital letter eta, U+0397" },
1113{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1114{ 921, "Iota", "greek capital letter iota, U+0399" },
1115{ 922, "Kappa","greek capital letter kappa, U+039A" },
1116{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1117{ 924, "Mu", "greek capital letter mu, U+039C" },
1118{ 925, "Nu", "greek capital letter nu, U+039D" },
1119{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1120{ 927, "Omicron","greek capital letter omicron, U+039F" },
1121{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1122{ 929, "Rho", "greek capital letter rho, U+03A1" },
1123{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1124{ 932, "Tau", "greek capital letter tau, U+03A4" },
1125{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1126{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1127{ 935, "Chi", "greek capital letter chi, U+03A7" },
1128{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1129{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001130
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001131{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1132{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1133{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1134{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1135{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1136{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1137{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1138{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1139{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1140{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1141{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1142{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1143{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1144{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1145{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1146{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1147{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1148{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1149{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1150{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1151{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1152{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1153{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1154{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1155{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1156{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1157{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1158{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001159
Daniel Veillard47f3f312000-08-27 22:40:15 +00001160{ 8194, "ensp", "en space, U+2002 ISOpub" },
1161{ 8195, "emsp", "em space, U+2003 ISOpub" },
1162{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1163{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1164{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1165{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1166{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1167{ 8211, "ndash","en dash, U+2013 ISOpub" },
1168{ 8212, "mdash","em dash, U+2014 ISOpub" },
1169{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1170{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1171{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1172{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1173{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1174{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1175{ 8224, "dagger","dagger, U+2020 ISOpub" },
1176{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1177
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001178{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1179{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001180
1181{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1182
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001183{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1184{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001185
1186{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1187{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1188
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001189{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1190{ 8260, "frasl","fraction slash, U+2044 NEW" },
1191
Daniel Veillard47f3f312000-08-27 22:40:15 +00001192{ 8364, "euro", "euro sign, U+20AC NEW" },
1193
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001194{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001195{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001196{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1197{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1198{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1199{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1200{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1201{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1202{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1203{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1204{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1205{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1206{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1207{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1208{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1209{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1210
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001211{ 8704, "forall","for all, U+2200 ISOtech" },
1212{ 8706, "part", "partial differential, U+2202 ISOtech" },
1213{ 8707, "exist","there exists, U+2203 ISOtech" },
1214{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1215{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1216{ 8712, "isin", "element of, U+2208 ISOtech" },
1217{ 8713, "notin","not an element of, U+2209 ISOtech" },
1218{ 8715, "ni", "contains as member, U+220B ISOtech" },
1219{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1220{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1221{ 8722, "minus","minus sign, U+2212 ISOtech" },
1222{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1223{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1224{ 8733, "prop", "proportional to, U+221D ISOtech" },
1225{ 8734, "infin","infinity, U+221E ISOtech" },
1226{ 8736, "ang", "angle, U+2220 ISOamso" },
1227{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1228{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1229{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1230{ 8746, "cup", "union = cup, U+222A ISOtech" },
1231{ 8747, "int", "integral, U+222B ISOtech" },
1232{ 8756, "there4","therefore, U+2234 ISOtech" },
1233{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1234{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1235{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1236{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1237{ 8801, "equiv","identical to, U+2261 ISOtech" },
1238{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1239{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1240{ 8834, "sub", "subset of, U+2282 ISOtech" },
1241{ 8835, "sup", "superset of, U+2283 ISOtech" },
1242{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1243{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1244{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1245{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1246{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1247{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1248{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1249{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1250{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1251{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1252{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1253{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1254{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1255{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1256
1257{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1258{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1259{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1260{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1261
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001262};
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001263
1264/************************************************************************
1265 * *
1266 * Commodity functions to handle entities *
1267 * *
1268 ************************************************************************/
1269
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates buffer to the new size.
 * The result of xmlRealloc() goes through a temporary so that the old
 * block is not lost on failure: in that case the error is reported, the
 * old buffer is released and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                                buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1281
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001282/**
1283 * htmlEntityLookup:
1284 * @name: the entity name
1285 *
1286 * Lookup the given entity in EntitiesTable
1287 *
1288 * TODO: the linear scan is really ugly, an hash table is really needed.
1289 *
1290 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1291 */
1292htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001293htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001294 int i;
1295
1296 for (i = 0;i < (sizeof(html40EntitiesTable)/
1297 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001298 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001299#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001300 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001301#endif
1302 return(&html40EntitiesTable[i]);
1303 }
1304 }
1305 return(NULL);
1306}
1307
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001308/**
Daniel Veillard47f3f312000-08-27 22:40:15 +00001309 * htmlEntityValueLookup:
1310 * @value: the entity's unicode value
1311 *
1312 * Lookup the given entity in EntitiesTable
1313 *
1314 * TODO: the linear scan is really ugly, an hash table is really needed.
1315 *
1316 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1317 */
1318htmlEntityDescPtr
1319htmlEntityValueLookup(int value) {
1320 int i;
1321#ifdef DEBUG
1322 int lv = 0;
1323#endif
1324
1325 for (i = 0;i < (sizeof(html40EntitiesTable)/
1326 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard4b0755c2000-09-25 14:26:28 +00001327 if ((unsigned int) html40EntitiesTable[i].value >= value) {
1328 if ((unsigned int) html40EntitiesTable[i].value > value)
Daniel Veillard47f3f312000-08-27 22:40:15 +00001329 break;
1330#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001331 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
Daniel Veillard47f3f312000-08-27 22:40:15 +00001332#endif
1333 return(&html40EntitiesTable[i]);
1334 }
1335#ifdef DEBUG
1336 if (lv > html40EntitiesTable[i].value) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001337 xmlGenericError(xmlGenericErrorContext,
1338 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
Daniel Veillard47f3f312000-08-27 22:40:15 +00001339 lv, html40EntitiesTable[i].value);
1340 }
1341 lv = html40EntitiesTable[i].value;
1342#endif
1343 }
1344 return(NULL);
1345}
1346
1347/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001348 * UTF8ToHtml:
1349 * @out: a pointer to an array of bytes to store the result
1350 * @outlen: the length of @out
1351 * @in: a pointer to an array of UTF-8 chars
1352 * @inlen: the length of @in
1353 *
1354 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1355 * plus HTML entities block of chars out.
1356 *
1357 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1358 * The value of @inlen after return is the number of octets consumed
1359 * as the return value is positive, else unpredictiable.
1360 * The value of @outlen after return is the number of octets consumed.
1361 */
1362int
1363UTF8ToHtml(unsigned char* out, int *outlen,
1364 const unsigned char* in, int *inlen) {
1365 const unsigned char* processed = in;
1366 const unsigned char* outend;
1367 const unsigned char* outstart = out;
1368 const unsigned char* instart = in;
1369 const unsigned char* inend;
1370 unsigned int c, d;
1371 int trailing;
1372
1373 if (in == NULL) {
1374 /*
1375 * initialization nothing to do
1376 */
1377 *outlen = 0;
1378 *inlen = 0;
1379 return(0);
1380 }
1381 inend = in + (*inlen);
1382 outend = out + (*outlen);
1383 while (in < inend) {
1384 d = *in++;
1385 if (d < 0x80) { c= d; trailing= 0; }
1386 else if (d < 0xC0) {
1387 /* trailing byte in leading position */
1388 *outlen = out - outstart;
1389 *inlen = processed - instart;
1390 return(-2);
1391 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1392 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1393 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1394 else {
1395 /* no chance for this in Ascii */
1396 *outlen = out - outstart;
1397 *inlen = processed - instart;
1398 return(-2);
1399 }
1400
1401 if (inend - in < trailing) {
1402 break;
1403 }
1404
1405 for ( ; trailing; trailing--) {
1406 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1407 break;
1408 c <<= 6;
1409 c |= d & 0x3F;
1410 }
1411
1412 /* assertion: c is a single UTF-4 value */
1413 if (c < 0x80) {
Daniel Veillarde010c172000-08-28 10:04:51 +00001414 if (out + 1 >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001415 break;
1416 *out++ = c;
1417 } else {
Daniel Veillard47f3f312000-08-27 22:40:15 +00001418 int len;
1419 htmlEntityDescPtr ent;
1420
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001421 /*
1422 * Try to lookup a predefined HTML entity for it
1423 */
1424
Daniel Veillard47f3f312000-08-27 22:40:15 +00001425 ent = htmlEntityValueLookup(c);
1426 if (ent == NULL) {
1427 /* no chance for this in Ascii */
1428 *outlen = out - outstart;
1429 *inlen = processed - instart;
1430 return(-2);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001431 }
Daniel Veillard47f3f312000-08-27 22:40:15 +00001432 len = strlen(ent->name);
Daniel Veillarde010c172000-08-28 10:04:51 +00001433 if (out + 2 + len >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001434 break;
1435 *out++ = '&';
Daniel Veillard47f3f312000-08-27 22:40:15 +00001436 memcpy(out, ent->name, len);
1437 out += len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001438 *out++ = ';';
1439 }
1440 processed = in;
1441 }
1442 *outlen = out - outstart;
1443 *inlen = processed - instart;
1444 return(0);
1445}
1446
Daniel Veillarde010c172000-08-28 10:04:51 +00001447/**
1448 * htmlEncodeEntities:
1449 * @out: a pointer to an array of bytes to store the result
1450 * @outlen: the length of @out
1451 * @in: a pointer to an array of UTF-8 chars
1452 * @inlen: the length of @in
1453 * @quoteChar: the quote character to escape (' or ") or zero.
1454 *
1455 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1456 * plus HTML entities block of chars out.
1457 *
1458 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1459 * The value of @inlen after return is the number of octets consumed
1460 * as the return value is positive, else unpredictiable.
1461 * The value of @outlen after return is the number of octets consumed.
1462 */
1463int
1464htmlEncodeEntities(unsigned char* out, int *outlen,
1465 const unsigned char* in, int *inlen, int quoteChar) {
1466 const unsigned char* processed = in;
1467 const unsigned char* outend = out + (*outlen);
1468 const unsigned char* outstart = out;
1469 const unsigned char* instart = in;
1470 const unsigned char* inend = in + (*inlen);
1471 unsigned int c, d;
1472 int trailing;
1473
1474 while (in < inend) {
1475 d = *in++;
1476 if (d < 0x80) { c= d; trailing= 0; }
1477 else if (d < 0xC0) {
1478 /* trailing byte in leading position */
1479 *outlen = out - outstart;
1480 *inlen = processed - instart;
1481 return(-2);
1482 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1483 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1484 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1485 else {
1486 /* no chance for this in Ascii */
1487 *outlen = out - outstart;
1488 *inlen = processed - instart;
1489 return(-2);
1490 }
1491
1492 if (inend - in < trailing)
1493 break;
1494
1495 while (trailing--) {
1496 if (((d= *in++) & 0xC0) != 0x80) {
1497 *outlen = out - outstart;
1498 *inlen = processed - instart;
1499 return(-2);
1500 }
1501 c <<= 6;
1502 c |= d & 0x3F;
1503 }
1504
1505 /* assertion: c is a single UTF-4 value */
1506 if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1507 if (out >= outend)
1508 break;
1509 *out++ = c;
1510 } else {
1511 htmlEntityDescPtr ent;
1512 const char *cp;
1513 char nbuf[16];
1514 int len;
1515
1516 /*
1517 * Try to lookup a predefined HTML entity for it
1518 */
1519 ent = htmlEntityValueLookup(c);
1520 if (ent == NULL) {
1521 sprintf(nbuf, "#%u", c);
1522 cp = nbuf;
1523 }
1524 else
1525 cp = ent->name;
1526 len = strlen(cp);
1527 if (out + 2 + len > outend)
1528 break;
1529 *out++ = '&';
1530 memcpy(out, cp, len);
1531 out += len;
1532 *out++ = ';';
1533 }
1534 processed = in;
1535 }
1536 *outlen = out - outstart;
1537 *inlen = processed - instart;
1538 return(0);
1539}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001540
1541/**
1542 * htmlDecodeEntities:
1543 * @ctxt: the parser context
1544 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001545 * @end: an end marker xmlChar, 0 if none
1546 * @end2: an end marker xmlChar, 0 if none
1547 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001548 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001549 * Subtitute the HTML entities by their value
1550 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001551 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001552 *
1553 * Returns A newly allocated string with the substitution done. The caller
1554 * must deallocate it !
1555 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001556xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001557htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001558 xmlChar end, xmlChar end2, xmlChar end3) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001559 xmlChar *name = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001560 xmlChar *buffer = NULL;
1561 unsigned int buffer_size = 0;
1562 unsigned int nbchars = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001563 htmlEntityDescPtr ent;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001564 unsigned int max = (unsigned int) len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001565 int c,l;
1566
1567 if (ctxt->depth > 40) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00001568 ctxt->errNo = XML_ERR_ENTITY_LOOP;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001569 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1570 ctxt->sax->error(ctxt->userData,
1571 "Detected entity reference loop\n");
1572 ctxt->wellFormed = 0;
1573 ctxt->disableSAX = 1;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001574 return(NULL);
1575 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001576
1577 /*
1578 * allocate a translation buffer.
1579 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001580 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001581 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001582 if (buffer == NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001583 perror("xmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001584 return(NULL);
1585 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001586
1587 /*
1588 * Ok loop until we reach one of the ending char or a size limit.
1589 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001590 c = CUR_CHAR(l);
1591 while ((nbchars < max) && (c != end) &&
1592 (c != end2) && (c != end3)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001593
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001594 if (c == 0) break;
1595 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1596 int val = htmlParseCharRef(ctxt);
1597 COPY_BUF(0,buffer,nbchars,val);
1598 NEXTL(l);
1599 } else if ((c == '&') && (ctxt->token != '&')) {
1600 ent = htmlParseEntityRef(ctxt, &name);
1601 if (name != NULL) {
1602 if (ent != NULL) {
1603 int val = ent->value;
1604 COPY_BUF(0,buffer,nbchars,val);
1605 NEXTL(l);
1606 } else {
1607 const xmlChar *cur = name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001608
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001609 buffer[nbchars++] = '&';
1610 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1611 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001612 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001613 while (*cur != 0) {
1614 buffer[nbchars++] = *cur++;
1615 }
1616 buffer[nbchars++] = ';';
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001617 }
1618 }
1619 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001620 COPY_BUF(l,buffer,nbchars,c);
1621 NEXTL(l);
1622 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001623 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001624 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001625 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001626 c = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001627 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001628 buffer[nbchars++] = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001629 return(buffer);
1630}
1631
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001632/************************************************************************
1633 * *
1634 * Commodity functions to handle streams *
1635 * *
1636 ************************************************************************/
1637
1638/**
1639 * htmlFreeInputStream:
1640 * @input: an htmlParserInputPtr
1641 *
1642 * Free up an input stream.
1643 */
1644void
1645htmlFreeInputStream(htmlParserInputPtr input) {
1646 if (input == NULL) return;
1647
1648 if (input->filename != NULL) xmlFree((char *) input->filename);
1649 if (input->directory != NULL) xmlFree((char *) input->directory);
1650 if ((input->free != NULL) && (input->base != NULL))
1651 input->free((xmlChar *) input->base);
1652 if (input->buf != NULL)
1653 xmlFreeParserInputBuffer(input->buf);
1654 memset(input, -1, sizeof(htmlParserInput));
1655 xmlFree(input);
1656}
1657
1658/**
1659 * htmlNewInputStream:
1660 * @ctxt: an HTML parser context
1661 *
1662 * Create a new input stream structure
1663 * Returns the new input stream or NULL
1664 */
1665htmlParserInputPtr
1666htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1667 htmlParserInputPtr input;
1668
1669 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1670 if (input == NULL) {
1671 ctxt->errNo = XML_ERR_NO_MEMORY;
1672 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1673 ctxt->sax->error(ctxt->userData,
1674 "malloc: couldn't allocate a new input stream\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001675 return(NULL);
1676 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001677 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001678 input->filename = NULL;
1679 input->directory = NULL;
1680 input->base = NULL;
1681 input->cur = NULL;
1682 input->buf = NULL;
1683 input->line = 1;
1684 input->col = 1;
1685 input->buf = NULL;
1686 input->free = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001687 input->version = NULL;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001688 input->consumed = 0;
1689 input->length = 0;
1690 return(input);
1691}
1692
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001693
1694/************************************************************************
1695 * *
1696 * Commodity functions, cleanup needed ? *
1697 * *
1698 ************************************************************************/
1699
1700/**
1701 * areBlanks:
1702 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001703 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001704 * @len: the size of @str
1705 *
1706 * Is this a sequence of blank chars that one can ignore ?
1707 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001708 * Returns 1 if ignorable 0 otherwise.
1709 */
1710
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001711static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001712 int i;
1713 xmlNodePtr lastChild;
1714
1715 for (i = 0;i < len;i++)
1716 if (!(IS_BLANK(str[i]))) return(0);
1717
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001718 if (CUR == 0) return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001719 if (CUR != '<') return(0);
Daniel Veillarde010c172000-08-28 10:04:51 +00001720 if (ctxt->name == NULL)
1721 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001722 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
Daniel Veillard4948eb42000-08-29 09:41:15 +00001723 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001724 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
Daniel Veillarde010c172000-08-28 10:04:51 +00001725 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001726 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
Daniel Veillarde010c172000-08-28 10:04:51 +00001727 return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001728 if (ctxt->node == NULL) return(0);
1729 lastChild = xmlGetLastChild(ctxt->node);
1730 if (lastChild == NULL) {
1731 if (ctxt->node->content != NULL) return(0);
Daniel Veillardc4f4f0b2000-10-29 17:46:30 +00001732 } else if (xmlNodeIsText(lastChild)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001733 return(0);
Daniel Veillardc4f4f0b2000-10-29 17:46:30 +00001734 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1735 return(0);
1736 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1737 return(0);
1738 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1739 return(0);
1740 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001741 return(1);
1742}
1743
1744/**
1745 * htmlHandleEntity:
1746 * @ctxt: an HTML parser context
1747 * @entity: an XML entity pointer.
1748 *
1749 * Default handling of an HTML entity, call the parser with the
1750 * substitution string
1751 */
1752
1753void
1754htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1755 int len;
1756
1757 if (entity->content == NULL) {
1758 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1759 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1760 entity->name);
1761 ctxt->wellFormed = 0;
1762 return;
1763 }
1764 len = xmlStrlen(entity->content);
1765
1766 /*
1767 * Just handle the content as a set of chars.
1768 */
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001769 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001770 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1771 ctxt->sax->characters(ctxt->userData, entity->content, len);
1772
1773}
1774
1775/**
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001776 * htmlNewDocNoDtD:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001777 * @URI: URI for the dtd, or NULL
1778 * @ExternalID: the external ID of the DTD, or NULL
1779 *
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001780 * Returns a new document, do not intialize the DTD if not provided
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001781 */
1782htmlDocPtr
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001783htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001784 xmlDocPtr cur;
1785
1786 /*
1787 * Allocate a new document and fill the fields.
1788 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001789 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001790 if (cur == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001791 xmlGenericError(xmlGenericErrorContext,
1792 "xmlNewDoc : malloc failed\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001793 return(NULL);
1794 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001795 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001796
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001797 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001798 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001799 cur->intSubset = NULL;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001800 if ((ExternalID != NULL) ||
1801 (URI != NULL))
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001802 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe803962000-06-28 23:40:59 +00001803 cur->doc = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001804 cur->name = NULL;
Daniel Veillardcf461992000-03-14 18:30:20 +00001805 cur->children = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001806 cur->extSubset = NULL;
1807 cur->oldNs = NULL;
1808 cur->encoding = NULL;
1809 cur->standalone = 1;
1810 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001811 cur->ids = NULL;
1812 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001813#ifndef XML_WITHOUT_CORBA
1814 cur->_private = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001815#endif
1816 return(cur);
1817}
1818
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001819/**
1820 * htmlNewDoc:
1821 * @URI: URI for the dtd, or NULL
1822 * @ExternalID: the external ID of the DTD, or NULL
1823 *
1824 * Returns a new document
1825 */
1826htmlDocPtr
1827htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1828 if ((URI == NULL) && (ExternalID == NULL))
1829 return(htmlNewDocNoDtD(
1830 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1831 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1832
1833 return(htmlNewDocNoDtD(URI, ExternalID));
1834}
1835
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001836
1837/************************************************************************
1838 * *
1839 * The parser itself *
1840 * Relates to http://www.w3.org/TR/html40 *
1841 * *
1842 ************************************************************************/
1843
1844/************************************************************************
1845 * *
1846 * The parser itself *
1847 * *
1848 ************************************************************************/
1849
1850/**
1851 * htmlParseHTMLName:
1852 * @ctxt: an HTML parser context
1853 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001854 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001855 * since HTML names are not case-sensitive.
1856 *
1857 * Returns the Tag Name parsed or NULL
1858 */
1859
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001860xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001861htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001862 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001863 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001864 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001865
1866 if (!IS_LETTER(CUR) && (CUR != '_') &&
1867 (CUR != ':')) return(NULL);
1868
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001869 while ((i < HTML_PARSER_BUFFER_SIZE) &&
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001870 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
Daniel Veillarde8282ed2000-10-10 23:01:31 +00001871 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001872 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001873 else loc[i] = CUR;
1874 i++;
1875
1876 NEXT;
1877 }
1878
1879 ret = xmlStrndup(loc, i);
1880
1881 return(ret);
1882}
1883
1884/**
1885 * htmlParseName:
1886 * @ctxt: an HTML parser context
1887 *
1888 * parse an HTML name, this routine is case sensistive.
1889 *
1890 * Returns the Name parsed or NULL
1891 */
1892
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001893xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001894htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001895 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001896 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001897
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001898 GROW;
1899 if (!IS_LETTER(CUR) && (CUR != '_')) {
1900 return(NULL);
1901 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001902
1903 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1904 (CUR == '.') || (CUR == '-') ||
1905 (CUR == '_') || (CUR == ':') ||
1906 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001907 (IS_EXTENDER(CUR))) {
1908 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001909 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001910 if (len >= HTML_MAX_NAMELEN) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001911 xmlGenericError(xmlGenericErrorContext,
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001912 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1913 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1914 (CUR == '.') || (CUR == '-') ||
1915 (CUR == '_') || (CUR == ':') ||
1916 (IS_COMBINING(CUR)) ||
1917 (IS_EXTENDER(CUR)))
1918 NEXT;
1919 break;
1920 }
1921 }
1922 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001923}
1924
1925/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001926 * htmlParseHTMLAttribute:
1927 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001928 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001929 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001930 * parse an HTML attribute value till the stop (quote), if
1931 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001932 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001933 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001934 */
1935
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001936xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001937htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001938 xmlChar *buffer = NULL;
1939 int buffer_size = 0;
1940 xmlChar *out = NULL;
1941 xmlChar *name = NULL;
1942
1943 xmlChar *cur = NULL;
1944 htmlEntityDescPtr ent;
1945
1946 /*
1947 * allocate a translation buffer.
1948 */
Daniel Veillard7eda8452000-10-14 23:38:43 +00001949 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard71b656e2000-01-05 14:46:17 +00001950 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1951 if (buffer == NULL) {
1952 perror("htmlParseHTMLAttribute: malloc failed");
1953 return(NULL);
1954 }
1955 out = buffer;
1956
1957 /*
1958 * Ok loop until we reach one of the ending chars
1959 */
1960 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1961 if ((stop == 0) && (IS_BLANK(CUR))) break;
1962 if (CUR == '&') {
1963 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001964 unsigned int c;
1965 int bits;
1966
1967 c = htmlParseCharRef(ctxt);
1968 if (c < 0x80)
1969 { *out++ = c; bits= -6; }
1970 else if (c < 0x800)
1971 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1972 else if (c < 0x10000)
1973 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1974 else
1975 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1976
1977 for ( ; bits >= 0; bits-= 6) {
1978 *out++ = ((c >> bits) & 0x3F) | 0x80;
1979 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001980 } else {
1981 ent = htmlParseEntityRef(ctxt, &name);
1982 if (name == NULL) {
1983 *out++ = '&';
1984 if (out - buffer > buffer_size - 100) {
1985 int index = out - buffer;
1986
1987 growBuffer(buffer);
1988 out = &buffer[index];
1989 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001990 } else if (ent == NULL) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001991 *out++ = '&';
1992 cur = name;
1993 while (*cur != 0) {
1994 if (out - buffer > buffer_size - 100) {
1995 int index = out - buffer;
1996
1997 growBuffer(buffer);
1998 out = &buffer[index];
1999 }
2000 *out++ = *cur++;
2001 }
2002 xmlFree(name);
2003 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002004 unsigned int c;
2005 int bits;
2006
Daniel Veillard71b656e2000-01-05 14:46:17 +00002007 if (out - buffer > buffer_size - 100) {
2008 int index = out - buffer;
2009
2010 growBuffer(buffer);
2011 out = &buffer[index];
2012 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002013 c = (xmlChar)ent->value;
2014 if (c < 0x80)
2015 { *out++ = c; bits= -6; }
2016 else if (c < 0x800)
2017 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2018 else if (c < 0x10000)
2019 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2020 else
2021 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2022
2023 for ( ; bits >= 0; bits-= 6) {
2024 *out++ = ((c >> bits) & 0x3F) | 0x80;
2025 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00002026 xmlFree(name);
2027 }
2028 }
2029 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002030 unsigned int c;
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00002031 int bits, l;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002032
Daniel Veillard71b656e2000-01-05 14:46:17 +00002033 if (out - buffer > buffer_size - 100) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002034 int index = out - buffer;
2035
2036 growBuffer(buffer);
2037 out = &buffer[index];
2038 }
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00002039 c = CUR_CHAR(l);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002040 if (c < 0x80)
2041 { *out++ = c; bits= -6; }
2042 else if (c < 0x800)
2043 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2044 else if (c < 0x10000)
2045 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2046 else
2047 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2048
2049 for ( ; bits >= 0; bits-= 6) {
2050 *out++ = ((c >> bits) & 0x3F) | 0x80;
Daniel Veillard71b656e2000-01-05 14:46:17 +00002051 }
2052 NEXT;
2053 }
2054 }
2055 *out++ = 0;
2056 return(buffer);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002057}
2058
2059/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002060 * htmlParseNmtoken:
2061 * @ctxt: an HTML parser context
2062 *
2063 * parse an HTML Nmtoken.
2064 *
2065 * Returns the Nmtoken parsed or NULL
2066 */
2067
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002068xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002069htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002070 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002071 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002072
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002073 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002074 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2075 (CUR == '.') || (CUR == '-') ||
2076 (CUR == '_') || (CUR == ':') ||
2077 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002078 (IS_EXTENDER(CUR))) {
2079 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002080 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002081 if (len >= HTML_MAX_NAMELEN) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002082 xmlGenericError(xmlGenericErrorContext,
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002083 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2084 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2085 (CUR == '.') || (CUR == '-') ||
2086 (CUR == '_') || (CUR == ':') ||
2087 (IS_COMBINING(CUR)) ||
2088 (IS_EXTENDER(CUR)))
2089 NEXT;
2090 break;
2091 }
2092 }
2093 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002094}
2095
2096/**
2097 * htmlParseEntityRef:
2098 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002099 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002100 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002101 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002102 *
2103 * [68] EntityRef ::= '&' Name ';'
2104 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002105 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2106 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002107 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002108htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002109htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2110 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002111 htmlEntityDescPtr ent = NULL;
2112 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002113
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002114 if (CUR == '&') {
2115 NEXT;
2116 name = htmlParseName(ctxt);
2117 if (name == NULL) {
2118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2119 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2120 ctxt->wellFormed = 0;
2121 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002122 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002123 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002124 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002125
2126 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002127 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002128 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002129 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002130 if (ent != NULL) /* OK that's ugly !!! */
2131 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002132 } else {
2133 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2134 ctxt->sax->error(ctxt->userData,
2135 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00002136 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002137 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002138 }
2139 }
2140 return(ent);
2141}
2142
2143/**
2144 * htmlParseAttValue:
2145 * @ctxt: an HTML parser context
2146 *
2147 * parse a value for an attribute
2148 * Note: the parser won't do substitution of entities here, this
2149 * will be handled later in xmlStringGetNodeList, unless it was
2150 * asked for ctxt->replaceEntities != 0
2151 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002152 * Returns the AttValue parsed or NULL.
2153 */
2154
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002155xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002156htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002157 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002158
2159 if (CUR == '"') {
2160 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002161 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002162 if (CUR != '"') {
2163 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2164 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2165 ctxt->wellFormed = 0;
2166 } else
2167 NEXT;
2168 } else if (CUR == '\'') {
2169 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002170 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002171 if (CUR != '\'') {
2172 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2173 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2174 ctxt->wellFormed = 0;
2175 } else
2176 NEXT;
2177 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002178 /*
2179 * That's an HTMLism, the attribute value may not be quoted
2180 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002181 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002182 if (ret == NULL) {
2183 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2184 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2185 ctxt->wellFormed = 0;
2186 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002187 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002188 return(ret);
2189}
2190
2191/**
2192 * htmlParseSystemLiteral:
2193 * @ctxt: an HTML parser context
2194 *
2195 * parse an HTML Literal
2196 *
2197 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2198 *
2199 * Returns the SystemLiteral parsed or NULL
2200 */
2201
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002202xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002203htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002204 const xmlChar *q;
2205 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002206
2207 if (CUR == '"') {
2208 NEXT;
2209 q = CUR_PTR;
2210 while ((IS_CHAR(CUR)) && (CUR != '"'))
2211 NEXT;
2212 if (!IS_CHAR(CUR)) {
2213 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2214 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2215 ctxt->wellFormed = 0;
2216 } else {
2217 ret = xmlStrndup(q, CUR_PTR - q);
2218 NEXT;
2219 }
2220 } else if (CUR == '\'') {
2221 NEXT;
2222 q = CUR_PTR;
2223 while ((IS_CHAR(CUR)) && (CUR != '\''))
2224 NEXT;
2225 if (!IS_CHAR(CUR)) {
2226 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2227 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2228 ctxt->wellFormed = 0;
2229 } else {
2230 ret = xmlStrndup(q, CUR_PTR - q);
2231 NEXT;
2232 }
2233 } else {
2234 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardcf461992000-03-14 18:30:20 +00002235 ctxt->sax->error(ctxt->userData,
2236 "SystemLiteral \" or ' expected\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002237 ctxt->wellFormed = 0;
2238 }
2239
2240 return(ret);
2241}
2242
2243/**
2244 * htmlParsePubidLiteral:
2245 * @ctxt: an HTML parser context
2246 *
2247 * parse an HTML public literal
2248 *
2249 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2250 *
2251 * Returns the PubidLiteral parsed or NULL.
2252 */
2253
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002254xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002255htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002256 const xmlChar *q;
2257 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002258 /*
2259 * Name ::= (Letter | '_') (NameChar)*
2260 */
2261 if (CUR == '"') {
2262 NEXT;
2263 q = CUR_PTR;
2264 while (IS_PUBIDCHAR(CUR)) NEXT;
2265 if (CUR != '"') {
2266 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2267 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2268 ctxt->wellFormed = 0;
2269 } else {
2270 ret = xmlStrndup(q, CUR_PTR - q);
2271 NEXT;
2272 }
2273 } else if (CUR == '\'') {
2274 NEXT;
2275 q = CUR_PTR;
2276 while ((IS_LETTER(CUR)) && (CUR != '\''))
2277 NEXT;
2278 if (!IS_LETTER(CUR)) {
2279 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2280 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2281 ctxt->wellFormed = 0;
2282 } else {
2283 ret = xmlStrndup(q, CUR_PTR - q);
2284 NEXT;
2285 }
2286 } else {
2287 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2288 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2289 ctxt->wellFormed = 0;
2290 }
2291
2292 return(ret);
2293}
2294
2295/**
Daniel Veillard7eda8452000-10-14 23:38:43 +00002296 * htmlParseScript:
2297 * @ctxt: an HTML parser context
2298 *
2299 * parse the content of an HTML SCRIPT or STYLE element
2300 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2301 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2302 * http://www.w3.org/TR/html4/types.html#type-script
2303 * http://www.w3.org/TR/html4/types.html#h-6.15
2304 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2305 *
2306 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2307 * element and the value of intrinsic event attributes. User agents must
2308 * not evaluate script data as HTML markup but instead must pass it on as
2309 * data to a script engine.
2310 * NOTES:
2311 * - The content is passed like CDATA
2312 * - the attributes for style and scripting "onXXX" are also described
2313 * as CDATA but SGML allows entities references in attributes so their
2314 * processing is identical as other attributes
2315 */
2316void
2317htmlParseScript(htmlParserCtxtPtr ctxt) {
2318 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2319 int nbchar = 0;
2320 xmlChar cur;
2321
2322 SHRINK;
2323 cur = CUR;
2324 while (IS_CHAR(cur)) {
2325 if ((cur == '<') && (NXT(1) == '/')) {
2326 /*
2327 * One should break here, the specification is clear:
2328 * Authors should therefore escape "</" within the content.
2329 * Escape mechanisms are specific to each scripting or
2330 * style sheet language.
2331 */
2332 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2333 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2334 break; /* while */
2335 }
2336 buf[nbchar++] = cur;
2337 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2338 if (ctxt->sax->cdataBlock!= NULL) {
2339 /*
2340 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2341 */
2342 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2343 }
2344 nbchar = 0;
2345 }
2346 NEXT;
2347 cur = CUR;
2348 }
Daniel Veillarda4964b72000-10-31 18:23:44 +00002349 if (!(IS_CHAR(cur))) {
2350 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2351 ctxt->sax->error(ctxt->userData,
2352 "Invalid char in CDATA 0x%X\n", cur);
2353 ctxt->wellFormed = 0;
2354 NEXT;
2355 }
2356
Daniel Veillard7eda8452000-10-14 23:38:43 +00002357 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2358 if (ctxt->sax->cdataBlock!= NULL) {
2359 /*
2360 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2361 */
2362 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2363 }
2364 }
2365}
2366
2367
2368/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002369 * htmlParseCharData:
2370 * @ctxt: an HTML parser context
2371 * @cdata: int indicating whether we are within a CDATA section
2372 *
2373 * parse a CharData section.
2374 * if we are within a CDATA section ']]>' marks an end of section.
2375 *
2376 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2377 */
2378
2379void
2380htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002381 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2382 int nbchar = 0;
2383 int cur, l;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002384
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002385 SHRINK;
2386 cur = CUR_CHAR(l);
2387 while (((cur != '<') || (ctxt->token == '<')) &&
2388 ((cur != '&') || (ctxt->token == '&')) &&
2389 (IS_CHAR(cur))) {
2390 COPY_BUF(l,buf,nbchar,cur);
2391 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2392 /*
2393 * Ok the segment is to be consumed as chars.
2394 */
2395 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2396 if (areBlanks(ctxt, buf, nbchar)) {
2397 if (ctxt->sax->ignorableWhitespace != NULL)
2398 ctxt->sax->ignorableWhitespace(ctxt->userData,
2399 buf, nbchar);
2400 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002401 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002402 if (ctxt->sax->characters != NULL)
2403 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2404 }
2405 }
2406 nbchar = 0;
2407 }
2408 NEXTL(l);
2409 cur = CUR_CHAR(l);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002410 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002411 if (nbchar != 0) {
2412 /*
2413 * Ok the segment is to be consumed as chars.
2414 */
2415 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2416 if (areBlanks(ctxt, buf, nbchar)) {
2417 if (ctxt->sax->ignorableWhitespace != NULL)
2418 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2419 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002420 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002421 if (ctxt->sax->characters != NULL)
2422 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002423 }
2424 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002425 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002426}
2427
2428/**
2429 * htmlParseExternalID:
2430 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002431 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002432 * @strict: indicate whether we should restrict parsing to only
2433 * production [75], see NOTE below
2434 *
2435 * Parse an External ID or a Public ID
2436 *
2437 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2438 * 'PUBLIC' S PubidLiteral S SystemLiteral
2439 *
2440 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2441 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2442 *
2443 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2444 *
2445 * Returns the function returns SystemLiteral and in the second
2446 * case publicID receives PubidLiteral, is strict is off
2447 * it is possible to return NULL and have publicID set.
2448 */
2449
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002450xmlChar *
2451htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2452 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002453
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002454 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2455 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2456 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002457 SKIP(6);
2458 if (!IS_BLANK(CUR)) {
2459 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2460 ctxt->sax->error(ctxt->userData,
2461 "Space required after 'SYSTEM'\n");
2462 ctxt->wellFormed = 0;
2463 }
2464 SKIP_BLANKS;
2465 URI = htmlParseSystemLiteral(ctxt);
2466 if (URI == NULL) {
2467 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2468 ctxt->sax->error(ctxt->userData,
2469 "htmlParseExternalID: SYSTEM, no URI\n");
2470 ctxt->wellFormed = 0;
2471 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002472 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2473 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2474 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002475 SKIP(6);
2476 if (!IS_BLANK(CUR)) {
2477 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2478 ctxt->sax->error(ctxt->userData,
2479 "Space required after 'PUBLIC'\n");
2480 ctxt->wellFormed = 0;
2481 }
2482 SKIP_BLANKS;
2483 *publicID = htmlParsePubidLiteral(ctxt);
2484 if (*publicID == NULL) {
2485 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2486 ctxt->sax->error(ctxt->userData,
2487 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2488 ctxt->wellFormed = 0;
2489 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002490 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002491 if ((CUR == '"') || (CUR == '\'')) {
2492 URI = htmlParseSystemLiteral(ctxt);
2493 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002494 }
2495 return(URI);
2496}
2497
2498/**
2499 * htmlParseComment:
2500 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002501 *
2502 * Parse an XML (SGML) comment <!-- .... -->
2503 *
2504 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2505 */
2506void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002507htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002508 xmlChar *buf = NULL;
Daniel Veillard87b95392000-08-12 21:12:04 +00002509 int len;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002510 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard87b95392000-08-12 21:12:04 +00002511 int q, ql;
2512 int r, rl;
2513 int cur, l;
2514 xmlParserInputState state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002515
2516 /*
2517 * Check that there is a comment right here.
2518 */
Daniel Veillard87b95392000-08-12 21:12:04 +00002519 if ((RAW != '<') || (NXT(1) != '!') ||
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002520 (NXT(2) != '-') || (NXT(3) != '-')) return;
2521
Daniel Veillard87b95392000-08-12 21:12:04 +00002522 state = ctxt->instate;
2523 ctxt->instate = XML_PARSER_COMMENT;
2524 SHRINK;
2525 SKIP(4);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002526 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2527 if (buf == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002528 xmlGenericError(xmlGenericErrorContext,
2529 "malloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002530 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002531 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002532 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002533 q = CUR_CHAR(ql);
2534 NEXTL(ql);
2535 r = CUR_CHAR(rl);
2536 NEXTL(rl);
2537 cur = CUR_CHAR(l);
2538 len = 0;
2539 while (IS_CHAR(cur) &&
2540 ((cur != '>') ||
2541 (r != '-') || (q != '-'))) {
2542 if (len + 5 >= size) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002543 size *= 2;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002544 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002545 if (buf == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002546 xmlGenericError(xmlGenericErrorContext,
2547 "realloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002548 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002549 return;
2550 }
2551 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002552 COPY_BUF(ql,buf,len,q);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002553 q = r;
Daniel Veillard87b95392000-08-12 21:12:04 +00002554 ql = rl;
2555 r = cur;
2556 rl = l;
2557 NEXTL(l);
2558 cur = CUR_CHAR(l);
2559 if (cur == 0) {
2560 SHRINK;
2561 GROW;
2562 cur = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002563 }
2564 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002565 buf[len] = 0;
2566 if (!IS_CHAR(cur)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00002567 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
Daniel Veillard87b95392000-08-12 21:12:04 +00002568 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2569 ctxt->sax->error(ctxt->userData,
2570 "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillard87b95392000-08-12 21:12:04 +00002571 ctxt->wellFormed = 0;
2572 xmlFree(buf);
2573 } else {
2574 NEXT;
2575 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2576 (!ctxt->disableSAX))
2577 ctxt->sax->comment(ctxt->userData, buf);
2578 xmlFree(buf);
2579 }
2580 ctxt->instate = state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002581}
2582
2583/**
2584 * htmlParseCharRef:
2585 * @ctxt: an HTML parser context
2586 *
2587 * parse Reference declarations
2588 *
2589 * [66] CharRef ::= '&#' [0-9]+ ';' |
2590 * '&#x' [0-9a-fA-F]+ ';'
2591 *
2592 * Returns the value parsed (as an int)
2593 */
2594int
2595htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2596 int val = 0;
2597
2598 if ((CUR == '&') && (NXT(1) == '#') &&
2599 (NXT(2) == 'x')) {
2600 SKIP(3);
2601 while (CUR != ';') {
2602 if ((CUR >= '0') && (CUR <= '9'))
2603 val = val * 16 + (CUR - '0');
2604 else if ((CUR >= 'a') && (CUR <= 'f'))
2605 val = val * 16 + (CUR - 'a') + 10;
2606 else if ((CUR >= 'A') && (CUR <= 'F'))
2607 val = val * 16 + (CUR - 'A') + 10;
2608 else {
2609 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2610 ctxt->sax->error(ctxt->userData,
2611 "htmlParseCharRef: invalid hexadecimal value\n");
2612 ctxt->wellFormed = 0;
Daniel Veillard748e45d2000-11-17 16:36:08 +00002613 return(0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002614 }
2615 NEXT;
2616 }
2617 if (CUR == ';')
2618 NEXT;
2619 } else if ((CUR == '&') && (NXT(1) == '#')) {
2620 SKIP(2);
2621 while (CUR != ';') {
2622 if ((CUR >= '0') && (CUR <= '9'))
2623 val = val * 10 + (CUR - '0');
2624 else {
2625 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2626 ctxt->sax->error(ctxt->userData,
2627 "htmlParseCharRef: invalid decimal value\n");
2628 ctxt->wellFormed = 0;
Daniel Veillard748e45d2000-11-17 16:36:08 +00002629 return(0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002630 }
2631 NEXT;
2632 }
2633 if (CUR == ';')
2634 NEXT;
2635 } else {
2636 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2637 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2638 ctxt->wellFormed = 0;
2639 }
2640 /*
2641 * Check the value IS_CHAR ...
2642 */
2643 if (IS_CHAR(val)) {
2644 return(val);
2645 } else {
2646 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002647 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002648 val);
2649 ctxt->wellFormed = 0;
2650 }
2651 return(0);
2652}
2653
2654
2655/**
2656 * htmlParseDocTypeDecl :
2657 * @ctxt: an HTML parser context
2658 *
2659 * parse a DOCTYPE declaration
2660 *
2661 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2662 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2663 */
2664
2665void
2666htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002667 xmlChar *name;
2668 xmlChar *ExternalID = NULL;
2669 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002670
2671 /*
2672 * We know that '<!DOCTYPE' has been detected.
2673 */
2674 SKIP(9);
2675
2676 SKIP_BLANKS;
2677
2678 /*
2679 * Parse the DOCTYPE name.
2680 */
2681 name = htmlParseName(ctxt);
2682 if (name == NULL) {
2683 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2684 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2685 ctxt->wellFormed = 0;
2686 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002687 /*
2688 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2689 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002690
2691 SKIP_BLANKS;
2692
2693 /*
2694 * Check for SystemID and ExternalID
2695 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002696 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002697 SKIP_BLANKS;
2698
2699 /*
2700 * We should be at the end of the DOCTYPE declaration.
2701 */
2702 if (CUR != '>') {
2703 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2704 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2705 ctxt->wellFormed = 0;
2706 /* We shouldn't try to resynchronize ... */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002707 }
2708 NEXT;
2709
2710 /*
Daniel Veillardd83eb822000-06-30 18:39:56 +00002711 * Create or update the document accordingly to the DOCTYPE
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002712 */
Daniel Veillardd83eb822000-06-30 18:39:56 +00002713 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2714 (!ctxt->disableSAX))
2715 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002716
2717 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002718 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002719 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002720 if (URI != NULL) xmlFree(URI);
2721 if (ExternalID != NULL) xmlFree(ExternalID);
2722 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002723}
2724
2725/**
2726 * htmlParseAttribute:
2727 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002728 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002729 *
2730 * parse an attribute
2731 *
2732 * [41] Attribute ::= Name Eq AttValue
2733 *
2734 * [25] Eq ::= S? '=' S?
2735 *
2736 * With namespace:
2737 *
2738 * [NS 11] Attribute ::= QName Eq AttValue
2739 *
2740 * Also the case QName == xmlns:??? is handled independently as a namespace
2741 * definition.
2742 *
2743 * Returns the attribute name, and the value in *value.
2744 */
2745
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002746xmlChar *
2747htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002748 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002749
2750 *value = NULL;
Daniel Veillard970112a2000-10-03 09:33:21 +00002751 name = htmlParseHTMLName(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002752 if (name == NULL) {
2753 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2754 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2755 ctxt->wellFormed = 0;
2756 return(NULL);
2757 }
2758
2759 /*
2760 * read the value
2761 */
2762 SKIP_BLANKS;
2763 if (CUR == '=') {
2764 NEXT;
2765 SKIP_BLANKS;
2766 val = htmlParseAttValue(ctxt);
Daniel Veillardbe803962000-06-28 23:40:59 +00002767 /******
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002768 } else {
Daniel Veillardbe803962000-06-28 23:40:59 +00002769 * TODO : some attribute must have values, some may not
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002770 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002771 ctxt->sax->warning(ctxt->userData,
Daniel Veillardbe803962000-06-28 23:40:59 +00002772 "No value for attribute %s\n", name); */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002773 }
2774
2775 *value = val;
2776 return(name);
2777}
2778
2779/**
Daniel Veillard365e13b2000-07-02 07:56:37 +00002780 * htmlCheckEncoding:
2781 * @ctxt: an HTML parser context
2782 * @attvalue: the attribute value
2783 *
2784 * Checks an http-equiv attribute from a Meta tag to detect
2785 * the encoding
2786 * If a new encoding is detected the parser is switched to decode
2787 * it and pass UTF8
2788 */
2789void
2790htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2791 const xmlChar *encoding;
2792
2793 if ((ctxt == NULL) || (attvalue == NULL))
2794 return;
2795
Daniel Veillarda6d8eb62000-12-27 10:46:47 +00002796 /* do not change encoding */
2797 if (ctxt->input->encoding != NULL)
2798 return;
2799
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002800 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
Daniel Veillard365e13b2000-07-02 07:56:37 +00002801 if (encoding != NULL) {
2802 encoding += 8;
2803 } else {
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002804 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
Daniel Veillard365e13b2000-07-02 07:56:37 +00002805 if (encoding != NULL)
2806 encoding += 9;
2807 }
2808 if (encoding != NULL) {
2809 xmlCharEncoding enc;
2810 xmlCharEncodingHandlerPtr handler;
2811
2812 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2813
2814 if (ctxt->input->encoding != NULL)
2815 xmlFree((xmlChar *) ctxt->input->encoding);
2816 ctxt->input->encoding = xmlStrdup(encoding);
2817
2818 enc = xmlParseCharEncoding((const char *) encoding);
2819 /*
2820 * registered set of known encodings
2821 */
2822 if (enc != XML_CHAR_ENCODING_ERROR) {
2823 xmlSwitchEncoding(ctxt, enc);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002824 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002825 } else {
2826 /*
2827 * fallback for unknown encodings
2828 */
2829 handler = xmlFindCharEncodingHandler((const char *) encoding);
2830 if (handler != NULL) {
2831 xmlSwitchToEncoding(ctxt, handler);
Daniel Veillard87b95392000-08-12 21:12:04 +00002832 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002833 } else {
2834 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2835 }
2836 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002837
2838 if ((ctxt->input->buf != NULL) &&
2839 (ctxt->input->buf->encoder != NULL) &&
2840 (ctxt->input->buf->raw != NULL) &&
2841 (ctxt->input->buf->buffer != NULL)) {
2842 int nbchars;
2843 int processed;
2844
2845 /*
2846 * convert as much as possible to the parser reading buffer.
2847 */
2848 processed = ctxt->input->cur - ctxt->input->base;
2849 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2850 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2851 ctxt->input->buf->buffer,
2852 ctxt->input->buf->raw);
2853 if (nbchars < 0) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00002854 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard87b95392000-08-12 21:12:04 +00002855 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2856 ctxt->sax->error(ctxt->userData,
2857 "htmlCheckEncoding: encoder error\n");
Daniel Veillard87b95392000-08-12 21:12:04 +00002858 }
2859 ctxt->input->base =
2860 ctxt->input->cur = ctxt->input->buf->buffer->content;
2861 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002862 }
2863}
2864
2865/**
2866 * htmlCheckMeta:
2867 * @ctxt: an HTML parser context
2868 * @atts: the attributes values
2869 *
2870 * Checks an attributes from a Meta tag
2871 */
2872void
2873htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2874 int i;
2875 const xmlChar *att, *value;
2876 int http = 0;
2877 const xmlChar *content = NULL;
2878
2879 if ((ctxt == NULL) || (atts == NULL))
2880 return;
2881
2882 i = 0;
2883 att = atts[i++];
2884 while (att != NULL) {
2885 value = atts[i++];
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002886 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2887 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002888 http = 1;
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002889 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002890 content = value;
2891 att = atts[i++];
2892 }
2893 if ((http) && (content != NULL))
2894 htmlCheckEncoding(ctxt, content);
2895
2896}
2897
2898/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002899 * htmlParseStartTag:
2900 * @ctxt: an HTML parser context
2901 *
2902 * parse a start of tag either for rule element or
2903 * EmptyElement. In both case we don't parse the tag closing chars.
2904 *
2905 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2906 *
2907 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2908 *
2909 * With namespace:
2910 *
2911 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2912 *
2913 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2914 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002915 */
2916
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002917void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002918htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002919 xmlChar *name;
2920 xmlChar *attname;
2921 xmlChar *attvalue;
2922 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002923 int nbatts = 0;
2924 int maxatts = 0;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002925 int meta = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002926 int i;
2927
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002928 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002929 NEXT;
2930
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002931 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002932 name = htmlParseHTMLName(ctxt);
2933 if (name == NULL) {
2934 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2935 ctxt->sax->error(ctxt->userData,
2936 "htmlParseStartTag: invalid element name\n");
2937 ctxt->wellFormed = 0;
Daniel Veillard126f2792000-10-24 17:10:12 +00002938 /* Dump the bogus tag like browsers do */
2939 while ((IS_CHAR(CUR)) && (CUR != '>'))
2940 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002941 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002942 }
Daniel Veillard8b5dd832000-10-01 20:28:44 +00002943 if (xmlStrEqual(name, BAD_CAST"meta"))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002944 meta = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002945
2946 /*
2947 * Check for auto-closure of HTML elements.
2948 */
2949 htmlAutoClose(ctxt, name);
2950
2951 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00002952 * Check for implied HTML elements.
2953 */
2954 htmlCheckImplied(ctxt, name);
2955
2956 /*
Daniel Veillardf62ceff2000-11-24 23:36:01 +00002957 * Avoid html at any level > 0, head at any level != 1
2958 * or any attempt to recurse body
2959 */
2960 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2961 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2962 ctxt->sax->error(ctxt->userData,
2963 "htmlParseStartTag: misplaced <html> tag\n");
2964 ctxt->wellFormed = 0;
2965 xmlFree(name);
2966 return;
2967 }
2968 if ((ctxt->nameNr != 1) &&
2969 (xmlStrEqual(name, BAD_CAST"head"))) {
2970 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2971 ctxt->sax->error(ctxt->userData,
2972 "htmlParseStartTag: misplaced <head> tag\n");
2973 ctxt->wellFormed = 0;
2974 xmlFree(name);
2975 return;
2976 }
2977 if (xmlStrEqual(name, BAD_CAST"body")) {
2978 int i;
2979 for (i = 0;i < ctxt->nameNr;i++) {
2980 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
2981 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2982 ctxt->sax->error(ctxt->userData,
2983 "htmlParseStartTag: misplaced <body> tag\n");
2984 ctxt->wellFormed = 0;
2985 xmlFree(name);
2986 return;
2987 }
2988 }
2989 }
2990
2991 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002992 * Now parse the attributes, it ends up with the ending
2993 *
2994 * (S Attribute)* S?
2995 */
2996 SKIP_BLANKS;
2997 while ((IS_CHAR(CUR)) &&
2998 (CUR != '>') &&
2999 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003000 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003001
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003002 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003003 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003004 if (attname != NULL) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00003005
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003006 /*
3007 * Well formedness requires at most one declaration of an attribute
3008 */
3009 for (i = 0; i < nbatts;i += 2) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003010 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003011 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003012 ctxt->sax->error(ctxt->userData,
3013 "Attribute %s redefined\n",
3014 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003015 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00003016 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003017 if (attvalue != NULL)
3018 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003019 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003020 }
3021 }
3022
3023 /*
3024 * Add the pair to atts
3025 */
3026 if (atts == NULL) {
3027 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003028 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003029 if (atts == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003030 xmlGenericError(xmlGenericErrorContext,
3031 "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003032 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003033 if (name != NULL) xmlFree(name);
3034 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003035 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00003036 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003037 maxatts *= 2;
Daniel Veillard4b0755c2000-09-25 14:26:28 +00003038 atts = (const xmlChar **) xmlRealloc((void *) atts,
3039 maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003040 if (atts == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003041 xmlGenericError(xmlGenericErrorContext,
3042 "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003043 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003044 if (name != NULL) xmlFree(name);
3045 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003046 }
3047 }
3048 atts[nbatts++] = attname;
3049 atts[nbatts++] = attvalue;
3050 atts[nbatts] = NULL;
3051 atts[nbatts + 1] = NULL;
3052 }
Daniel Veillard126f2792000-10-24 17:10:12 +00003053 else {
3054 /* Dump the bogus attribute string up to the next blank or
3055 * the end of the tag. */
3056 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3057 && ((CUR != '/') || (NXT(1) != '>')))
3058 NEXT;
3059 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003060
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003061failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003062 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003063 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003064 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3065 ctxt->sax->error(ctxt->userData,
3066 "htmlParseStartTag: problem parsing attributes\n");
3067 ctxt->wellFormed = 0;
3068 break;
3069 }
3070 }
3071
3072 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003073 * Handle specific association to the META tag
3074 */
3075 if (meta)
3076 htmlCheckMeta(ctxt, atts);
3077
3078 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003079 * SAX: Start of Element !
3080 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003081 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003082#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003083 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003084#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003085 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3086 ctxt->sax->startElement(ctxt->userData, name, atts);
3087
3088 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003089 for (i = 0;i < nbatts;i++) {
3090 if (atts[i] != NULL)
3091 xmlFree((xmlChar *) atts[i]);
3092 }
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00003093 xmlFree((void *) atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003094 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003095 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003096}
3097
3098/**
3099 * htmlParseEndTag:
3100 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003101 *
3102 * parse an end of tag
3103 *
3104 * [42] ETag ::= '</' Name S? '>'
3105 *
3106 * With namespace
3107 *
3108 * [NS 9] ETag ::= '</' QName S? '>'
3109 */
3110
3111void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003112htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003113 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003114 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003115 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003116
3117 if ((CUR != '<') || (NXT(1) != '/')) {
3118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3119 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3120 ctxt->wellFormed = 0;
3121 return;
3122 }
3123 SKIP(2);
3124
3125 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003126 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003127
3128 /*
3129 * We should definitely be at the ending "S? '>'" part
3130 */
3131 SKIP_BLANKS;
3132 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3133 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3134 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3135 ctxt->wellFormed = 0;
3136 } else
3137 NEXT;
3138
3139 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003140 * If the name read is not one of the element in the parsing stack
3141 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003142 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003143 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003144 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003145 }
3146 if (i < 0) {
3147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003148 ctxt->sax->error(ctxt->userData,
3149 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003150 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003151 ctxt->wellFormed = 0;
3152 return;
3153 }
3154
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003155
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003156 /*
3157 * Check for auto-closure of HTML elements.
3158 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003159
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003160 htmlAutoCloseOnClose(ctxt, name);
3161
3162 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003163 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003164 * With the exception that the autoclose may have popped stuff out
3165 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003166 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003167 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003168#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003169 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003170#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003171 if ((ctxt->name != NULL) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003172 (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003173 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3174 ctxt->sax->error(ctxt->userData,
3175 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003176 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003177 ctxt->wellFormed = 0;
3178 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003179 }
3180
3181 /*
3182 * SAX: End of Tag
3183 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003184 oldname = ctxt->name;
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003185 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003186 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3187 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003188 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003189 if (oldname != NULL) {
3190#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003191 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003192#endif
3193 xmlFree(oldname);
3194#ifdef DEBUG
3195 } else {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003196 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003197#endif
3198 }
3199 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003200
3201 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00003202 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003203
3204 return;
3205}
3206
3207
3208/**
3209 * htmlParseReference:
3210 * @ctxt: an HTML parser context
3211 *
3212 * parse and handle entity references in content,
3213 * this will end-up in a call to character() since this is either a
3214 * CharRef, or a predefined entity.
3215 */
3216void
3217htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003218 htmlEntityDescPtr ent;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003219 xmlChar out[6];
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003220 xmlChar *name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003221 if (CUR != '&') return;
3222
3223 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003224 unsigned int c;
3225 int bits, i = 0;
3226
3227 c = htmlParseCharRef(ctxt);
Daniel Veillard748e45d2000-11-17 16:36:08 +00003228 if (c == 0)
3229 return;
3230
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003231 if (c < 0x80) { out[i++]= c; bits= -6; }
3232 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3233 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3234 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3235
3236 for ( ; bits >= 0; bits-= 6) {
3237 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3238 }
3239 out[i] = 0;
3240
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003241 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003242 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003243 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003244 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003245 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003246 if (name == NULL) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003247 htmlCheckParagraph(ctxt);
Daniel Veillard1255ab72000-08-14 15:13:33 +00003248 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3249 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003250 return;
3251 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003252 if ((ent == NULL) || (ent->value <= 0)) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003253 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003254 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00003255 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003256 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00003257 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003258 }
3259 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003260 unsigned int c;
3261 int bits, i = 0;
3262
3263 c = ent->value;
3264 if (c < 0x80)
3265 { out[i++]= c; bits= -6; }
3266 else if (c < 0x800)
3267 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3268 else if (c < 0x10000)
3269 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3270 else
3271 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3272
3273 for ( ; bits >= 0; bits-= 6) {
3274 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3275 }
3276 out[i] = 0;
3277
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003278 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003279 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003280 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003281 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00003282 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003283 }
3284}
3285
3286/**
3287 * htmlParseContent:
3288 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003289 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003290 *
3291 * Parse a content: comment, sub-element, reference or text.
3292 *
3293 */
3294
3295void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003296htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003297 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003298 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003299
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003300 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003301 depth = ctxt->nameNr;
3302 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003303 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003304
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003305 GROW;
3306 /*
3307 * Our tag or one of it's parent or children is ending.
3308 */
3309 if ((CUR == '<') && (NXT(1) == '/')) {
3310 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003311 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003312 return;
3313 }
3314
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003315 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003316 * Has this node been popped out during parsing of
3317 * the next element
3318 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003319 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003320 (depth >= ctxt->nameNr)) {
3321 if (currentNode != NULL) xmlFree(currentNode);
3322 return;
3323 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003324
Daniel Veillard7eda8452000-10-14 23:38:43 +00003325 if ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3326 (xmlStrEqual(currentNode, BAD_CAST"style"))) {
3327 /*
3328 * Handle SCRIPT/STYLE separately
3329 */
3330 htmlParseScript(ctxt);
3331 } else {
3332 /*
3333 * Sometimes DOCTYPE arrives in the middle of the document
3334 */
3335 if ((CUR == '<') && (NXT(1) == '!') &&
3336 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3337 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3338 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3339 (UPP(8) == 'E')) {
Daniel Veillard35008381999-10-25 13:15:52 +00003340 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3341 ctxt->sax->error(ctxt->userData,
Daniel Veillard7eda8452000-10-14 23:38:43 +00003342 "Misplaced DOCTYPE declaration\n");
Daniel Veillard35008381999-10-25 13:15:52 +00003343 ctxt->wellFormed = 0;
Daniel Veillard7eda8452000-10-14 23:38:43 +00003344 htmlParseDocTypeDecl(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003345 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003346
Daniel Veillard7eda8452000-10-14 23:38:43 +00003347 /*
3348 * First case : a comment
3349 */
3350 if ((CUR == '<') && (NXT(1) == '!') &&
3351 (NXT(2) == '-') && (NXT(3) == '-')) {
3352 htmlParseComment(ctxt);
3353 }
3354
3355 /*
3356 * Second case : a sub-element.
3357 */
3358 else if (CUR == '<') {
3359 htmlParseElement(ctxt);
3360 }
3361
3362 /*
3363 * Third case : a reference. If if has not been resolved,
3364 * parsing returns it's Name, create the node
3365 */
3366 else if (CUR == '&') {
3367 htmlParseReference(ctxt);
3368 }
3369
3370 /*
3371 * Fourth : end of the resource
3372 */
3373 else if (CUR == 0) {
3374 htmlAutoClose(ctxt, NULL);
3375 }
3376
3377 /*
3378 * Last case, text. Note that References are handled directly.
3379 */
3380 else {
3381 htmlParseCharData(ctxt, 0);
3382 }
3383
3384 if (cons == ctxt->nbChars) {
3385 if (ctxt->node != NULL) {
3386 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3387 ctxt->sax->error(ctxt->userData,
3388 "detected an error in element content\n");
3389 ctxt->wellFormed = 0;
3390 }
3391 break;
3392 }
3393 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003394 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003395 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003396 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003397}
3398
3399/**
3400 * htmlParseElement:
3401 * @ctxt: an HTML parser context
3402 *
3403 * parse an HTML element, this is highly recursive
3404 *
3405 * [39] element ::= EmptyElemTag | STag content ETag
3406 *
3407 * [41] Attribute ::= Name Eq AttValue
3408 */
3409
3410void
3411htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003412 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00003413 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003414 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003415 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003416 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003417 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003418
3419 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003420 if (ctxt->record_info) {
3421 node_info.begin_pos = ctxt->input->consumed +
3422 (CUR_PTR - ctxt->input->base);
3423 node_info.begin_line = ctxt->input->line;
3424 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003425
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003426 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003427 htmlParseStartTag(ctxt);
3428 name = ctxt->name;
3429#ifdef DEBUG
3430 if (oldname == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003431 xmlGenericError(xmlGenericErrorContext,
3432 "Start of element %s\n", name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003433 else if (name == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003434 xmlGenericError(xmlGenericErrorContext,
3435 "Start of element failed, was %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003436 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003437 xmlGenericError(xmlGenericErrorContext,
3438 "Start of element %s, was %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003439#endif
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003440 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003441 (name == NULL)) {
3442 if (CUR == '>')
3443 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003444 if (oldname != NULL)
3445 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003446 return;
3447 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003448 if (oldname != NULL)
3449 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003450
3451 /*
3452 * Lookup the info for that element.
3453 */
3454 info = htmlTagLookup(name);
3455 if (info == NULL) {
3456 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3457 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3458 name);
3459 ctxt->wellFormed = 0;
3460 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003461/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003462 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3463 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3464 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003465 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003466 }
3467
3468 /*
3469 * Check for an Empty Element labelled the XML/SGML way
3470 */
3471 if ((CUR == '/') && (NXT(1) == '>')) {
3472 SKIP(2);
3473 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3474 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003475 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003476#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003477 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003478#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003479 if (oldname != NULL)
3480 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003481 return;
3482 }
3483
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003484 if (CUR == '>') {
3485 NEXT;
3486 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003487 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard87b95392000-08-12 21:12:04 +00003488 ctxt->sax->error(ctxt->userData,
3489 "Couldn't find end of Start Tag %s\n",
3490 name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003491 ctxt->wellFormed = 0;
3492
3493 /*
3494 * end of parsing of this node.
3495 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003496 if (xmlStrEqual(name, ctxt->name)) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003497 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003498 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003499#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003500 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003501#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003502 if (oldname != NULL)
3503 xmlFree(oldname);
3504 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003505
3506 /*
3507 * Capture end position and add node
3508 */
3509 if ( currentNode != NULL && ctxt->record_info ) {
3510 node_info.end_pos = ctxt->input->consumed +
3511 (CUR_PTR - ctxt->input->base);
3512 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003513 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003514 xmlParserAddNodeInfo(ctxt, &node_info);
3515 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003516 return;
3517 }
3518
3519 /*
3520 * Check for an Empty Element from DTD definition
3521 */
3522 if ((info != NULL) && (info->empty)) {
3523 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3524 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003525 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003526#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003527 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003528#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003529 if (oldname != NULL)
3530 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003531 return;
3532 }
3533
3534 /*
3535 * Parse the content of the element:
3536 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003537 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003538 depth = ctxt->nameNr;
3539 while (IS_CHAR(CUR)) {
3540 htmlParseContent(ctxt);
3541 if (ctxt->nameNr < depth) break;
3542 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003543
3544 if (!IS_CHAR(CUR)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003545 /************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003546 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3547 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003548 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003549 ctxt->wellFormed = 0;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003550 *************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003551
3552 /*
3553 * end of parsing of this node.
3554 */
3555 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003556 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003557#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003558 xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003559#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003560 if (oldname != NULL)
3561 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003562 if (currentNode != NULL)
3563 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003564 return;
3565 }
3566
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003567 /*
3568 * Capture end position and add node
3569 */
3570 if ( currentNode != NULL && ctxt->record_info ) {
3571 node_info.end_pos = ctxt->input->consumed +
3572 (CUR_PTR - ctxt->input->base);
3573 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003574 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003575 xmlParserAddNodeInfo(ctxt, &node_info);
3576 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003577 if (currentNode != NULL)
3578 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003579}
3580
3581/**
3582 * htmlParseDocument :
3583 * @ctxt: an HTML parser context
3584 *
3585 * parse an HTML document (and build a tree if using the standard SAX
3586 * interface).
3587 *
3588 * Returns 0, -1 in case of error. the parser context is augmented
3589 * as a result of the parsing.
3590 */
3591
3592int
3593htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003594 xmlDtdPtr dtd;
3595
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003596 htmlDefaultSAXHandlerInit();
3597 ctxt->html = 1;
3598
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003599 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003600 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00003601 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003602 */
3603 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3604 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3605
3606 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003607 * Wipe out everything which is before the first '<'
3608 */
Daniel Veillard35008381999-10-25 13:15:52 +00003609 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003610 if (CUR == 0) {
3611 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3612 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3613 ctxt->wellFormed = 0;
3614 }
3615
Daniel Veillardbe803962000-06-28 23:40:59 +00003616 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3617 ctxt->sax->startDocument(ctxt->userData);
3618
3619
Daniel Veillard35008381999-10-25 13:15:52 +00003620 /*
3621 * Parse possible comments before any content
3622 */
3623 while ((CUR == '<') && (NXT(1) == '!') &&
3624 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003625 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003626 SKIP_BLANKS;
3627 }
3628
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003629
3630 /*
3631 * Then possibly doc type declaration(s) and more Misc
3632 * (doctypedecl Misc*)?
3633 */
3634 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003635 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3636 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3637 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3638 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003639 htmlParseDocTypeDecl(ctxt);
3640 }
3641 SKIP_BLANKS;
3642
3643 /*
Daniel Veillard87b95392000-08-12 21:12:04 +00003644 * Parse possible comments before any content
3645 */
3646 while ((CUR == '<') && (NXT(1) == '!') &&
3647 (NXT(2) == '-') && (NXT(3) == '-')) {
3648 htmlParseComment(ctxt);
3649 SKIP_BLANKS;
3650 }
3651
3652 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003653 * Time to start parsing the tree itself
3654 */
Daniel Veillard35008381999-10-25 13:15:52 +00003655 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003656
3657 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003658 * autoclose
3659 */
3660 if (CUR == 0)
3661 htmlAutoClose(ctxt, NULL);
3662
3663
3664 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003665 * SAX: end of the document processing.
3666 */
3667 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3668 ctxt->sax->endDocument(ctxt->userData);
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003669
3670 if (ctxt->myDoc != NULL) {
3671 dtd = xmlGetIntSubset(ctxt->myDoc);
3672 if (dtd == NULL)
3673 ctxt->myDoc->intSubset =
3674 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3675 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3676 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3677 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003678 if (! ctxt->wellFormed) return(-1);
3679 return(0);
3680}
3681
3682
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003683/************************************************************************
3684 * *
3685 * Parser contexts handling *
3686 * *
3687 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003688
3689/**
3690 * xmlInitParserCtxt:
3691 * @ctxt: an HTML parser context
3692 *
3693 * Initialize a parser context
3694 */
3695
3696void
3697htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3698{
3699 htmlSAXHandler *sax;
3700
Daniel Veillard35008381999-10-25 13:15:52 +00003701 if (ctxt == NULL) return;
3702 memset(ctxt, 0, sizeof(htmlParserCtxt));
3703
Daniel Veillard6454aec1999-09-02 22:04:43 +00003704 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003705 if (sax == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003706 xmlGenericError(xmlGenericErrorContext,
3707 "htmlInitParserCtxt: out of memory\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003708 }
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00003709 else
3710 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003711
3712 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003713 ctxt->inputTab = (htmlParserInputPtr *)
3714 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3715 if (ctxt->inputTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003716 xmlGenericError(xmlGenericErrorContext,
3717 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003718 ctxt->inputNr = 0;
3719 ctxt->inputMax = 0;
3720 ctxt->input = NULL;
3721 return;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003722 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003723 ctxt->inputNr = 0;
3724 ctxt->inputMax = 5;
3725 ctxt->input = NULL;
3726 ctxt->version = NULL;
3727 ctxt->encoding = NULL;
3728 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003729 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003730
3731 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00003732 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillard39c7d712000-09-10 16:14:55 +00003733 if (ctxt->nodeTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003734 xmlGenericError(xmlGenericErrorContext,
3735 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003736 ctxt->nodeNr = 0;
3737 ctxt->nodeMax = 0;
3738 ctxt->node = NULL;
3739 ctxt->inputNr = 0;
3740 ctxt->inputMax = 0;
3741 ctxt->input = NULL;
3742 return;
3743 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003744 ctxt->nodeNr = 0;
3745 ctxt->nodeMax = 10;
3746 ctxt->node = NULL;
3747
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003748 /* Allocate the Name stack */
3749 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Daniel Veillard39c7d712000-09-10 16:14:55 +00003750 if (ctxt->nameTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003751 xmlGenericError(xmlGenericErrorContext,
3752 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003753 ctxt->nameNr = 0;
3754 ctxt->nameMax = 10;
3755 ctxt->name = NULL;
3756 ctxt->nodeNr = 0;
3757 ctxt->nodeMax = 0;
3758 ctxt->node = NULL;
3759 ctxt->inputNr = 0;
3760 ctxt->inputMax = 0;
3761 ctxt->input = NULL;
3762 return;
3763 }
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003764 ctxt->nameNr = 0;
3765 ctxt->nameMax = 10;
3766 ctxt->name = NULL;
3767
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003768 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3769 else {
3770 ctxt->sax = sax;
3771 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3772 }
3773 ctxt->userData = ctxt;
3774 ctxt->myDoc = NULL;
3775 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003776 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003777 ctxt->html = 1;
3778 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00003779 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003780 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003781 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003782 xmlInitNodeInfoSeq(&ctxt->node_seq);
3783}
3784
3785/**
3786 * htmlFreeParserCtxt:
3787 * @ctxt: an HTML parser context
3788 *
3789 * Free all the memory used by a parser context. However the parsed
3790 * document in ctxt->myDoc is not freed.
3791 */
3792
3793void
3794htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3795{
Daniel Veillard365e13b2000-07-02 07:56:37 +00003796 xmlFreeParserCtxt(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003797}
3798
3799/**
3800 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003801 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003802 * @encoding: a free form C string describing the HTML document encoding, or NULL
3803 *
3804 * Create a parser context for an HTML document.
3805 *
3806 * Returns the new parser context or NULL
3807 */
3808htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003809htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003810 htmlParserCtxtPtr ctxt;
3811 htmlParserInputPtr input;
3812 /* htmlCharEncoding enc; */
3813
Daniel Veillard6454aec1999-09-02 22:04:43 +00003814 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003815 if (ctxt == NULL) {
3816 perror("malloc");
3817 return(NULL);
3818 }
3819 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003820 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003821 if (input == NULL) {
3822 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003823 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003824 return(NULL);
3825 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003826 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003827
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003828 input->line = 1;
3829 input->col = 1;
3830 input->base = cur;
3831 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003832
3833 inputPush(ctxt, input);
3834 return(ctxt);
3835}
3836
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003837/************************************************************************
3838 * *
3839 * Progressive parsing interfaces *
3840 * *
3841 ************************************************************************/
3842
3843/**
3844 * htmlParseLookupSequence:
3845 * @ctxt: an HTML parser context
3846 * @first: the first char to lookup
3847 * @next: the next char to lookup or zero
3848 * @third: the next char to lookup or zero
3849 *
3850 * Try to find if a sequence (first, next, third) or just (first next) or
3851 * (first) is available in the input stream.
3852 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3853 * to avoid rescanning sequences of bytes, it DOES change the state of the
3854 * parser, do not use liberally.
3855 * This is basically similar to xmlParseLookupSequence()
3856 *
3857 * Returns the index to the current parsing point if the full sequence
3858 * is available, -1 otherwise.
3859 */
3860int
3861htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3862 xmlChar next, xmlChar third) {
3863 int base, len;
3864 htmlParserInputPtr in;
3865 const xmlChar *buf;
3866
3867 in = ctxt->input;
3868 if (in == NULL) return(-1);
3869 base = in->cur - in->base;
3870 if (base < 0) return(-1);
3871 if (ctxt->checkIndex > base)
3872 base = ctxt->checkIndex;
3873 if (in->buf == NULL) {
3874 buf = in->base;
3875 len = in->length;
3876 } else {
3877 buf = in->buf->buffer->content;
3878 len = in->buf->buffer->use;
3879 }
3880 /* take into account the sequence length */
3881 if (third) len -= 2;
3882 else if (next) len --;
3883 for (;base < len;base++) {
3884 if (buf[base] == first) {
3885 if (third != 0) {
3886 if ((buf[base + 1] != next) ||
3887 (buf[base + 2] != third)) continue;
3888 } else if (next != 0) {
3889 if (buf[base + 1] != next) continue;
3890 }
3891 ctxt->checkIndex = 0;
3892#ifdef DEBUG_PUSH
3893 if (next == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003894 xmlGenericError(xmlGenericErrorContext,
3895 "HPP: lookup '%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003896 first, base);
3897 else if (third == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003898 xmlGenericError(xmlGenericErrorContext,
3899 "HPP: lookup '%c%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003900 first, next, base);
3901 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003902 xmlGenericError(xmlGenericErrorContext,
3903 "HPP: lookup '%c%c%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003904 first, next, third, base);
3905#endif
3906 return(base - (in->cur - in->base));
3907 }
3908 }
3909 ctxt->checkIndex = base;
3910#ifdef DEBUG_PUSH
3911 if (next == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003912 xmlGenericError(xmlGenericErrorContext,
3913 "HPP: lookup '%c' failed\n", first);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003914 else if (third == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003915 xmlGenericError(xmlGenericErrorContext,
3916 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003917 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003918 xmlGenericError(xmlGenericErrorContext,
3919 "HPP: lookup '%c%c%c' failed\n", first, next, third);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003920#endif
3921 return(-1);
3922}
3923
3924/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003925 * htmlParseTryOrFinish:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003926 * @ctxt: an HTML parser context
Daniel Veillard71b656e2000-01-05 14:46:17 +00003927 * @terminate: last chunk indicator
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003928 *
3929 * Try to progress on parsing
3930 *
3931 * Returns zero if no parsing was possible
3932 */
3933int
Daniel Veillard71b656e2000-01-05 14:46:17 +00003934htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003935 int ret = 0;
3936 htmlParserInputPtr in;
Daniel Veillard365e13b2000-07-02 07:56:37 +00003937 int avail = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003938 xmlChar cur, next;
3939
3940#ifdef DEBUG_PUSH
3941 switch (ctxt->instate) {
3942 case XML_PARSER_EOF:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003943 xmlGenericError(xmlGenericErrorContext,
3944 "HPP: try EOF\n"); break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003945 case XML_PARSER_START:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003946 xmlGenericError(xmlGenericErrorContext,
3947 "HPP: try START\n"); break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003948 case XML_PARSER_MISC:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003949 xmlGenericError(xmlGenericErrorContext,
3950 "HPP: try MISC\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003951 case XML_PARSER_COMMENT:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003952 xmlGenericError(xmlGenericErrorContext,
3953 "HPP: try COMMENT\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003954 case XML_PARSER_PROLOG:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003955 xmlGenericError(xmlGenericErrorContext,
3956 "HPP: try PROLOG\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003957 case XML_PARSER_START_TAG:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003958 xmlGenericError(xmlGenericErrorContext,
3959 "HPP: try START_TAG\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003960 case XML_PARSER_CONTENT:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003961 xmlGenericError(xmlGenericErrorContext,
3962 "HPP: try CONTENT\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003963 case XML_PARSER_CDATA_SECTION:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003964 xmlGenericError(xmlGenericErrorContext,
3965 "HPP: try CDATA_SECTION\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003966 case XML_PARSER_END_TAG:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003967 xmlGenericError(xmlGenericErrorContext,
3968 "HPP: try END_TAG\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003969 case XML_PARSER_ENTITY_DECL:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003970 xmlGenericError(xmlGenericErrorContext,
3971 "HPP: try ENTITY_DECL\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003972 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003973 xmlGenericError(xmlGenericErrorContext,
3974 "HPP: try ENTITY_VALUE\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003975 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003976 xmlGenericError(xmlGenericErrorContext,
3977 "HPP: try ATTRIBUTE_VALUE\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003978 case XML_PARSER_DTD:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003979 xmlGenericError(xmlGenericErrorContext,
3980 "HPP: try DTD\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003981 case XML_PARSER_EPILOG:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003982 xmlGenericError(xmlGenericErrorContext,
3983 "HPP: try EPILOG\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003984 case XML_PARSER_PI:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003985 xmlGenericError(xmlGenericErrorContext,
3986 "HPP: try PI\n");break;
Daniel Veillard7eda8452000-10-14 23:38:43 +00003987 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003988 xmlGenericError(xmlGenericErrorContext,
3989 "HPP: try SYSTEM_LITERAL\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003990 }
3991#endif
3992
3993 while (1) {
3994
3995 in = ctxt->input;
3996 if (in == NULL) break;
3997 if (in->buf == NULL)
3998 avail = in->length - (in->cur - in->base);
3999 else
4000 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard365e13b2000-07-02 07:56:37 +00004001 if ((avail == 0) && (terminate)) {
4002 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00004003 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4004 /*
4005 * SAX: end of the document processing.
4006 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00004007 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00004008 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4009 ctxt->sax->endDocument(ctxt->userData);
4010 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004011 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004012 if (avail < 1)
4013 goto done;
4014 switch (ctxt->instate) {
4015 case XML_PARSER_EOF:
4016 /*
4017 * Document parsing is done !
4018 */
4019 goto done;
4020 case XML_PARSER_START:
4021 /*
4022 * Very first chars read from the document flow.
4023 */
4024 cur = in->cur[0];
4025 if (IS_BLANK(cur)) {
4026 SKIP_BLANKS;
4027 if (in->buf == NULL)
4028 avail = in->length - (in->cur - in->base);
4029 else
4030 avail = in->buf->buffer->use - (in->cur - in->base);
4031 }
4032 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4033 ctxt->sax->setDocumentLocator(ctxt->userData,
4034 &xmlDefaultSAXLocator);
Daniel Veillardd83eb822000-06-30 18:39:56 +00004035 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4036 (!ctxt->disableSAX))
4037 ctxt->sax->startDocument(ctxt->userData);
4038
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004039 cur = in->cur[0];
4040 next = in->cur[1];
4041 if ((cur == '<') && (next == '!') &&
4042 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4043 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4044 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4045 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004046 if ((!terminate) &&
4047 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004048 goto done;
4049#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004050 xmlGenericError(xmlGenericErrorContext,
4051 "HPP: Parsing internal subset\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004052#endif
4053 htmlParseDocTypeDecl(ctxt);
4054 ctxt->instate = XML_PARSER_PROLOG;
4055#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004056 xmlGenericError(xmlGenericErrorContext,
4057 "HPP: entering PROLOG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004058#endif
4059 } else {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004060 ctxt->instate = XML_PARSER_MISC;
4061 }
4062#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004063 xmlGenericError(xmlGenericErrorContext,
4064 "HPP: entering MISC\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004065#endif
4066 break;
4067 case XML_PARSER_MISC:
4068 SKIP_BLANKS;
4069 if (in->buf == NULL)
4070 avail = in->length - (in->cur - in->base);
4071 else
4072 avail = in->buf->buffer->use - (in->cur - in->base);
4073 if (avail < 2)
4074 goto done;
4075 cur = in->cur[0];
4076 next = in->cur[1];
4077 if ((cur == '<') && (next == '!') &&
4078 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004079 if ((!terminate) &&
4080 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004081 goto done;
4082#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004083 xmlGenericError(xmlGenericErrorContext,
4084 "HPP: Parsing Comment\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004085#endif
4086 htmlParseComment(ctxt);
4087 ctxt->instate = XML_PARSER_MISC;
4088 } else if ((cur == '<') && (next == '!') &&
4089 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4090 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4091 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4092 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004093 if ((!terminate) &&
4094 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004095 goto done;
4096#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004097 xmlGenericError(xmlGenericErrorContext,
4098 "HPP: Parsing internal subset\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004099#endif
4100 htmlParseDocTypeDecl(ctxt);
4101 ctxt->instate = XML_PARSER_PROLOG;
4102#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004103 xmlGenericError(xmlGenericErrorContext,
4104 "HPP: entering PROLOG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004105#endif
4106 } else if ((cur == '<') && (next == '!') &&
4107 (avail < 9)) {
4108 goto done;
4109 } else {
4110 ctxt->instate = XML_PARSER_START_TAG;
4111#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004112 xmlGenericError(xmlGenericErrorContext,
4113 "HPP: entering START_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004114#endif
4115 }
4116 break;
4117 case XML_PARSER_PROLOG:
4118 SKIP_BLANKS;
4119 if (in->buf == NULL)
4120 avail = in->length - (in->cur - in->base);
4121 else
4122 avail = in->buf->buffer->use - (in->cur - in->base);
4123 if (avail < 2)
4124 goto done;
4125 cur = in->cur[0];
4126 next = in->cur[1];
4127 if ((cur == '<') && (next == '!') &&
4128 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004129 if ((!terminate) &&
4130 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004131 goto done;
4132#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004133 xmlGenericError(xmlGenericErrorContext,
4134 "HPP: Parsing Comment\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004135#endif
4136 htmlParseComment(ctxt);
4137 ctxt->instate = XML_PARSER_PROLOG;
4138 } else if ((cur == '<') && (next == '!') &&
4139 (avail < 4)) {
4140 goto done;
4141 } else {
4142 ctxt->instate = XML_PARSER_START_TAG;
4143#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004144 xmlGenericError(xmlGenericErrorContext,
4145 "HPP: entering START_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004146#endif
4147 }
4148 break;
4149 case XML_PARSER_EPILOG:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004150 if (in->buf == NULL)
4151 avail = in->length - (in->cur - in->base);
4152 else
4153 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard87b95392000-08-12 21:12:04 +00004154 if (avail < 1)
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004155 goto done;
4156 cur = in->cur[0];
Daniel Veillard87b95392000-08-12 21:12:04 +00004157 if (IS_BLANK(cur)) {
4158 htmlParseCharData(ctxt, 0);
4159 goto done;
4160 }
4161 if (avail < 2)
4162 goto done;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004163 next = in->cur[1];
4164 if ((cur == '<') && (next == '!') &&
4165 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004166 if ((!terminate) &&
4167 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004168 goto done;
4169#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004170 xmlGenericError(xmlGenericErrorContext,
4171 "HPP: Parsing Comment\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004172#endif
4173 htmlParseComment(ctxt);
4174 ctxt->instate = XML_PARSER_EPILOG;
4175 } else if ((cur == '<') && (next == '!') &&
4176 (avail < 4)) {
4177 goto done;
4178 } else {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00004179 ctxt->errNo = XML_ERR_DOCUMENT_END;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004180 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4181 ctxt->sax->error(ctxt->userData,
4182 "Extra content at the end of the document\n");
4183 ctxt->wellFormed = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004184 ctxt->instate = XML_PARSER_EOF;
4185#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004186 xmlGenericError(xmlGenericErrorContext,
4187 "HPP: entering EOF\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004188#endif
4189 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4190 ctxt->sax->endDocument(ctxt->userData);
4191 goto done;
4192 }
4193 break;
4194 case XML_PARSER_START_TAG: {
4195 xmlChar *name, *oldname;
4196 int depth = ctxt->nameNr;
4197 htmlElemDescPtr info;
4198
4199 if (avail < 2)
4200 goto done;
4201 cur = in->cur[0];
4202 if (cur != '<') {
4203 ctxt->instate = XML_PARSER_CONTENT;
4204#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004205 xmlGenericError(xmlGenericErrorContext,
4206 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004207#endif
4208 break;
4209 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00004210 if ((!terminate) &&
4211 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004212 goto done;
4213
4214 oldname = xmlStrdup(ctxt->name);
4215 htmlParseStartTag(ctxt);
4216 name = ctxt->name;
4217#ifdef DEBUG
4218 if (oldname == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004219 xmlGenericError(xmlGenericErrorContext,
4220 "Start of element %s\n", name);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004221 else if (name == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004222 xmlGenericError(xmlGenericErrorContext,
4223 "Start of element failed, was %s\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004224 oldname);
4225 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004226 xmlGenericError(xmlGenericErrorContext,
4227 "Start of element %s, was %s\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004228 name, oldname);
4229#endif
4230 if (((depth == ctxt->nameNr) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +00004231 (xmlStrEqual(oldname, ctxt->name))) ||
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004232 (name == NULL)) {
4233 if (CUR == '>')
4234 NEXT;
4235 if (oldname != NULL)
4236 xmlFree(oldname);
4237 break;
4238 }
4239 if (oldname != NULL)
4240 xmlFree(oldname);
4241
4242 /*
4243 * Lookup the info for that element.
4244 */
4245 info = htmlTagLookup(name);
4246 if (info == NULL) {
4247 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4248 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4249 name);
4250 ctxt->wellFormed = 0;
4251 } else if (info->depr) {
4252 /***************************
4253 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4254 ctxt->sax->warning(ctxt->userData,
4255 "Tag %s is deprecated\n",
4256 name);
4257 ***************************/
4258 }
4259
4260 /*
4261 * Check for an Empty Element labelled the XML/SGML way
4262 */
4263 if ((CUR == '/') && (NXT(1) == '>')) {
4264 SKIP(2);
4265 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4266 ctxt->sax->endElement(ctxt->userData, name);
4267 oldname = htmlnamePop(ctxt);
4268#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004269 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004270 oldname);
4271#endif
4272 if (oldname != NULL)
4273 xmlFree(oldname);
4274 ctxt->instate = XML_PARSER_CONTENT;
4275#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004276 xmlGenericError(xmlGenericErrorContext,
4277 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004278#endif
4279 break;
4280 }
4281
4282 if (CUR == '>') {
4283 NEXT;
4284 } else {
4285 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4286 ctxt->sax->error(ctxt->userData,
4287 "Couldn't find end of Start Tag %s\n",
4288 name);
4289 ctxt->wellFormed = 0;
4290
4291 /*
4292 * end of parsing of this node.
4293 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00004294 if (xmlStrEqual(name, ctxt->name)) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004295 nodePop(ctxt);
4296 oldname = htmlnamePop(ctxt);
4297#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004298 xmlGenericError(xmlGenericErrorContext,
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004299 "End of start tag problem: popping out %s\n", oldname);
4300#endif
4301 if (oldname != NULL)
4302 xmlFree(oldname);
4303 }
4304
4305 ctxt->instate = XML_PARSER_CONTENT;
4306#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004307 xmlGenericError(xmlGenericErrorContext,
4308 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004309#endif
4310 break;
4311 }
4312
4313 /*
4314 * Check for an Empty Element from DTD definition
4315 */
4316 if ((info != NULL) && (info->empty)) {
4317 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4318 ctxt->sax->endElement(ctxt->userData, name);
4319 oldname = htmlnamePop(ctxt);
4320#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004321 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004322#endif
4323 if (oldname != NULL)
4324 xmlFree(oldname);
4325 }
4326 ctxt->instate = XML_PARSER_CONTENT;
4327#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004328 xmlGenericError(xmlGenericErrorContext,
4329 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004330#endif
4331 break;
4332 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004333 case XML_PARSER_CONTENT: {
4334 long cons;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004335 /*
4336 * Handle preparsed entities and charRef
4337 */
4338 if (ctxt->token != 0) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00004339 xmlChar chr[2] = { 0 , 0 } ;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004340
Daniel Veillard365e13b2000-07-02 07:56:37 +00004341 chr[0] = (xmlChar) ctxt->token;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004342 htmlCheckParagraph(ctxt);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004343 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard365e13b2000-07-02 07:56:37 +00004344 ctxt->sax->characters(ctxt->userData, chr, 1);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004345 ctxt->token = 0;
4346 ctxt->checkIndex = 0;
4347 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004348 if ((avail == 1) && (terminate)) {
4349 cur = in->cur[0];
4350 if ((cur != '<') && (cur != '&')) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004351 if (ctxt->sax != NULL) {
4352 if (IS_BLANK(cur)) {
4353 if (ctxt->sax->ignorableWhitespace != NULL)
4354 ctxt->sax->ignorableWhitespace(
4355 ctxt->userData, &cur, 1);
4356 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004357 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004358 if (ctxt->sax->characters != NULL)
4359 ctxt->sax->characters(
4360 ctxt->userData, &cur, 1);
4361 }
4362 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004363 ctxt->token = 0;
4364 ctxt->checkIndex = 0;
4365 NEXT;
4366 }
4367 break;
4368 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004369 if (avail < 2)
4370 goto done;
4371 cur = in->cur[0];
4372 next = in->cur[1];
Daniel Veillard87b95392000-08-12 21:12:04 +00004373 cons = ctxt->nbChars;
Daniel Veillard7eda8452000-10-14 23:38:43 +00004374 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4375 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004376 /*
Daniel Veillard7eda8452000-10-14 23:38:43 +00004377 * Handle SCRIPT/STYLE separately
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004378 */
Daniel Veillard7eda8452000-10-14 23:38:43 +00004379 if ((!terminate) &&
4380 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4381 goto done;
4382 htmlParseScript(ctxt);
4383 if ((cur == '<') && (next == '/')) {
4384 ctxt->instate = XML_PARSER_END_TAG;
4385 ctxt->checkIndex = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004386#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004387 xmlGenericError(xmlGenericErrorContext,
4388 "HPP: entering END_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004389#endif
Daniel Veillard7eda8452000-10-14 23:38:43 +00004390 break;
4391 }
4392 } else {
4393 /*
4394 * Sometimes DOCTYPE arrives in the middle of the document
4395 */
4396 if ((cur == '<') && (next == '!') &&
4397 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4398 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4399 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4400 (UPP(8) == 'E')) {
4401 if ((!terminate) &&
4402 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4403 goto done;
4404 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4405 ctxt->sax->error(ctxt->userData,
4406 "Misplaced DOCTYPE declaration\n");
4407 ctxt->wellFormed = 0;
4408 htmlParseDocTypeDecl(ctxt);
4409 } else if ((cur == '<') && (next == '!') &&
4410 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4411 if ((!terminate) &&
4412 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4413 goto done;
4414#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004415 xmlGenericError(xmlGenericErrorContext,
4416 "HPP: Parsing Comment\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004417#endif
4418 htmlParseComment(ctxt);
4419 ctxt->instate = XML_PARSER_CONTENT;
4420 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4421 goto done;
4422 } else if ((cur == '<') && (next == '/')) {
4423 ctxt->instate = XML_PARSER_END_TAG;
4424 ctxt->checkIndex = 0;
4425#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004426 xmlGenericError(xmlGenericErrorContext,
4427 "HPP: entering END_TAG\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004428#endif
4429 break;
4430 } else if (cur == '<') {
4431 ctxt->instate = XML_PARSER_START_TAG;
4432 ctxt->checkIndex = 0;
4433#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004434 xmlGenericError(xmlGenericErrorContext,
4435 "HPP: entering START_TAG\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004436#endif
4437 break;
4438 } else if (cur == '&') {
4439 if ((!terminate) &&
4440 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4441 goto done;
4442#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: Parsing Reference\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004445#endif
4446 /* TODO: check generation of subtrees if noent !!! */
4447 htmlParseReference(ctxt);
4448 } else {
4449 /* TODO Avoid the extra copy, handle directly !!!!!! */
4450 /*
4451 * Goal of the following test is :
4452 * - minimize calls to the SAX 'character' callback
4453 * when they are mergeable
4454 */
4455 if ((ctxt->inputNr == 1) &&
4456 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4457 if ((!terminate) &&
4458 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4459 goto done;
4460 }
4461 ctxt->checkIndex = 0;
4462#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004463 xmlGenericError(xmlGenericErrorContext,
4464 "HPP: Parsing char data\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004465#endif
4466 htmlParseCharData(ctxt, 0);
4467 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004468 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004469 if (cons == ctxt->nbChars) {
4470 if (ctxt->node != NULL) {
4471 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4472 ctxt->sax->error(ctxt->userData,
4473 "detected an error in element content\n");
4474 ctxt->wellFormed = 0;
Daniel Veillard87b95392000-08-12 21:12:04 +00004475 }
Daniel Veillard8ddb5a72000-09-23 10:28:52 +00004476 NEXT;
Daniel Veillard87b95392000-08-12 21:12:04 +00004477 break;
4478 }
4479
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004480 break;
Daniel Veillard87b95392000-08-12 21:12:04 +00004481 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004482 case XML_PARSER_END_TAG:
4483 if (avail < 2)
4484 goto done;
Daniel Veillard71b656e2000-01-05 14:46:17 +00004485 if ((!terminate) &&
4486 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004487 goto done;
4488 htmlParseEndTag(ctxt);
4489 if (ctxt->nameNr == 0) {
4490 ctxt->instate = XML_PARSER_EPILOG;
4491 } else {
4492 ctxt->instate = XML_PARSER_CONTENT;
4493 }
4494 ctxt->checkIndex = 0;
4495#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004496 xmlGenericError(xmlGenericErrorContext,
4497 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004498#endif
4499 break;
4500 case XML_PARSER_CDATA_SECTION:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004501 xmlGenericError(xmlGenericErrorContext,
4502 "HPP: internal error, state == CDATA\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004503 ctxt->instate = XML_PARSER_CONTENT;
4504 ctxt->checkIndex = 0;
4505#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004506 xmlGenericError(xmlGenericErrorContext,
4507 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004508#endif
4509 break;
4510 case XML_PARSER_DTD:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004511 xmlGenericError(xmlGenericErrorContext,
4512 "HPP: internal error, state == DTD\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004513 ctxt->instate = XML_PARSER_CONTENT;
4514 ctxt->checkIndex = 0;
4515#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004516 xmlGenericError(xmlGenericErrorContext,
4517 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004518#endif
4519 break;
4520 case XML_PARSER_COMMENT:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004521 xmlGenericError(xmlGenericErrorContext,
4522 "HPP: internal error, state == COMMENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004523 ctxt->instate = XML_PARSER_CONTENT;
4524 ctxt->checkIndex = 0;
4525#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004528#endif
4529 break;
4530 case XML_PARSER_PI:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004531 xmlGenericError(xmlGenericErrorContext,
4532 "HPP: internal error, state == PI\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004533 ctxt->instate = XML_PARSER_CONTENT;
4534 ctxt->checkIndex = 0;
4535#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004536 xmlGenericError(xmlGenericErrorContext,
4537 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004538#endif
4539 break;
4540 case XML_PARSER_ENTITY_DECL:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: internal error, state == ENTITY_DECL\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004543 ctxt->instate = XML_PARSER_CONTENT;
4544 ctxt->checkIndex = 0;
4545#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004546 xmlGenericError(xmlGenericErrorContext,
4547 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004548#endif
4549 break;
4550 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004551 xmlGenericError(xmlGenericErrorContext,
4552 "HPP: internal error, state == ENTITY_VALUE\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004553 ctxt->instate = XML_PARSER_CONTENT;
4554 ctxt->checkIndex = 0;
4555#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004556 xmlGenericError(xmlGenericErrorContext,
4557 "HPP: entering DTD\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004558#endif
4559 break;
4560 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004561 xmlGenericError(xmlGenericErrorContext,
4562 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004563 ctxt->instate = XML_PARSER_START_TAG;
4564 ctxt->checkIndex = 0;
4565#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004566 xmlGenericError(xmlGenericErrorContext,
4567 "HPP: entering START_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004568#endif
4569 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004570 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004571 xmlGenericError(xmlGenericErrorContext,
4572 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004573 ctxt->instate = XML_PARSER_CONTENT;
4574 ctxt->checkIndex = 0;
4575#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004576 xmlGenericError(xmlGenericErrorContext,
4577 "HPP: entering CONTENT\n");
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004578#endif
4579 break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004580 }
4581 }
4582done:
Daniel Veillard365e13b2000-07-02 07:56:37 +00004583 if ((avail == 0) && (terminate)) {
4584 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00004585 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4586 /*
4587 * SAX: end of the document processing.
4588 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00004589 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00004590 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4591 ctxt->sax->endDocument(ctxt->userData);
4592 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004593 }
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004594 if ((ctxt->myDoc != NULL) &&
4595 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4596 (ctxt->instate == XML_PARSER_EPILOG))) {
4597 xmlDtdPtr dtd;
4598 dtd = xmlGetIntSubset(ctxt->myDoc);
4599 if (dtd == NULL)
4600 ctxt->myDoc->intSubset =
4601 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4602 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4603 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4604 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004605#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004606 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004607#endif
4608 return(ret);
4609}
4610
4611/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00004612 * htmlParseTry:
4613 * @ctxt: an HTML parser context
4614 *
4615 * Try to progress on parsing
4616 *
4617 * Returns zero if no parsing was possible
4618 */
4619int
4620htmlParseTry(htmlParserCtxtPtr ctxt) {
4621 return(htmlParseTryOrFinish(ctxt, 0));
4622}
4623
4624/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004625 * htmlParseChunk:
4626 * @ctxt: an XML parser context
4627 * @chunk: an char array
4628 * @size: the size in byte of the chunk
4629 * @terminate: last chunk indicator
4630 *
4631 * Parse a Chunk of memory
4632 *
4633 * Returns zero if no error, the xmlParserErrors otherwise.
4634 */
4635int
4636htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4637 int terminate) {
4638 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4639 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4640 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4641 int cur = ctxt->input->cur - ctxt->input->base;
4642
4643 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4644 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4645 ctxt->input->cur = ctxt->input->base + cur;
4646#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004647 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004648#endif
4649
Daniel Veillardd0f7f742000-02-02 17:42:48 +00004650 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4651 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004652 } else if (ctxt->instate != XML_PARSER_EOF) {
4653 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
Daniel Veillard71b656e2000-01-05 14:46:17 +00004654 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004655 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004656 if (terminate) {
4657 if ((ctxt->instate != XML_PARSER_EOF) &&
4658 (ctxt->instate != XML_PARSER_EPILOG) &&
4659 (ctxt->instate != XML_PARSER_MISC)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00004660 ctxt->errNo = XML_ERR_DOCUMENT_END;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004661 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4662 ctxt->sax->error(ctxt->userData,
4663 "Extra content at the end of the document\n");
4664 ctxt->wellFormed = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004665 }
4666 if (ctxt->instate != XML_PARSER_EOF) {
4667 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4668 ctxt->sax->endDocument(ctxt->userData);
4669 }
4670 ctxt->instate = XML_PARSER_EOF;
4671 }
4672 return((xmlParserErrors) ctxt->errNo);
4673}
4674
4675/************************************************************************
4676 * *
4677 * User entry points *
4678 * *
4679 ************************************************************************/
4680
4681/**
4682 * htmlCreatePushParserCtxt :
4683 * @sax: a SAX handler
4684 * @user_data: The user data returned on SAX callbacks
4685 * @chunk: a pointer to an array of chars
4686 * @size: number of chars in the array
4687 * @filename: an optional file name or URI
4688 * @enc: an optional encoding
4689 *
4690 * Create a parser context for using the HTML parser in push mode
4691 * To allow content encoding detection, @size should be >= 4
4692 * The value of @filename is used for fetching external entities
4693 * and error/warning reports.
4694 *
4695 * Returns the new parser context or NULL
4696 */
4697htmlParserCtxtPtr
4698htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4699 const char *chunk, int size, const char *filename,
4700 xmlCharEncoding enc) {
4701 htmlParserCtxtPtr ctxt;
4702 htmlParserInputPtr inputStream;
4703 xmlParserInputBufferPtr buf;
4704
4705 buf = xmlAllocParserInputBuffer(enc);
4706 if (buf == NULL) return(NULL);
4707
4708 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4709 if (ctxt == NULL) {
4710 xmlFree(buf);
4711 return(NULL);
4712 }
4713 memset(ctxt, 0, sizeof(htmlParserCtxt));
4714 htmlInitParserCtxt(ctxt);
4715 if (sax != NULL) {
4716 if (ctxt->sax != &htmlDefaultSAXHandler)
4717 xmlFree(ctxt->sax);
4718 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4719 if (ctxt->sax == NULL) {
4720 xmlFree(buf);
4721 xmlFree(ctxt);
4722 return(NULL);
4723 }
4724 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4725 if (user_data != NULL)
4726 ctxt->userData = user_data;
4727 }
4728 if (filename == NULL) {
4729 ctxt->directory = NULL;
4730 } else {
4731 ctxt->directory = xmlParserGetDirectory(filename);
4732 }
4733
4734 inputStream = htmlNewInputStream(ctxt);
4735 if (inputStream == NULL) {
4736 xmlFreeParserCtxt(ctxt);
4737 return(NULL);
4738 }
4739
4740 if (filename == NULL)
4741 inputStream->filename = NULL;
4742 else
4743 inputStream->filename = xmlMemStrdup(filename);
4744 inputStream->buf = buf;
4745 inputStream->base = inputStream->buf->buffer->content;
4746 inputStream->cur = inputStream->buf->buffer->content;
4747
4748 inputPush(ctxt, inputStream);
4749
4750 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4751 (ctxt->input->buf != NULL)) {
4752 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4753#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004754 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004755#endif
4756 }
4757
4758 return(ctxt);
4759}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004760
4761/**
4762 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004763 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004764 * @encoding: a free form C string describing the HTML document encoding, or NULL
4765 * @sax: the SAX handler block
4766 * @userData: if using SAX, this pointer will be provided on callbacks.
4767 *
4768 * parse an HTML in-memory document and build a tree.
4769 * It use the given SAX function block to handle the parsing callback.
4770 * If sax is NULL, fallback to the default DOM tree building routines.
4771 *
4772 * Returns the resulting document tree
4773 */
4774
4775htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004776htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004777 htmlDocPtr ret;
4778 htmlParserCtxtPtr ctxt;
4779
4780 if (cur == NULL) return(NULL);
4781
4782
4783 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4784 if (ctxt == NULL) return(NULL);
4785 if (sax != NULL) {
4786 ctxt->sax = sax;
4787 ctxt->userData = userData;
4788 }
4789
4790 htmlParseDocument(ctxt);
4791 ret = ctxt->myDoc;
4792 if (sax != NULL) {
4793 ctxt->sax = NULL;
4794 ctxt->userData = NULL;
4795 }
4796 htmlFreeParserCtxt(ctxt);
4797
4798 return(ret);
4799}
4800
4801/**
4802 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004803 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004804 * @encoding: a free form C string describing the HTML document encoding, or NULL
4805 *
4806 * parse an HTML in-memory document and build a tree.
4807 *
4808 * Returns the resulting document tree
4809 */
4810
4811htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004812htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004813 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4814}
4815
4816
4817/**
4818 * htmlCreateFileParserCtxt :
4819 * @filename: the filename
4820 * @encoding: a free form C string describing the HTML document encoding, or NULL
4821 *
4822 * Create a parser context for a file content.
4823 * Automatic support for ZLIB/Compress compressed document is provided
4824 * by default if found at compile-time.
4825 *
4826 * Returns the new parser context or NULL
4827 */
4828htmlParserCtxtPtr
4829htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4830{
4831 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004832 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004833 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004834 /* htmlCharEncoding enc; */
Daniel Veillarda6d8eb62000-12-27 10:46:47 +00004835 xmlChar *content, *content_line = (xmlChar *) "charset=";
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004836
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004837 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4838 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004839
Daniel Veillard6454aec1999-09-02 22:04:43 +00004840 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004841 if (ctxt == NULL) {
4842 perror("malloc");
4843 return(NULL);
4844 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004845 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004846 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00004847 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004848 if (inputStream == NULL) {
4849 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00004850 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004851 return(NULL);
4852 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004853 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004854
Daniel Veillard6454aec1999-09-02 22:04:43 +00004855 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004856 inputStream->line = 1;
4857 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004858 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00004859 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004860
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004861 inputStream->base = inputStream->buf->buffer->content;
4862 inputStream->cur = inputStream->buf->buffer->content;
4863 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004864
4865 inputPush(ctxt, inputStream);
Daniel Veillarda6d8eb62000-12-27 10:46:47 +00004866
4867 /* set encoding */
4868 if (encoding) {
4869 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4870 if (content) {
4871 strcpy ((char *)content, (char *)content_line);
4872 strcat ((char *)content, (char *)encoding);
4873 htmlCheckEncoding (ctxt, content);
4874 xmlFree (content);
4875 }
4876 }
4877
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004878 return(ctxt);
4879}
4880
4881/**
4882 * htmlSAXParseFile :
4883 * @filename: the filename
4884 * @encoding: a free form C string describing the HTML document encoding, or NULL
4885 * @sax: the SAX handler block
4886 * @userData: if using SAX, this pointer will be provided on callbacks.
4887 *
4888 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4889 * compressed document is provided by default if found at compile-time.
4890 * It use the given SAX function block to handle the parsing callback.
4891 * If sax is NULL, fallback to the default DOM tree building routines.
4892 *
4893 * Returns the resulting document tree
4894 */
4895
4896htmlDocPtr
4897htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4898 void *userData) {
4899 htmlDocPtr ret;
4900 htmlParserCtxtPtr ctxt;
Daniel Veillard87b95392000-08-12 21:12:04 +00004901 htmlSAXHandlerPtr oldsax = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004902
4903 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4904 if (ctxt == NULL) return(NULL);
4905 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004906 oldsax = ctxt->sax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004907 ctxt->sax = sax;
4908 ctxt->userData = userData;
4909 }
4910
4911 htmlParseDocument(ctxt);
4912
4913 ret = ctxt->myDoc;
4914 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004915 ctxt->sax = oldsax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004916 ctxt->userData = NULL;
4917 }
4918 htmlFreeParserCtxt(ctxt);
4919
4920 return(ret);
4921}
4922
4923/**
4924 * htmlParseFile :
4925 * @filename: the filename
4926 * @encoding: a free form C string describing the HTML document encoding, or NULL
4927 *
4928 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4929 * compressed document is provided by default if found at compile-time.
4930 *
4931 * Returns the resulting document tree
4932 */
4933
4934htmlDocPtr
4935htmlParseFile(const char *filename, const char *encoding) {
4936 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4937}
Daniel Veillard361d8452000-04-03 19:48:13 +00004938
Daniel Veillarda6d8eb62000-12-27 10:46:47 +00004939/**
4940 * htmlHandleOmittedElem:
4941 * @val: int 0 or 1
4942 *
4943 * Set and return the previous value for handling HTML omitted tags.
4944 *
4945 * Returns the last value for 0 for no handling, 1 for auto insertion.
4946 */
4947
4948int
4949htmlHandleOmittedElem(int val) {
4950 int old = htmlOmittedDefaultValue;
4951
4952 htmlOmittedDefaultValue = val;
4953 return(old);
4954}
4955
Daniel Veillard361d8452000-04-03 19:48:13 +00004956#endif /* LIBXML_HTML_ENABLED */