blob: f2831f6b86446dfbbb560c8b9de47d0c428374ba [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillardb71379b2000-10-09 12:30:39 +000015#include <libxml/xmlversion.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000016#ifdef LIBXML_HTML_ENABLED
Daniel Veillardbe70ff71999-07-05 16:50:46 +000017#include <stdio.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000018#include <string.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000019#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000023#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000024#endif
25#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000026#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000027#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000028#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
Daniel Veillard361d8452000-04-03 19:48:13 +000038#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
Daniel Veillardaaf58b92000-10-06 14:07:26 +000040#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
Daniel Veillardb71379b2000-10-09 12:30:39 +000042#include <libxml/xmlerror.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000043#include <libxml/HTMLparser.h>
44#include <libxml/entities.h>
45#include <libxml/encoding.h>
46#include <libxml/valid.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000047#include <libxml/xmlIO.h>
Daniel Veillarde2d034d1999-07-27 19:52:06 +000048
49#define HTML_MAX_NAMELEN 1000
Daniel Veillard32bc74e2000-07-14 14:49:25 +000050#define HTML_PARSER_BIG_BUFFER_SIZE 1000
Daniel Veillard5e5c6231999-12-29 12:49:06 +000051#define HTML_PARSER_BUFFER_SIZE 100
Daniel Veillardbe70ff71999-07-05 16:50:46 +000052
Daniel Veillard82150d81999-07-07 07:32:15 +000053/* #define DEBUG */
Daniel Veillard5e5c6231999-12-29 12:49:06 +000054/* #define DEBUG_PUSH */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000055
56/************************************************************************
57 * *
58 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
Daniel Veillarddbfd6411999-12-28 16:35:14 +000066#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000068 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
Daniel Veillard32bc74e2000-07-14 14:49:25 +000070 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000071 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +000073 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
Daniel Veillard0142b842000-01-14 14:45:24 +000075 return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000076 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
Daniel Veillarddbfd6411999-12-28 16:35:14 +000082scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000083 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000084 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000085 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000086 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000087 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillarddbfd6411999-12-28 16:35:14 +000096PUSH_AND_POP(extern, xmlNodePtr, node)
97PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000098
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt' to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value as an int. It should be used
 *            internally by the parser only to compare against ASCII
 *            values, otherwise it would break when running with UTF-8
 *            encoding.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare against ASCII based substrings.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare against ASCII based
 *            substrings.
 *   SKIP(n)  skip n xmlChar; must also be used only to skip ASCII
 *            defined strings within the parser.
 *   RAW      the current xmlChar, or -1 when ctxt->token is set.
 *
 * Other helpers:
 *
 *   CURRENT       the current char value as an int.
 *   NEXT          advance through xmlNextChar() and count one char.
 *   NEXTL(l)      advance by l bytes, updating line/col and nbChars and
 *                 clearing ctxt->token.
 *   SKIP_BLANKS   skip blanks through htmlSkipBlankChars().
 *   SHRINK/GROW   shrink or grow the current input buffer.
 *   CUR_CHAR(l)   current char through htmlCurrentChar(), length in l.
 *   CUR_SCHAR(s,l) current char of string s, length in l.
 *   COPY_BUF(l,b,i,v) append char v of encoded length l to buffer b at
 *                 index i, advancing i.
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Parenthesized so the comma expression stays one unit wherever used. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &(l))

/*
 * Wrapped in do/while(0) so that the inner if/else cannot capture an
 * `else' belonging to the caller.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
187 * Returns the current char value and its lenght
188 */
189
190int
191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +0000253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
269 * Assume it's a fixed lenght encoding (1) with
270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
Daniel Veillarda2c6da92000-09-16 18:15:00 +0000292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
Daniel Veillardcf461992000-03-14 18:30:20 +0000306/**
307 * htmlNextChar:
308 * @ctxt: the HTML parser context
309 *
310 * Skip to the next char input char.
311 */
312
313void
314htmlNextChar(htmlParserCtxtPtr ctxt) {
Daniel Veillard3f6f7f62000-06-30 17:58:25 +0000315 if (ctxt->instate == XML_PARSER_EOF)
316 return;
Daniel Veillardcf461992000-03-14 18:30:20 +0000317 if ((*ctxt->input->cur == 0) &&
318 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
319 xmlPopInput(ctxt);
320 } else {
321 if (*(ctxt->input->cur) == '\n') {
322 ctxt->input->line++; ctxt->input->col = 1;
323 } else ctxt->input->col++;
324 ctxt->input->cur++;
325 ctxt->nbChars++;
326 if (*ctxt->input->cur == 0)
327 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
328 }
329}
330
331/**
332 * htmlSkipBlankChars:
333 * @ctxt: the HTML parser context
334 *
335 * skip all blanks character found at that point in the input streams.
336 *
337 * Returns the number of space chars skipped
338 */
339
340int
341htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
342 int res = 0;
343
344 while (IS_BLANK(*(ctxt->input->cur))) {
345 if ((*ctxt->input->cur == 0) &&
346 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
347 xmlPopInput(ctxt);
348 } else {
349 if (*(ctxt->input->cur) == '\n') {
350 ctxt->input->line++; ctxt->input->col = 1;
351 } else ctxt->input->col++;
352 ctxt->input->cur++;
353 ctxt->nbChars++;
354 if (*ctxt->input->cur == 0)
355 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
356 }
357 res++;
358 }
359 return(res);
360}
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000361
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000362
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000363
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000364/************************************************************************
365 * *
366 * The list of HTML elements and their properties *
367 * *
368 ************************************************************************/
369
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000370/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000371 * Start Tag: 1 means the start tag can be ommited
372 * End Tag: 1 means the end tag can be ommited
373 * 2 means it's forbidden (empty elements)
374 * Depr: this element is deprecated
375 * DTD: 1 means that this element is valid only in the Loose DTD
376 * 2 means that this element is valid only in the Frameset DTD
377 *
378 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000379 */
380htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000381{ "a", 0, 0, 0, 0, 0, "anchor " },
382{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
383{ "acronym", 0, 0, 0, 0, 0, "" },
384{ "address", 0, 0, 0, 0, 0, "information on author " },
385{ "applet", 0, 0, 0, 1, 1, "java applet " },
386{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
387{ "b", 0, 0, 0, 0, 0, "bold text style" },
388{ "base", 0, 2, 1, 0, 0, "document base uri " },
389{ "basefont", 0, 2, 1, 1, 1, "base font size " },
390{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
391{ "big", 0, 0, 0, 0, 0, "large text style" },
392{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
393{ "body", 1, 1, 0, 0, 0, "document body " },
394{ "br", 0, 2, 1, 0, 0, "forced line break " },
395{ "button", 0, 0, 0, 0, 0, "push button " },
396{ "caption", 0, 0, 0, 0, 0, "table caption " },
397{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
398{ "cite", 0, 0, 0, 0, 0, "citation" },
399{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
400{ "col", 0, 2, 1, 0, 0, "table column " },
401{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
402{ "dd", 0, 1, 0, 0, 0, "definition description " },
403{ "del", 0, 0, 0, 0, 0, "deleted text " },
404{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
405{ "dir", 0, 0, 0, 1, 1, "directory list" },
406{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
407{ "dl", 0, 0, 0, 0, 0, "definition list " },
408{ "dt", 0, 1, 0, 0, 0, "definition term " },
409{ "em", 0, 0, 0, 0, 0, "emphasis" },
410{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
411{ "font", 0, 0, 0, 1, 1, "local change to font " },
412{ "form", 0, 0, 0, 0, 0, "interactive form " },
413{ "frame", 0, 2, 1, 0, 2, "subwindow " },
414{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
415{ "h1", 0, 0, 0, 0, 0, "heading " },
416{ "h2", 0, 0, 0, 0, 0, "heading " },
417{ "h3", 0, 0, 0, 0, 0, "heading " },
418{ "h4", 0, 0, 0, 0, 0, "heading " },
419{ "h5", 0, 0, 0, 0, 0, "heading " },
420{ "h6", 0, 0, 0, 0, 0, "heading " },
421{ "head", 1, 1, 0, 0, 0, "document head " },
422{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
423{ "html", 1, 1, 0, 0, 0, "document root element " },
424{ "i", 0, 0, 0, 0, 0, "italic text style" },
425{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
426{ "img", 0, 2, 1, 0, 0, "embedded image " },
427{ "input", 0, 2, 1, 0, 0, "form control " },
428{ "ins", 0, 0, 0, 0, 0, "inserted text" },
429{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
430{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
431{ "label", 0, 0, 0, 0, 0, "form field label text " },
432{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
433{ "li", 0, 1, 0, 0, 0, "list item " },
434{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
435{ "map", 0, 0, 0, 0, 0, "client-side image map " },
436{ "menu", 0, 0, 0, 1, 1, "menu list " },
437{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
438{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
439{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
440{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
441{ "ol", 0, 0, 0, 0, 0, "ordered list " },
442{ "optgroup", 0, 0, 0, 0, 0, "option group " },
443{ "option", 0, 1, 0, 0, 0, "selectable choice " },
444{ "p", 0, 1, 0, 0, 0, "paragraph " },
445{ "param", 0, 2, 1, 0, 0, "named property value " },
446{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
447{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
448{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
449{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
450{ "script", 0, 0, 0, 0, 0, "script statements " },
451{ "select", 0, 0, 0, 0, 0, "option selector " },
452{ "small", 0, 0, 0, 0, 0, "small text style" },
453{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
454{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
455{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
456{ "style", 0, 0, 0, 0, 0, "style info " },
457{ "sub", 0, 0, 0, 0, 0, "subscript" },
458{ "sup", 0, 0, 0, 0, 0, "superscript " },
459{ "table", 0, 0, 0, 0, 0, "&#160;" },
460{ "tbody", 1, 1, 0, 0, 0, "table body " },
461{ "td", 0, 1, 0, 0, 0, "table data cell" },
462{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
463{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
464{ "th", 0, 1, 0, 0, 0, "table header cell" },
465{ "thead", 0, 1, 0, 0, 0, "table header " },
466{ "title", 0, 0, 0, 0, 0, "document title " },
467{ "tr", 0, 1, 0, 0, 0, "table row " },
468{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
469{ "u", 0, 0, 0, 1, 1, "underlined text style" },
470{ "ul", 0, 0, 0, 0, 0, "unordered list " },
471{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000472};
473
/*
 * Start tags that imply the end of a current element.
 * The table is a sequence of NULL-terminated groups, closed by one extra
 * NULL: any tag of a group implies the end of the current element when
 * the type of that element belongs to the same group.
 *
 * According to the HTML DTD, HR should be added to the heading group, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
char *htmlEquEnd[] = {
    /* list-like items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of table */
    NULL
};
490
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups, closed by one extra
 * NULL. The first string of each group is the tag being opened; the
 * strings following it, up to the NULL, are the element names which that
 * start tag closes.
 */
char *htmlStartClose[] = {
    "form",
	"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
	"dl", "ul", "ol", "menu", "dir", "address", "pre",
	"listing", "xmp", "head",
	NULL,
    "head",
	"p",
	NULL,
    "title",
	"p",
	NULL,
    "body",
	"head", "style", "link", "title", "p",
	NULL,
    "li",
	"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
	"pre", "listing", "xmp", "head", "li",
	NULL,
    "hr",
	"p", "head",
	NULL,
    "h1",
	"p", "head",
	NULL,
    "h2",
	"p", "head",
	NULL,
    "h3",
	"p", "head",
	NULL,
    "h4",
	"p", "head",
	NULL,
    "h5",
	"p", "head",
	NULL,
    "h6",
	"p", "head",
	NULL,
    "dir",
	"p", "head",
	NULL,
    "address",
	"p", "head", "ul",
	NULL,
    "pre",
	"p", "head", "ul",
	NULL,
    "listing",
	"p", "head",
	NULL,
    "xmp",
	"p", "head",
	NULL,
    "blockquote",
	"p", "head",
	NULL,
    "dl",
	"p", "dt", "menu", "dir", "address", "pre", "listing",
	"xmp", "head",
	NULL,
    "dt",
	"p", "menu", "dir", "address", "pre", "listing", "xmp",
	"head", "dd",
	NULL,
    "dd",
	"p", "menu", "dir", "address", "pre", "listing", "xmp",
	"head", "dt",
	NULL,
    "ul",
	"p", "head", "ol", "menu", "dir", "address", "pre",
	"listing", "xmp",
	NULL,
    "ol",
	"p", "head", "ul",
	NULL,
    "menu",
	"p", "head", "ul",
	NULL,
    "p",
	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
	NULL,
    "div",
	"p", "head",
	NULL,
    "noscript",
	"p", "head",
	NULL,
    "center",
	"font", "b", "i", "p", "head",
	NULL,
    "a",
	"a",
	NULL,
    "caption",
	"p",
	NULL,
    "colgroup",
	"caption", "colgroup", "col", "p",
	NULL,
    "col",
	"caption", "col", "p",
	NULL,
    "table",
	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
	"listing", "xmp", "a",
	NULL,
    "th",
	"th", "td",
	NULL,
    "td",
	"th", "td", "p",
	NULL,
    "tr",
	"th", "td", "tr", "caption", "col", "colgroup", "p",
	NULL,
    "thead",
	"caption", "col", "colgroup",
	NULL,
    "tfoot",
	"th", "td", "tr", "caption", "col", "colgroup", "thead",
	"tbody", "p",
	NULL,
    "tbody",
	"th", "td", "tr", "caption", "col", "colgroup", "thead",
	"tfoot", "tbody", "p",
	NULL,
    "optgroup",
	"option",
	NULL,
    "option",
	"option",
	NULL,
    "fieldset",
	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
	"pre", "listing", "xmp", "a",
	NULL,
    /* end of table */
    NULL
};
550
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly, and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL	/* end of list */
};
564
/*
 * The list of HTML attributes which are of content %Script;
 * i.e. the HTML 4 intrinsic event attributes. The array has no NULL
 * terminator: iterate using sizeof().
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest" */
    "onchange",
    "onselect"
};
590
591
Daniel Veillardb96e6431999-08-29 21:02:19 +0000592static char** htmlStartCloseIndex[100];
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000593static int htmlStartCloseIndexinitialized = 0;
594
595/************************************************************************
596 * *
597 * functions to handle HTML specific data *
598 * *
599 ************************************************************************/
600
601/**
602 * htmlInitAutoClose:
603 *
604 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
Daniel Veillardbc765302000-10-01 18:23:35 +0000605 * This is not reentrant. Call xmlInitParser() once before processing in
606 * case of use in multithreaded programs.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000607 */
608void
609htmlInitAutoClose(void) {
610 int index, i = 0;
611
612 if (htmlStartCloseIndexinitialized) return;
613
614 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
615 index = 0;
616 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
617 htmlStartCloseIndex[index++] = &htmlStartClose[i];
618 while (htmlStartClose[i] != NULL) i++;
619 i++;
620 }
Daniel Veillardbc765302000-10-01 18:23:35 +0000621 htmlStartCloseIndexinitialized = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000622}
623
624/**
625 * htmlTagLookup:
Daniel Veillardb656ebe2000-09-22 13:51:48 +0000626 * @tag: The tag name in lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000627 *
628 * Lookup the HTML tag in the ElementTable
629 *
630 * Returns the related htmlElemDescPtr or NULL if not found.
631 */
632htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000633htmlTagLookup(const xmlChar *tag) {
Daniel Veillard47f3f312000-08-27 22:40:15 +0000634 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000635
636 for (i = 0; i < (sizeof(html40ElementTable) /
637 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000638 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000639 return(&html40ElementTable[i]);
640 }
641 return(NULL);
642}
643
644/**
645 * htmlCheckAutoClose:
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000646 * @newtag: The new tag name
647 * @oldtag: The old tag name
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000648 *
649 * Checks wether the new tag is one of the registered valid tags for closing old.
650 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
651 *
652 * Returns 0 if no, 1 if yes.
653 */
654int
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000655htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000656 int i, index;
Daniel Veillard39c7d712000-09-10 16:14:55 +0000657 char **close = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000658
659 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
660
661 /* inefficient, but not a big deal */
662 for (index = 0; index < 100;index++) {
663 close = htmlStartCloseIndex[index];
664 if (close == NULL) return(0);
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000665 if (xmlStrEqual(BAD_CAST *close, newtag)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000666 }
667
668 i = close - htmlStartClose;
669 i++;
670 while (htmlStartClose[i] != NULL) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000671 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000672 return(1);
673 }
674 i++;
675 }
676 return(0);
677}
678
679/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000680 * htmlAutoCloseOnClose:
681 * @ctxt: an HTML parser context
682 * @newtag: The new tag name
683 *
684 * The HTmL DtD allows an ending tag to implicitely close other tags.
685 */
686void
687htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
688 htmlElemDescPtr info;
689 xmlChar *oldname;
690 int i;
691
692#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000693 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000694 for (i = 0;i < ctxt->nameNr;i++)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000695 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000696#endif
697
698 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000699 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000700 }
701 if (i < 0) return;
702
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000703 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000704 info = htmlTagLookup(ctxt->name);
705 if ((info == NULL) || (info->endTag == 1)) {
706#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000707 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000708#endif
709 } else {
710 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
711 ctxt->sax->error(ctxt->userData,
712 "Opening and ending tag mismatch: %s and %s\n",
713 newtag, ctxt->name);
714 ctxt->wellFormed = 0;
715 }
716 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
717 ctxt->sax->endElement(ctxt->userData, ctxt->name);
718 oldname = htmlnamePop(ctxt);
719 if (oldname != NULL) {
720#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000721 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000722#endif
723 xmlFree(oldname);
724 }
725 }
726}
727
728/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000729 * htmlAutoClose:
730 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000731 * @newtag: The new tag name or NULL
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000732 *
733 * The HTmL DtD allows a tag to implicitely close other tags.
734 * The list is kept in htmlStartClose array. This function is
735 * called when a new tag has been detected and generates the
736 * appropriates closes if possible/needed.
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000737 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillard365e13b2000-07-02 07:56:37 +0000738 * and we should check
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000739 */
740void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000741htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000742 xmlChar *oldname;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000743 while ((newtag != NULL) && (ctxt->name != NULL) &&
744 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000745#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000746 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000747#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000748 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000749 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000750 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000751 if (oldname != NULL) {
752#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000753 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000754#endif
755 xmlFree(oldname);
756 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000757 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000758 if (newtag == NULL) {
759 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
760 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
761 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
762 }
763 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000764 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
765 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
766 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
Daniel Veillard365e13b2000-07-02 07:56:37 +0000767#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000768 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
Daniel Veillard365e13b2000-07-02 07:56:37 +0000769#endif
770 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
771 ctxt->sax->endElement(ctxt->userData, ctxt->name);
772 oldname = htmlnamePop(ctxt);
773 if (oldname != NULL) {
774#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000775 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
Daniel Veillard365e13b2000-07-02 07:56:37 +0000776#endif
777 xmlFree(oldname);
778 }
779 }
780
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000781}
782
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000783/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000784 * htmlAutoCloseTag:
785 * @doc: the HTML document
786 * @name: The tag name
787 * @elem: the HTML element
788 *
789 * The HTmL DtD allows a tag to implicitely close other tags.
790 * The list is kept in htmlStartClose array. This function checks
791 * if the element or one of it's children would autoclose the
792 * given tag.
793 *
794 * Returns 1 if autoclose, 0 otherwise
795 */
796int
797htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
798 htmlNodePtr child;
799
800 if (elem == NULL) return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000801 if (xmlStrEqual(name, elem->name)) return(0);
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000802 if (htmlCheckAutoClose(elem->name, name)) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000803 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000804 while (child != NULL) {
805 if (htmlAutoCloseTag(doc, name, child)) return(1);
806 child = child->next;
807 }
808 return(0);
809}
810
811/**
812 * htmlIsAutoClosed:
813 * @doc: the HTML document
814 * @elem: the HTML element
815 *
816 * The HTmL DtD allows a tag to implicitely close other tags.
817 * The list is kept in htmlStartClose array. This function checks
818 * if a tag is autoclosed by one of it's child
819 *
820 * Returns 1 if autoclosed, 0 otherwise
821 */
822int
823htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
824 htmlNodePtr child;
825
826 if (elem == NULL) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000827 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000828 while (child != NULL) {
829 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
830 child = child->next;
831 }
832 return(0);
833}
834
835/**
Daniel Veillardbe803962000-06-28 23:40:59 +0000836 * htmlCheckImplied:
837 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000838 * @newtag: The new tag name
Daniel Veillardbe803962000-06-28 23:40:59 +0000839 *
840 * The HTmL DtD allows a tag to exists only implicitely
841 * called when a new tag has been detected and generates the
842 * appropriates implicit tags if missing
843 */
844void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000845htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000846 if (xmlStrEqual(newtag, BAD_CAST"html"))
Daniel Veillardbe803962000-06-28 23:40:59 +0000847 return;
848 if (ctxt->nameNr <= 0) {
849#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000850 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
Daniel Veillardbe803962000-06-28 23:40:59 +0000851#endif
852 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
853 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
854 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
855 }
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000856 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
Daniel Veillardbe803962000-06-28 23:40:59 +0000857 return;
Daniel Veillardf62ceff2000-11-24 23:36:01 +0000858 if ((ctxt->nameNr <= 1) &&
859 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
860 (xmlStrEqual(newtag, BAD_CAST"style")) ||
861 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
862 (xmlStrEqual(newtag, BAD_CAST"link")) ||
863 (xmlStrEqual(newtag, BAD_CAST"title")) ||
864 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillardbe803962000-06-28 23:40:59 +0000865 /*
866 * dropped OBJECT ... i you put it first BODY will be
867 * assumed !
868 */
869#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000870 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
Daniel Veillardbe803962000-06-28 23:40:59 +0000871#endif
872 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
873 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
874 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Daniel Veillardf62ceff2000-11-24 23:36:01 +0000875 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
876 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
877 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
878 int i;
879 for (i = 0;i < ctxt->nameNr;i++) {
880 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
881 return;
882 }
883 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
884 return;
885 }
Daniel Veillardbe803962000-06-28 23:40:59 +0000886 }
Daniel Veillardf62ceff2000-11-24 23:36:01 +0000887
888#ifdef DEBUG
889 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
890#endif
891 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
892 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
893 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
Daniel Veillardbe803962000-06-28 23:40:59 +0000894 }
895}
896
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000897/**
898 * htmlCheckParagraph
899 * @ctxt: an HTML parser context
900 *
901 * Check whether a p element need to be implied before inserting
902 * characters in the current element.
903 *
904 * Returns 1 if a paragraph has been inserted, 0 if not and -1
905 * in case of error.
906 */
907
908int
909htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
910 const xmlChar *tag;
911 int i;
912
913 if (ctxt == NULL)
914 return(-1);
915 tag = ctxt->name;
916 if (tag == NULL) {
917 htmlAutoClose(ctxt, BAD_CAST"p");
918 htmlCheckImplied(ctxt, BAD_CAST"p");
919 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
920 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
921 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
922 return(1);
923 }
924 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +0000925 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000926#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +0000927 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000928#endif
929 htmlAutoClose(ctxt, BAD_CAST"p");
930 htmlCheckImplied(ctxt, BAD_CAST"p");
931 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
932 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
933 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
934 return(1);
935 }
936 }
937 return(0);
938}
939
Daniel Veillard47e12f22000-10-15 14:24:25 +0000940/**
941 * htmlIsScriptAttribute:
942 * @name: an attribute name
943 *
944 * Check if an attribute is of content type Script
945 *
946 * Returns 1 is the attribute is a script 0 otherwise
947 */
948int
949htmlIsScriptAttribute(const xmlChar *name) {
950 int i;
951
952 if (name == NULL)
953 return(0);
954 /*
955 * all script attributes start with 'on'
956 */
957 if ((name[0] != 'o') || (name[1] != 'n'))
958 return(0);
959 for (i = 0;
960 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
961 i++) {
962 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
963 return(1);
964 }
965 return(0);
966}
967
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000968/************************************************************************
969 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000970 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000971 * *
972 ************************************************************************/
973
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000974
975htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000976/*
Daniel Veillard47f3f312000-08-27 22:40:15 +0000977 * the 4 absolute ones, plus apostrophe.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000978 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000979{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
980{ 38, "amp", "ampersand, U+0026 ISOnum" },
Daniel Veillard47f3f312000-08-27 22:40:15 +0000981{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000982{ 60, "lt", "less-than sign, U+003C ISOnum" },
983{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000984
985/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000986 * A bunch still in the 128-255 range
987 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000988 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000989{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
990{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
991{ 162, "cent", "cent sign, U+00A2 ISOnum" },
992{ 163, "pound","pound sign, U+00A3 ISOnum" },
993{ 164, "curren","currency sign, U+00A4 ISOnum" },
994{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
995{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
996{ 167, "sect", "section sign, U+00A7 ISOnum" },
997{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
998{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
999{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1000{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1001{ 172, "not", "not sign, U+00AC ISOnum" },
1002{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1003{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1004{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1005{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1006{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1007{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1008{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1009{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1010{ 181, "micro","micro sign, U+00B5 ISOnum" },
1011{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001012{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001013{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1014{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1015{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001016{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001017{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1018{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1019{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1020{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1021{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1022{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1023{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1024{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1025{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1026{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1027{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1028{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1029{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1030{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1031{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1032{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1033{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1034{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1035{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1036{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1037{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1038{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1039{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1040{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1041{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1042{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1043{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1044{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001045{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001046{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1047{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1048{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1049{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1050{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1051{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1052{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1053{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1054{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1055{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1056{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1057{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1058{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1059{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1060{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1061{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1062{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1063{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1064{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1065{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1066{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1067{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1068{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1069{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1070{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1071{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1072{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1073{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1074{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1075{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1076{ 247, "divide","division sign, U+00F7 ISOnum" },
1077{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1078{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1079{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1080{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1081{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1082{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1083{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1084{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001085
Daniel Veillard47f3f312000-08-27 22:40:15 +00001086{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1087{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1088{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1089{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1090{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1091
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001092/*
1093 * Anything below should really be kept as entities references
1094 */
1095{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001096
Daniel Veillard47f3f312000-08-27 22:40:15 +00001097{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1098{ 732, "tilde","small tilde, U+02DC ISOdia" },
1099
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001100{ 913, "Alpha","greek capital letter alpha, U+0391" },
1101{ 914, "Beta", "greek capital letter beta, U+0392" },
1102{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1103{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1104{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1105{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1106{ 919, "Eta", "greek capital letter eta, U+0397" },
1107{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1108{ 921, "Iota", "greek capital letter iota, U+0399" },
1109{ 922, "Kappa","greek capital letter kappa, U+039A" },
1110{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1111{ 924, "Mu", "greek capital letter mu, U+039C" },
1112{ 925, "Nu", "greek capital letter nu, U+039D" },
1113{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1114{ 927, "Omicron","greek capital letter omicron, U+039F" },
1115{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1116{ 929, "Rho", "greek capital letter rho, U+03A1" },
1117{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1118{ 932, "Tau", "greek capital letter tau, U+03A4" },
1119{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1120{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1121{ 935, "Chi", "greek capital letter chi, U+03A7" },
1122{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1123{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001124
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001125{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1126{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1127{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1128{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1129{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1130{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1131{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1132{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1133{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1134{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1135{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1136{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1137{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1138{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1139{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1140{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1141{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1142{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1143{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1144{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1145{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1146{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1147{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1148{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1149{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1150{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1151{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1152{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001153
Daniel Veillard47f3f312000-08-27 22:40:15 +00001154{ 8194, "ensp", "en space, U+2002 ISOpub" },
1155{ 8195, "emsp", "em space, U+2003 ISOpub" },
1156{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1157{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1158{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1159{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1160{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1161{ 8211, "ndash","en dash, U+2013 ISOpub" },
1162{ 8212, "mdash","em dash, U+2014 ISOpub" },
1163{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1164{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1165{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1166{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1167{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1168{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1169{ 8224, "dagger","dagger, U+2020 ISOpub" },
1170{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1171
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001172{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1173{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001174
1175{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1176
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001177{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1178{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001179
1180{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1181{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1182
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001183{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1184{ 8260, "frasl","fraction slash, U+2044 NEW" },
1185
Daniel Veillard47f3f312000-08-27 22:40:15 +00001186{ 8364, "euro", "euro sign, U+20AC NEW" },
1187
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001188{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
Daniel Veillard47f3f312000-08-27 22:40:15 +00001189{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001190{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1191{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1192{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1193{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1194{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1195{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1196{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1197{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1198{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1199{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1200{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1201{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1202{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1203{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1204
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001205{ 8704, "forall","for all, U+2200 ISOtech" },
1206{ 8706, "part", "partial differential, U+2202 ISOtech" },
1207{ 8707, "exist","there exists, U+2203 ISOtech" },
1208{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1209{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1210{ 8712, "isin", "element of, U+2208 ISOtech" },
1211{ 8713, "notin","not an element of, U+2209 ISOtech" },
1212{ 8715, "ni", "contains as member, U+220B ISOtech" },
1213{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1214{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1215{ 8722, "minus","minus sign, U+2212 ISOtech" },
1216{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1217{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1218{ 8733, "prop", "proportional to, U+221D ISOtech" },
1219{ 8734, "infin","infinity, U+221E ISOtech" },
1220{ 8736, "ang", "angle, U+2220 ISOamso" },
1221{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1222{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1223{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1224{ 8746, "cup", "union = cup, U+222A ISOtech" },
1225{ 8747, "int", "integral, U+222B ISOtech" },
1226{ 8756, "there4","therefore, U+2234 ISOtech" },
1227{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1228{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1229{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1230{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1231{ 8801, "equiv","identical to, U+2261 ISOtech" },
1232{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1233{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1234{ 8834, "sub", "subset of, U+2282 ISOtech" },
1235{ 8835, "sup", "superset of, U+2283 ISOtech" },
1236{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1237{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1238{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1239{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1240{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1241{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1242{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1243{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1244{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1245{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1246{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1247{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1248{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1249{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1250
1251{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1252{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1253{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1254{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1255
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001256};
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001257
1258/************************************************************************
1259 * *
1260 * Commodity functions to handle entities *
1261 * *
1262 ************************************************************************/
1263
/*
 * Macro used to grow the current buffer: doubles buffer##_size and
 * reallocates buffer accordingly. On allocation failure an error is
 * printed with perror() and the enclosing function returns NULL, so the
 * macro can only be used in functions returning a pointer.
 *
 * NOTE(review): on failure the result of xmlRealloc() overwrites the only
 * visible reference to the old block, which looks like a leak - confirm
 * against the callers before changing.
 */
#define growBuffer(buffer) {						\
    buffer##_size *= 2;							\
    buffer = (xmlChar *)						\
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));		\
    if (buffer == NULL) {						\
        perror("realloc failed");					\
        return(NULL);							\
    }									\
}
1275
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001276/**
1277 * htmlEntityLookup:
1278 * @name: the entity name
1279 *
1280 * Lookup the given entity in EntitiesTable
1281 *
1282 * TODO: the linear scan is really ugly, an hash table is really needed.
1283 *
1284 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1285 */
1286htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001287htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001288 int i;
1289
1290 for (i = 0;i < (sizeof(html40EntitiesTable)/
1291 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001292 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001293#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001294 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001295#endif
1296 return(&html40EntitiesTable[i]);
1297 }
1298 }
1299 return(NULL);
1300}
1301
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001302/**
Daniel Veillard47f3f312000-08-27 22:40:15 +00001303 * htmlEntityValueLookup:
1304 * @value: the entity's unicode value
1305 *
1306 * Lookup the given entity in EntitiesTable
1307 *
1308 * TODO: the linear scan is really ugly, an hash table is really needed.
1309 *
1310 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1311 */
1312htmlEntityDescPtr
1313htmlEntityValueLookup(int value) {
1314 int i;
1315#ifdef DEBUG
1316 int lv = 0;
1317#endif
1318
1319 for (i = 0;i < (sizeof(html40EntitiesTable)/
1320 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard4b0755c2000-09-25 14:26:28 +00001321 if ((unsigned int) html40EntitiesTable[i].value >= value) {
1322 if ((unsigned int) html40EntitiesTable[i].value > value)
Daniel Veillard47f3f312000-08-27 22:40:15 +00001323 break;
1324#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001325 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
Daniel Veillard47f3f312000-08-27 22:40:15 +00001326#endif
1327 return(&html40EntitiesTable[i]);
1328 }
1329#ifdef DEBUG
1330 if (lv > html40EntitiesTable[i].value) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001331 xmlGenericError(xmlGenericErrorContext,
1332 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
Daniel Veillard47f3f312000-08-27 22:40:15 +00001333 lv, html40EntitiesTable[i].value);
1334 }
1335 lv = html40EntitiesTable[i].value;
1336#endif
1337 }
1338 return(NULL);
1339}
1340
1341/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001342 * UTF8ToHtml:
1343 * @out: a pointer to an array of bytes to store the result
1344 * @outlen: the length of @out
1345 * @in: a pointer to an array of UTF-8 chars
1346 * @inlen: the length of @in
1347 *
1348 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1349 * plus HTML entities block of chars out.
1350 *
1351 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1352 * The value of @inlen after return is the number of octets consumed
1353 * as the return value is positive, else unpredictiable.
1354 * The value of @outlen after return is the number of octets consumed.
1355 */
1356int
1357UTF8ToHtml(unsigned char* out, int *outlen,
1358 const unsigned char* in, int *inlen) {
1359 const unsigned char* processed = in;
1360 const unsigned char* outend;
1361 const unsigned char* outstart = out;
1362 const unsigned char* instart = in;
1363 const unsigned char* inend;
1364 unsigned int c, d;
1365 int trailing;
1366
1367 if (in == NULL) {
1368 /*
1369 * initialization nothing to do
1370 */
1371 *outlen = 0;
1372 *inlen = 0;
1373 return(0);
1374 }
1375 inend = in + (*inlen);
1376 outend = out + (*outlen);
1377 while (in < inend) {
1378 d = *in++;
1379 if (d < 0x80) { c= d; trailing= 0; }
1380 else if (d < 0xC0) {
1381 /* trailing byte in leading position */
1382 *outlen = out - outstart;
1383 *inlen = processed - instart;
1384 return(-2);
1385 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1386 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1387 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1388 else {
1389 /* no chance for this in Ascii */
1390 *outlen = out - outstart;
1391 *inlen = processed - instart;
1392 return(-2);
1393 }
1394
1395 if (inend - in < trailing) {
1396 break;
1397 }
1398
1399 for ( ; trailing; trailing--) {
1400 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1401 break;
1402 c <<= 6;
1403 c |= d & 0x3F;
1404 }
1405
1406 /* assertion: c is a single UTF-4 value */
1407 if (c < 0x80) {
Daniel Veillarde010c172000-08-28 10:04:51 +00001408 if (out + 1 >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001409 break;
1410 *out++ = c;
1411 } else {
Daniel Veillard47f3f312000-08-27 22:40:15 +00001412 int len;
1413 htmlEntityDescPtr ent;
1414
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001415 /*
1416 * Try to lookup a predefined HTML entity for it
1417 */
1418
Daniel Veillard47f3f312000-08-27 22:40:15 +00001419 ent = htmlEntityValueLookup(c);
1420 if (ent == NULL) {
1421 /* no chance for this in Ascii */
1422 *outlen = out - outstart;
1423 *inlen = processed - instart;
1424 return(-2);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001425 }
Daniel Veillard47f3f312000-08-27 22:40:15 +00001426 len = strlen(ent->name);
Daniel Veillarde010c172000-08-28 10:04:51 +00001427 if (out + 2 + len >= outend)
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001428 break;
1429 *out++ = '&';
Daniel Veillard47f3f312000-08-27 22:40:15 +00001430 memcpy(out, ent->name, len);
1431 out += len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001432 *out++ = ';';
1433 }
1434 processed = in;
1435 }
1436 *outlen = out - outstart;
1437 *inlen = processed - instart;
1438 return(0);
1439}
1440
Daniel Veillarde010c172000-08-28 10:04:51 +00001441/**
1442 * htmlEncodeEntities:
1443 * @out: a pointer to an array of bytes to store the result
1444 * @outlen: the length of @out
1445 * @in: a pointer to an array of UTF-8 chars
1446 * @inlen: the length of @in
1447 * @quoteChar: the quote character to escape (' or ") or zero.
1448 *
1449 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1450 * plus HTML entities block of chars out.
1451 *
1452 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1453 * The value of @inlen after return is the number of octets consumed
1454 * as the return value is positive, else unpredictiable.
1455 * The value of @outlen after return is the number of octets consumed.
1456 */
1457int
1458htmlEncodeEntities(unsigned char* out, int *outlen,
1459 const unsigned char* in, int *inlen, int quoteChar) {
1460 const unsigned char* processed = in;
1461 const unsigned char* outend = out + (*outlen);
1462 const unsigned char* outstart = out;
1463 const unsigned char* instart = in;
1464 const unsigned char* inend = in + (*inlen);
1465 unsigned int c, d;
1466 int trailing;
1467
1468 while (in < inend) {
1469 d = *in++;
1470 if (d < 0x80) { c= d; trailing= 0; }
1471 else if (d < 0xC0) {
1472 /* trailing byte in leading position */
1473 *outlen = out - outstart;
1474 *inlen = processed - instart;
1475 return(-2);
1476 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1477 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1478 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1479 else {
1480 /* no chance for this in Ascii */
1481 *outlen = out - outstart;
1482 *inlen = processed - instart;
1483 return(-2);
1484 }
1485
1486 if (inend - in < trailing)
1487 break;
1488
1489 while (trailing--) {
1490 if (((d= *in++) & 0xC0) != 0x80) {
1491 *outlen = out - outstart;
1492 *inlen = processed - instart;
1493 return(-2);
1494 }
1495 c <<= 6;
1496 c |= d & 0x3F;
1497 }
1498
1499 /* assertion: c is a single UTF-4 value */
1500 if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1501 if (out >= outend)
1502 break;
1503 *out++ = c;
1504 } else {
1505 htmlEntityDescPtr ent;
1506 const char *cp;
1507 char nbuf[16];
1508 int len;
1509
1510 /*
1511 * Try to lookup a predefined HTML entity for it
1512 */
1513 ent = htmlEntityValueLookup(c);
1514 if (ent == NULL) {
1515 sprintf(nbuf, "#%u", c);
1516 cp = nbuf;
1517 }
1518 else
1519 cp = ent->name;
1520 len = strlen(cp);
1521 if (out + 2 + len > outend)
1522 break;
1523 *out++ = '&';
1524 memcpy(out, cp, len);
1525 out += len;
1526 *out++ = ';';
1527 }
1528 processed = in;
1529 }
1530 *outlen = out - outstart;
1531 *inlen = processed - instart;
1532 return(0);
1533}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001534
1535/**
1536 * htmlDecodeEntities:
1537 * @ctxt: the parser context
1538 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001539 * @end: an end marker xmlChar, 0 if none
1540 * @end2: an end marker xmlChar, 0 if none
1541 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001542 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001543 * Subtitute the HTML entities by their value
1544 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001545 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001546 *
1547 * Returns A newly allocated string with the substitution done. The caller
1548 * must deallocate it !
1549 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001550xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001551htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001552 xmlChar end, xmlChar end2, xmlChar end3) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001553 xmlChar *name = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001554 xmlChar *buffer = NULL;
1555 unsigned int buffer_size = 0;
1556 unsigned int nbchars = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001557 htmlEntityDescPtr ent;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001558 unsigned int max = (unsigned int) len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001559 int c,l;
1560
1561 if (ctxt->depth > 40) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00001562 ctxt->errNo = XML_ERR_ENTITY_LOOP;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001563 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1564 ctxt->sax->error(ctxt->userData,
1565 "Detected entity reference loop\n");
1566 ctxt->wellFormed = 0;
1567 ctxt->disableSAX = 1;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001568 return(NULL);
1569 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001570
1571 /*
1572 * allocate a translation buffer.
1573 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001574 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001575 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001576 if (buffer == NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001577 perror("xmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001578 return(NULL);
1579 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001580
1581 /*
1582 * Ok loop until we reach one of the ending char or a size limit.
1583 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001584 c = CUR_CHAR(l);
1585 while ((nbchars < max) && (c != end) &&
1586 (c != end2) && (c != end3)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001587
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001588 if (c == 0) break;
1589 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1590 int val = htmlParseCharRef(ctxt);
1591 COPY_BUF(0,buffer,nbchars,val);
1592 NEXTL(l);
1593 } else if ((c == '&') && (ctxt->token != '&')) {
1594 ent = htmlParseEntityRef(ctxt, &name);
1595 if (name != NULL) {
1596 if (ent != NULL) {
1597 int val = ent->value;
1598 COPY_BUF(0,buffer,nbchars,val);
1599 NEXTL(l);
1600 } else {
1601 const xmlChar *cur = name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001602
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001603 buffer[nbchars++] = '&';
1604 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1605 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001606 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001607 while (*cur != 0) {
1608 buffer[nbchars++] = *cur++;
1609 }
1610 buffer[nbchars++] = ';';
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001611 }
1612 }
1613 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001614 COPY_BUF(l,buffer,nbchars,c);
1615 NEXTL(l);
1616 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001617 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001618 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001619 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001620 c = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001621 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001622 buffer[nbchars++] = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001623 return(buffer);
1624}
1625
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001626/************************************************************************
1627 * *
1628 * Commodity functions to handle streams *
1629 * *
1630 ************************************************************************/
1631
1632/**
1633 * htmlFreeInputStream:
1634 * @input: an htmlParserInputPtr
1635 *
1636 * Free up an input stream.
1637 */
1638void
1639htmlFreeInputStream(htmlParserInputPtr input) {
1640 if (input == NULL) return;
1641
1642 if (input->filename != NULL) xmlFree((char *) input->filename);
1643 if (input->directory != NULL) xmlFree((char *) input->directory);
1644 if ((input->free != NULL) && (input->base != NULL))
1645 input->free((xmlChar *) input->base);
1646 if (input->buf != NULL)
1647 xmlFreeParserInputBuffer(input->buf);
1648 memset(input, -1, sizeof(htmlParserInput));
1649 xmlFree(input);
1650}
1651
1652/**
1653 * htmlNewInputStream:
1654 * @ctxt: an HTML parser context
1655 *
1656 * Create a new input stream structure
1657 * Returns the new input stream or NULL
1658 */
1659htmlParserInputPtr
1660htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1661 htmlParserInputPtr input;
1662
1663 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1664 if (input == NULL) {
1665 ctxt->errNo = XML_ERR_NO_MEMORY;
1666 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1667 ctxt->sax->error(ctxt->userData,
1668 "malloc: couldn't allocate a new input stream\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001669 return(NULL);
1670 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001671 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001672 input->filename = NULL;
1673 input->directory = NULL;
1674 input->base = NULL;
1675 input->cur = NULL;
1676 input->buf = NULL;
1677 input->line = 1;
1678 input->col = 1;
1679 input->buf = NULL;
1680 input->free = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001681 input->version = NULL;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001682 input->consumed = 0;
1683 input->length = 0;
1684 return(input);
1685}
1686
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001687
1688/************************************************************************
1689 * *
1690 * Commodity functions, cleanup needed ? *
1691 * *
1692 ************************************************************************/
1693
1694/**
1695 * areBlanks:
1696 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001697 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001698 * @len: the size of @str
1699 *
1700 * Is this a sequence of blank chars that one can ignore ?
1701 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001702 * Returns 1 if ignorable 0 otherwise.
1703 */
1704
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001705static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001706 int i;
1707 xmlNodePtr lastChild;
1708
1709 for (i = 0;i < len;i++)
1710 if (!(IS_BLANK(str[i]))) return(0);
1711
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001712 if (CUR == 0) return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001713 if (CUR != '<') return(0);
Daniel Veillarde010c172000-08-28 10:04:51 +00001714 if (ctxt->name == NULL)
1715 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001716 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
Daniel Veillard4948eb42000-08-29 09:41:15 +00001717 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001718 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
Daniel Veillarde010c172000-08-28 10:04:51 +00001719 return(1);
Daniel Veillard8b5dd832000-10-01 20:28:44 +00001720 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
Daniel Veillarde010c172000-08-28 10:04:51 +00001721 return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001722 if (ctxt->node == NULL) return(0);
1723 lastChild = xmlGetLastChild(ctxt->node);
1724 if (lastChild == NULL) {
1725 if (ctxt->node->content != NULL) return(0);
Daniel Veillardc4f4f0b2000-10-29 17:46:30 +00001726 } else if (xmlNodeIsText(lastChild)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001727 return(0);
Daniel Veillardc4f4f0b2000-10-29 17:46:30 +00001728 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1729 return(0);
1730 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1731 return(0);
1732 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1733 return(0);
1734 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001735 return(1);
1736}
1737
1738/**
1739 * htmlHandleEntity:
1740 * @ctxt: an HTML parser context
1741 * @entity: an XML entity pointer.
1742 *
1743 * Default handling of an HTML entity, call the parser with the
1744 * substitution string
1745 */
1746
1747void
1748htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1749 int len;
1750
1751 if (entity->content == NULL) {
1752 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1753 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1754 entity->name);
1755 ctxt->wellFormed = 0;
1756 return;
1757 }
1758 len = xmlStrlen(entity->content);
1759
1760 /*
1761 * Just handle the content as a set of chars.
1762 */
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001763 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001764 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1765 ctxt->sax->characters(ctxt->userData, entity->content, len);
1766
1767}
1768
1769/**
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001770 * htmlNewDocNoDtD:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001771 * @URI: URI for the dtd, or NULL
1772 * @ExternalID: the external ID of the DTD, or NULL
1773 *
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001774 * Returns a new document, do not intialize the DTD if not provided
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001775 */
1776htmlDocPtr
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001777htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001778 xmlDocPtr cur;
1779
1780 /*
1781 * Allocate a new document and fill the fields.
1782 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001783 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001784 if (cur == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001785 xmlGenericError(xmlGenericErrorContext,
1786 "xmlNewDoc : malloc failed\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001787 return(NULL);
1788 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001789 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001790
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001791 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001792 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001793 cur->intSubset = NULL;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001794 if ((ExternalID != NULL) ||
1795 (URI != NULL))
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001796 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe803962000-06-28 23:40:59 +00001797 cur->doc = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001798 cur->name = NULL;
Daniel Veillardcf461992000-03-14 18:30:20 +00001799 cur->children = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001800 cur->extSubset = NULL;
1801 cur->oldNs = NULL;
1802 cur->encoding = NULL;
1803 cur->standalone = 1;
1804 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001805 cur->ids = NULL;
1806 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001807#ifndef XML_WITHOUT_CORBA
1808 cur->_private = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001809#endif
1810 return(cur);
1811}
1812
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001813/**
1814 * htmlNewDoc:
1815 * @URI: URI for the dtd, or NULL
1816 * @ExternalID: the external ID of the DTD, or NULL
1817 *
1818 * Returns a new document
1819 */
1820htmlDocPtr
1821htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1822 if ((URI == NULL) && (ExternalID == NULL))
1823 return(htmlNewDocNoDtD(
1824 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1825 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1826
1827 return(htmlNewDocNoDtD(URI, ExternalID));
1828}
1829
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001830
1831/************************************************************************
1832 * *
1833 * The parser itself *
1834 * Relates to http://www.w3.org/TR/html40 *
1835 * *
1836 ************************************************************************/
1837
1838/************************************************************************
1839 * *
1840 * The parser itself *
1841 * *
1842 ************************************************************************/
1843
1844/**
1845 * htmlParseHTMLName:
1846 * @ctxt: an HTML parser context
1847 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001848 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001849 * since HTML names are not case-sensitive.
1850 *
1851 * Returns the Tag Name parsed or NULL
1852 */
1853
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001854xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001855htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001856 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001857 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001858 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001859
1860 if (!IS_LETTER(CUR) && (CUR != '_') &&
1861 (CUR != ':')) return(NULL);
1862
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001863 while ((i < HTML_PARSER_BUFFER_SIZE) &&
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001864 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
Daniel Veillarde8282ed2000-10-10 23:01:31 +00001865 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001866 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001867 else loc[i] = CUR;
1868 i++;
1869
1870 NEXT;
1871 }
1872
1873 ret = xmlStrndup(loc, i);
1874
1875 return(ret);
1876}
1877
1878/**
1879 * htmlParseName:
1880 * @ctxt: an HTML parser context
1881 *
1882 * parse an HTML name, this routine is case sensistive.
1883 *
1884 * Returns the Name parsed or NULL
1885 */
1886
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001887xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001888htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001889 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001890 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001891
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001892 GROW;
1893 if (!IS_LETTER(CUR) && (CUR != '_')) {
1894 return(NULL);
1895 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001896
1897 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1898 (CUR == '.') || (CUR == '-') ||
1899 (CUR == '_') || (CUR == ':') ||
1900 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001901 (IS_EXTENDER(CUR))) {
1902 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001903 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001904 if (len >= HTML_MAX_NAMELEN) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001905 xmlGenericError(xmlGenericErrorContext,
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001906 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1907 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1908 (CUR == '.') || (CUR == '-') ||
1909 (CUR == '_') || (CUR == ':') ||
1910 (IS_COMBINING(CUR)) ||
1911 (IS_EXTENDER(CUR)))
1912 NEXT;
1913 break;
1914 }
1915 }
1916 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001917}
1918
1919/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001920 * htmlParseHTMLAttribute:
1921 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001922 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001923 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001924 * parse an HTML attribute value till the stop (quote), if
1925 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001926 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001927 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001928 */
1929
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001930xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001931htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001932 xmlChar *buffer = NULL;
1933 int buffer_size = 0;
1934 xmlChar *out = NULL;
1935 xmlChar *name = NULL;
1936
1937 xmlChar *cur = NULL;
1938 htmlEntityDescPtr ent;
1939
1940 /*
1941 * allocate a translation buffer.
1942 */
Daniel Veillard7eda8452000-10-14 23:38:43 +00001943 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard71b656e2000-01-05 14:46:17 +00001944 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1945 if (buffer == NULL) {
1946 perror("htmlParseHTMLAttribute: malloc failed");
1947 return(NULL);
1948 }
1949 out = buffer;
1950
1951 /*
1952 * Ok loop until we reach one of the ending chars
1953 */
1954 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1955 if ((stop == 0) && (IS_BLANK(CUR))) break;
1956 if (CUR == '&') {
1957 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001958 unsigned int c;
1959 int bits;
1960
1961 c = htmlParseCharRef(ctxt);
1962 if (c < 0x80)
1963 { *out++ = c; bits= -6; }
1964 else if (c < 0x800)
1965 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1966 else if (c < 0x10000)
1967 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1968 else
1969 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1970
1971 for ( ; bits >= 0; bits-= 6) {
1972 *out++ = ((c >> bits) & 0x3F) | 0x80;
1973 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001974 } else {
1975 ent = htmlParseEntityRef(ctxt, &name);
1976 if (name == NULL) {
1977 *out++ = '&';
1978 if (out - buffer > buffer_size - 100) {
1979 int index = out - buffer;
1980
1981 growBuffer(buffer);
1982 out = &buffer[index];
1983 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001984 } else if (ent == NULL) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001985 *out++ = '&';
1986 cur = name;
1987 while (*cur != 0) {
1988 if (out - buffer > buffer_size - 100) {
1989 int index = out - buffer;
1990
1991 growBuffer(buffer);
1992 out = &buffer[index];
1993 }
1994 *out++ = *cur++;
1995 }
1996 xmlFree(name);
1997 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001998 unsigned int c;
1999 int bits;
2000
Daniel Veillard71b656e2000-01-05 14:46:17 +00002001 if (out - buffer > buffer_size - 100) {
2002 int index = out - buffer;
2003
2004 growBuffer(buffer);
2005 out = &buffer[index];
2006 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002007 c = (xmlChar)ent->value;
2008 if (c < 0x80)
2009 { *out++ = c; bits= -6; }
2010 else if (c < 0x800)
2011 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2012 else if (c < 0x10000)
2013 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2014 else
2015 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2016
2017 for ( ; bits >= 0; bits-= 6) {
2018 *out++ = ((c >> bits) & 0x3F) | 0x80;
2019 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00002020 xmlFree(name);
2021 }
2022 }
2023 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002024 unsigned int c;
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00002025 int bits, l;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002026
Daniel Veillard71b656e2000-01-05 14:46:17 +00002027 if (out - buffer > buffer_size - 100) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002028 int index = out - buffer;
2029
2030 growBuffer(buffer);
2031 out = &buffer[index];
2032 }
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00002033 c = CUR_CHAR(l);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002034 if (c < 0x80)
2035 { *out++ = c; bits= -6; }
2036 else if (c < 0x800)
2037 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2038 else if (c < 0x10000)
2039 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2040 else
2041 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2042
2043 for ( ; bits >= 0; bits-= 6) {
2044 *out++ = ((c >> bits) & 0x3F) | 0x80;
Daniel Veillard71b656e2000-01-05 14:46:17 +00002045 }
2046 NEXT;
2047 }
2048 }
2049 *out++ = 0;
2050 return(buffer);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002051}
2052
2053/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002054 * htmlParseNmtoken:
2055 * @ctxt: an HTML parser context
2056 *
2057 * parse an HTML Nmtoken.
2058 *
2059 * Returns the Nmtoken parsed or NULL
2060 */
2061
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002062xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002063htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002064 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002065 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002066
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002067 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002068 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2069 (CUR == '.') || (CUR == '-') ||
2070 (CUR == '_') || (CUR == ':') ||
2071 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002072 (IS_EXTENDER(CUR))) {
2073 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002074 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002075 if (len >= HTML_MAX_NAMELEN) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002076 xmlGenericError(xmlGenericErrorContext,
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002077 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2078 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2079 (CUR == '.') || (CUR == '-') ||
2080 (CUR == '_') || (CUR == ':') ||
2081 (IS_COMBINING(CUR)) ||
2082 (IS_EXTENDER(CUR)))
2083 NEXT;
2084 break;
2085 }
2086 }
2087 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002088}
2089
2090/**
2091 * htmlParseEntityRef:
2092 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002093 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002094 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002095 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002096 *
2097 * [68] EntityRef ::= '&' Name ';'
2098 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002099 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2100 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002101 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002102htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002103htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2104 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002105 htmlEntityDescPtr ent = NULL;
2106 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002107
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002108 if (CUR == '&') {
2109 NEXT;
2110 name = htmlParseName(ctxt);
2111 if (name == NULL) {
2112 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2113 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2114 ctxt->wellFormed = 0;
2115 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002116 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002117 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002118 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002119
2120 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002121 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002122 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002123 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002124 if (ent != NULL) /* OK that's ugly !!! */
2125 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002126 } else {
2127 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2128 ctxt->sax->error(ctxt->userData,
2129 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00002130 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002131 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002132 }
2133 }
2134 return(ent);
2135}
2136
2137/**
2138 * htmlParseAttValue:
2139 * @ctxt: an HTML parser context
2140 *
2141 * parse a value for an attribute
2142 * Note: the parser won't do substitution of entities here, this
2143 * will be handled later in xmlStringGetNodeList, unless it was
2144 * asked for ctxt->replaceEntities != 0
2145 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002146 * Returns the AttValue parsed or NULL.
2147 */
2148
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002149xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002150htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002151 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002152
2153 if (CUR == '"') {
2154 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002155 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002156 if (CUR != '"') {
2157 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2158 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2159 ctxt->wellFormed = 0;
2160 } else
2161 NEXT;
2162 } else if (CUR == '\'') {
2163 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002164 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002165 if (CUR != '\'') {
2166 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2167 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2168 ctxt->wellFormed = 0;
2169 } else
2170 NEXT;
2171 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002172 /*
2173 * That's an HTMLism, the attribute value may not be quoted
2174 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002175 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002176 if (ret == NULL) {
2177 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2178 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2179 ctxt->wellFormed = 0;
2180 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002181 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002182 return(ret);
2183}
2184
2185/**
2186 * htmlParseSystemLiteral:
2187 * @ctxt: an HTML parser context
2188 *
2189 * parse an HTML Literal
2190 *
2191 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2192 *
2193 * Returns the SystemLiteral parsed or NULL
2194 */
2195
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002196xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002197htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002198 const xmlChar *q;
2199 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002200
2201 if (CUR == '"') {
2202 NEXT;
2203 q = CUR_PTR;
2204 while ((IS_CHAR(CUR)) && (CUR != '"'))
2205 NEXT;
2206 if (!IS_CHAR(CUR)) {
2207 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2208 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2209 ctxt->wellFormed = 0;
2210 } else {
2211 ret = xmlStrndup(q, CUR_PTR - q);
2212 NEXT;
2213 }
2214 } else if (CUR == '\'') {
2215 NEXT;
2216 q = CUR_PTR;
2217 while ((IS_CHAR(CUR)) && (CUR != '\''))
2218 NEXT;
2219 if (!IS_CHAR(CUR)) {
2220 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2221 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2222 ctxt->wellFormed = 0;
2223 } else {
2224 ret = xmlStrndup(q, CUR_PTR - q);
2225 NEXT;
2226 }
2227 } else {
2228 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardcf461992000-03-14 18:30:20 +00002229 ctxt->sax->error(ctxt->userData,
2230 "SystemLiteral \" or ' expected\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002231 ctxt->wellFormed = 0;
2232 }
2233
2234 return(ret);
2235}
2236
2237/**
2238 * htmlParsePubidLiteral:
2239 * @ctxt: an HTML parser context
2240 *
2241 * parse an HTML public literal
2242 *
2243 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2244 *
2245 * Returns the PubidLiteral parsed or NULL.
2246 */
2247
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002248xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002249htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002250 const xmlChar *q;
2251 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002252 /*
2253 * Name ::= (Letter | '_') (NameChar)*
2254 */
2255 if (CUR == '"') {
2256 NEXT;
2257 q = CUR_PTR;
2258 while (IS_PUBIDCHAR(CUR)) NEXT;
2259 if (CUR != '"') {
2260 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2261 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2262 ctxt->wellFormed = 0;
2263 } else {
2264 ret = xmlStrndup(q, CUR_PTR - q);
2265 NEXT;
2266 }
2267 } else if (CUR == '\'') {
2268 NEXT;
2269 q = CUR_PTR;
2270 while ((IS_LETTER(CUR)) && (CUR != '\''))
2271 NEXT;
2272 if (!IS_LETTER(CUR)) {
2273 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2274 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2275 ctxt->wellFormed = 0;
2276 } else {
2277 ret = xmlStrndup(q, CUR_PTR - q);
2278 NEXT;
2279 }
2280 } else {
2281 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2282 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2283 ctxt->wellFormed = 0;
2284 }
2285
2286 return(ret);
2287}
2288
2289/**
Daniel Veillard7eda8452000-10-14 23:38:43 +00002290 * htmlParseScript:
2291 * @ctxt: an HTML parser context
2292 *
2293 * parse the content of an HTML SCRIPT or STYLE element
2294 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2295 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2296 * http://www.w3.org/TR/html4/types.html#type-script
2297 * http://www.w3.org/TR/html4/types.html#h-6.15
2298 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2299 *
2300 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2301 * element and the value of intrinsic event attributes. User agents must
2302 * not evaluate script data as HTML markup but instead must pass it on as
2303 * data to a script engine.
2304 * NOTES:
2305 * - The content is passed like CDATA
2306 * - the attributes for style and scripting "onXXX" are also described
2307 * as CDATA but SGML allows entities references in attributes so their
2308 * processing is identical as other attributes
2309 */
2310void
2311htmlParseScript(htmlParserCtxtPtr ctxt) {
2312 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2313 int nbchar = 0;
2314 xmlChar cur;
2315
2316 SHRINK;
2317 cur = CUR;
2318 while (IS_CHAR(cur)) {
2319 if ((cur == '<') && (NXT(1) == '/')) {
2320 /*
2321 * One should break here, the specification is clear:
2322 * Authors should therefore escape "</" within the content.
2323 * Escape mechanisms are specific to each scripting or
2324 * style sheet language.
2325 */
2326 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2327 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2328 break; /* while */
2329 }
2330 buf[nbchar++] = cur;
2331 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2332 if (ctxt->sax->cdataBlock!= NULL) {
2333 /*
2334 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2335 */
2336 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2337 }
2338 nbchar = 0;
2339 }
2340 NEXT;
2341 cur = CUR;
2342 }
Daniel Veillarda4964b72000-10-31 18:23:44 +00002343 if (!(IS_CHAR(cur))) {
2344 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2345 ctxt->sax->error(ctxt->userData,
2346 "Invalid char in CDATA 0x%X\n", cur);
2347 ctxt->wellFormed = 0;
2348 NEXT;
2349 }
2350
Daniel Veillard7eda8452000-10-14 23:38:43 +00002351 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2352 if (ctxt->sax->cdataBlock!= NULL) {
2353 /*
2354 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2355 */
2356 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2357 }
2358 }
2359}
2360
2361
2362/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002363 * htmlParseCharData:
2364 * @ctxt: an HTML parser context
2365 * @cdata: int indicating whether we are within a CDATA section
2366 *
2367 * parse a CharData section.
2368 * if we are within a CDATA section ']]>' marks an end of section.
2369 *
2370 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2371 */
2372
2373void
2374htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002375 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2376 int nbchar = 0;
2377 int cur, l;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002378
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002379 SHRINK;
2380 cur = CUR_CHAR(l);
2381 while (((cur != '<') || (ctxt->token == '<')) &&
2382 ((cur != '&') || (ctxt->token == '&')) &&
2383 (IS_CHAR(cur))) {
2384 COPY_BUF(l,buf,nbchar,cur);
2385 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2386 /*
2387 * Ok the segment is to be consumed as chars.
2388 */
2389 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2390 if (areBlanks(ctxt, buf, nbchar)) {
2391 if (ctxt->sax->ignorableWhitespace != NULL)
2392 ctxt->sax->ignorableWhitespace(ctxt->userData,
2393 buf, nbchar);
2394 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002395 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002396 if (ctxt->sax->characters != NULL)
2397 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2398 }
2399 }
2400 nbchar = 0;
2401 }
2402 NEXTL(l);
2403 cur = CUR_CHAR(l);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002404 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002405 if (nbchar != 0) {
2406 /*
2407 * Ok the segment is to be consumed as chars.
2408 */
2409 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2410 if (areBlanks(ctxt, buf, nbchar)) {
2411 if (ctxt->sax->ignorableWhitespace != NULL)
2412 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2413 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002414 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002415 if (ctxt->sax->characters != NULL)
2416 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002417 }
2418 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002419 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002420}
2421
2422/**
2423 * htmlParseExternalID:
2424 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002425 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002426 * @strict: indicate whether we should restrict parsing to only
2427 * production [75], see NOTE below
2428 *
2429 * Parse an External ID or a Public ID
2430 *
2431 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2432 * 'PUBLIC' S PubidLiteral S SystemLiteral
2433 *
2434 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2435 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2436 *
2437 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2438 *
2439 * Returns the function returns SystemLiteral and in the second
2440 * case publicID receives PubidLiteral, is strict is off
2441 * it is possible to return NULL and have publicID set.
2442 */
2443
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002444xmlChar *
2445htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2446 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002447
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002448 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2449 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2450 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002451 SKIP(6);
2452 if (!IS_BLANK(CUR)) {
2453 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2454 ctxt->sax->error(ctxt->userData,
2455 "Space required after 'SYSTEM'\n");
2456 ctxt->wellFormed = 0;
2457 }
2458 SKIP_BLANKS;
2459 URI = htmlParseSystemLiteral(ctxt);
2460 if (URI == NULL) {
2461 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2462 ctxt->sax->error(ctxt->userData,
2463 "htmlParseExternalID: SYSTEM, no URI\n");
2464 ctxt->wellFormed = 0;
2465 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002466 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2467 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2468 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002469 SKIP(6);
2470 if (!IS_BLANK(CUR)) {
2471 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2472 ctxt->sax->error(ctxt->userData,
2473 "Space required after 'PUBLIC'\n");
2474 ctxt->wellFormed = 0;
2475 }
2476 SKIP_BLANKS;
2477 *publicID = htmlParsePubidLiteral(ctxt);
2478 if (*publicID == NULL) {
2479 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2480 ctxt->sax->error(ctxt->userData,
2481 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2482 ctxt->wellFormed = 0;
2483 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002484 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002485 if ((CUR == '"') || (CUR == '\'')) {
2486 URI = htmlParseSystemLiteral(ctxt);
2487 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002488 }
2489 return(URI);
2490}
2491
2492/**
2493 * htmlParseComment:
2494 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002495 *
2496 * Parse an XML (SGML) comment <!-- .... -->
2497 *
2498 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2499 */
2500void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002501htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002502 xmlChar *buf = NULL;
Daniel Veillard87b95392000-08-12 21:12:04 +00002503 int len;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002504 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard87b95392000-08-12 21:12:04 +00002505 int q, ql;
2506 int r, rl;
2507 int cur, l;
2508 xmlParserInputState state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002509
2510 /*
2511 * Check that there is a comment right here.
2512 */
Daniel Veillard87b95392000-08-12 21:12:04 +00002513 if ((RAW != '<') || (NXT(1) != '!') ||
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002514 (NXT(2) != '-') || (NXT(3) != '-')) return;
2515
Daniel Veillard87b95392000-08-12 21:12:04 +00002516 state = ctxt->instate;
2517 ctxt->instate = XML_PARSER_COMMENT;
2518 SHRINK;
2519 SKIP(4);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002520 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2521 if (buf == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002522 xmlGenericError(xmlGenericErrorContext,
2523 "malloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002524 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002525 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002526 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002527 q = CUR_CHAR(ql);
2528 NEXTL(ql);
2529 r = CUR_CHAR(rl);
2530 NEXTL(rl);
2531 cur = CUR_CHAR(l);
2532 len = 0;
2533 while (IS_CHAR(cur) &&
2534 ((cur != '>') ||
2535 (r != '-') || (q != '-'))) {
2536 if (len + 5 >= size) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002537 size *= 2;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002538 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002539 if (buf == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002540 xmlGenericError(xmlGenericErrorContext,
2541 "realloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002542 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002543 return;
2544 }
2545 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002546 COPY_BUF(ql,buf,len,q);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002547 q = r;
Daniel Veillard87b95392000-08-12 21:12:04 +00002548 ql = rl;
2549 r = cur;
2550 rl = l;
2551 NEXTL(l);
2552 cur = CUR_CHAR(l);
2553 if (cur == 0) {
2554 SHRINK;
2555 GROW;
2556 cur = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002557 }
2558 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002559 buf[len] = 0;
2560 if (!IS_CHAR(cur)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00002561 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
Daniel Veillard87b95392000-08-12 21:12:04 +00002562 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2563 ctxt->sax->error(ctxt->userData,
2564 "Comment not terminated \n<!--%.50s\n", buf);
Daniel Veillard87b95392000-08-12 21:12:04 +00002565 ctxt->wellFormed = 0;
2566 xmlFree(buf);
2567 } else {
2568 NEXT;
2569 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2570 (!ctxt->disableSAX))
2571 ctxt->sax->comment(ctxt->userData, buf);
2572 xmlFree(buf);
2573 }
2574 ctxt->instate = state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002575}
2576
2577/**
2578 * htmlParseCharRef:
2579 * @ctxt: an HTML parser context
2580 *
2581 * parse Reference declarations
2582 *
2583 * [66] CharRef ::= '&#' [0-9]+ ';' |
2584 * '&#x' [0-9a-fA-F]+ ';'
2585 *
2586 * Returns the value parsed (as an int)
2587 */
2588int
2589htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2590 int val = 0;
2591
2592 if ((CUR == '&') && (NXT(1) == '#') &&
2593 (NXT(2) == 'x')) {
2594 SKIP(3);
2595 while (CUR != ';') {
2596 if ((CUR >= '0') && (CUR <= '9'))
2597 val = val * 16 + (CUR - '0');
2598 else if ((CUR >= 'a') && (CUR <= 'f'))
2599 val = val * 16 + (CUR - 'a') + 10;
2600 else if ((CUR >= 'A') && (CUR <= 'F'))
2601 val = val * 16 + (CUR - 'A') + 10;
2602 else {
2603 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2604 ctxt->sax->error(ctxt->userData,
2605 "htmlParseCharRef: invalid hexadecimal value\n");
2606 ctxt->wellFormed = 0;
Daniel Veillard748e45d2000-11-17 16:36:08 +00002607 return(0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002608 }
2609 NEXT;
2610 }
2611 if (CUR == ';')
2612 NEXT;
2613 } else if ((CUR == '&') && (NXT(1) == '#')) {
2614 SKIP(2);
2615 while (CUR != ';') {
2616 if ((CUR >= '0') && (CUR <= '9'))
2617 val = val * 10 + (CUR - '0');
2618 else {
2619 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2620 ctxt->sax->error(ctxt->userData,
2621 "htmlParseCharRef: invalid decimal value\n");
2622 ctxt->wellFormed = 0;
Daniel Veillard748e45d2000-11-17 16:36:08 +00002623 return(0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002624 }
2625 NEXT;
2626 }
2627 if (CUR == ';')
2628 NEXT;
2629 } else {
2630 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2631 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2632 ctxt->wellFormed = 0;
2633 }
2634 /*
2635 * Check the value IS_CHAR ...
2636 */
2637 if (IS_CHAR(val)) {
2638 return(val);
2639 } else {
2640 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002641 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002642 val);
2643 ctxt->wellFormed = 0;
2644 }
2645 return(0);
2646}
2647
2648
2649/**
2650 * htmlParseDocTypeDecl :
2651 * @ctxt: an HTML parser context
2652 *
2653 * parse a DOCTYPE declaration
2654 *
2655 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2656 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2657 */
2658
2659void
2660htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002661 xmlChar *name;
2662 xmlChar *ExternalID = NULL;
2663 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002664
2665 /*
2666 * We know that '<!DOCTYPE' has been detected.
2667 */
2668 SKIP(9);
2669
2670 SKIP_BLANKS;
2671
2672 /*
2673 * Parse the DOCTYPE name.
2674 */
2675 name = htmlParseName(ctxt);
2676 if (name == NULL) {
2677 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2678 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2679 ctxt->wellFormed = 0;
2680 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002681 /*
2682 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2683 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002684
2685 SKIP_BLANKS;
2686
2687 /*
2688 * Check for SystemID and ExternalID
2689 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002690 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002691 SKIP_BLANKS;
2692
2693 /*
2694 * We should be at the end of the DOCTYPE declaration.
2695 */
2696 if (CUR != '>') {
2697 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2698 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2699 ctxt->wellFormed = 0;
2700 /* We shouldn't try to resynchronize ... */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002701 }
2702 NEXT;
2703
2704 /*
Daniel Veillardd83eb822000-06-30 18:39:56 +00002705 * Create or update the document accordingly to the DOCTYPE
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002706 */
Daniel Veillardd83eb822000-06-30 18:39:56 +00002707 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2708 (!ctxt->disableSAX))
2709 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002710
2711 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002712 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002713 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002714 if (URI != NULL) xmlFree(URI);
2715 if (ExternalID != NULL) xmlFree(ExternalID);
2716 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002717}
2718
2719/**
2720 * htmlParseAttribute:
2721 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002722 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002723 *
2724 * parse an attribute
2725 *
2726 * [41] Attribute ::= Name Eq AttValue
2727 *
2728 * [25] Eq ::= S? '=' S?
2729 *
2730 * With namespace:
2731 *
2732 * [NS 11] Attribute ::= QName Eq AttValue
2733 *
2734 * Also the case QName == xmlns:??? is handled independently as a namespace
2735 * definition.
2736 *
2737 * Returns the attribute name, and the value in *value.
2738 */
2739
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002740xmlChar *
2741htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002742 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002743
2744 *value = NULL;
Daniel Veillard970112a2000-10-03 09:33:21 +00002745 name = htmlParseHTMLName(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002746 if (name == NULL) {
2747 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2748 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2749 ctxt->wellFormed = 0;
2750 return(NULL);
2751 }
2752
2753 /*
2754 * read the value
2755 */
2756 SKIP_BLANKS;
2757 if (CUR == '=') {
2758 NEXT;
2759 SKIP_BLANKS;
2760 val = htmlParseAttValue(ctxt);
Daniel Veillardbe803962000-06-28 23:40:59 +00002761 /******
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002762 } else {
Daniel Veillardbe803962000-06-28 23:40:59 +00002763 * TODO : some attribute must have values, some may not
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002764 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002765 ctxt->sax->warning(ctxt->userData,
Daniel Veillardbe803962000-06-28 23:40:59 +00002766 "No value for attribute %s\n", name); */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002767 }
2768
2769 *value = val;
2770 return(name);
2771}
2772
2773/**
Daniel Veillard365e13b2000-07-02 07:56:37 +00002774 * htmlCheckEncoding:
2775 * @ctxt: an HTML parser context
2776 * @attvalue: the attribute value
2777 *
2778 * Checks an http-equiv attribute from a Meta tag to detect
2779 * the encoding
2780 * If a new encoding is detected the parser is switched to decode
2781 * it and pass UTF8
2782 */
2783void
2784htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2785 const xmlChar *encoding;
2786
2787 if ((ctxt == NULL) || (attvalue == NULL))
2788 return;
2789
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002790 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
Daniel Veillard365e13b2000-07-02 07:56:37 +00002791 if (encoding != NULL) {
2792 encoding += 8;
2793 } else {
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002794 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
Daniel Veillard365e13b2000-07-02 07:56:37 +00002795 if (encoding != NULL)
2796 encoding += 9;
2797 }
2798 if (encoding != NULL) {
2799 xmlCharEncoding enc;
2800 xmlCharEncodingHandlerPtr handler;
2801
2802 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2803
2804 if (ctxt->input->encoding != NULL)
2805 xmlFree((xmlChar *) ctxt->input->encoding);
2806 ctxt->input->encoding = xmlStrdup(encoding);
2807
2808 enc = xmlParseCharEncoding((const char *) encoding);
2809 /*
2810 * registered set of known encodings
2811 */
2812 if (enc != XML_CHAR_ENCODING_ERROR) {
2813 xmlSwitchEncoding(ctxt, enc);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002814 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002815 } else {
2816 /*
2817 * fallback for unknown encodings
2818 */
2819 handler = xmlFindCharEncodingHandler((const char *) encoding);
2820 if (handler != NULL) {
2821 xmlSwitchToEncoding(ctxt, handler);
Daniel Veillard87b95392000-08-12 21:12:04 +00002822 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002823 } else {
2824 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2825 }
2826 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002827
2828 if ((ctxt->input->buf != NULL) &&
2829 (ctxt->input->buf->encoder != NULL) &&
2830 (ctxt->input->buf->raw != NULL) &&
2831 (ctxt->input->buf->buffer != NULL)) {
2832 int nbchars;
2833 int processed;
2834
2835 /*
2836 * convert as much as possible to the parser reading buffer.
2837 */
2838 processed = ctxt->input->cur - ctxt->input->base;
2839 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2840 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2841 ctxt->input->buf->buffer,
2842 ctxt->input->buf->raw);
2843 if (nbchars < 0) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00002844 ctxt->errNo = XML_ERR_INVALID_ENCODING;
Daniel Veillard87b95392000-08-12 21:12:04 +00002845 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2846 ctxt->sax->error(ctxt->userData,
2847 "htmlCheckEncoding: encoder error\n");
Daniel Veillard87b95392000-08-12 21:12:04 +00002848 }
2849 ctxt->input->base =
2850 ctxt->input->cur = ctxt->input->buf->buffer->content;
2851 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002852 }
2853}
2854
2855/**
2856 * htmlCheckMeta:
2857 * @ctxt: an HTML parser context
2858 * @atts: the attributes values
2859 *
2860 * Checks an attributes from a Meta tag
2861 */
2862void
2863htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2864 int i;
2865 const xmlChar *att, *value;
2866 int http = 0;
2867 const xmlChar *content = NULL;
2868
2869 if ((ctxt == NULL) || (atts == NULL))
2870 return;
2871
2872 i = 0;
2873 att = atts[i++];
2874 while (att != NULL) {
2875 value = atts[i++];
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002876 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2877 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002878 http = 1;
Daniel Veillardb656ebe2000-09-22 13:51:48 +00002879 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002880 content = value;
2881 att = atts[i++];
2882 }
2883 if ((http) && (content != NULL))
2884 htmlCheckEncoding(ctxt, content);
2885
2886}
2887
2888/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002889 * htmlParseStartTag:
2890 * @ctxt: an HTML parser context
2891 *
2892 * parse a start of tag either for rule element or
2893 * EmptyElement. In both case we don't parse the tag closing chars.
2894 *
2895 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2896 *
2897 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2898 *
2899 * With namespace:
2900 *
2901 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2902 *
2903 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2904 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002905 */
2906
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002907void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002908htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002909 xmlChar *name;
2910 xmlChar *attname;
2911 xmlChar *attvalue;
2912 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002913 int nbatts = 0;
2914 int maxatts = 0;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002915 int meta = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002916 int i;
2917
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002918 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002919 NEXT;
2920
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002921 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002922 name = htmlParseHTMLName(ctxt);
2923 if (name == NULL) {
2924 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2925 ctxt->sax->error(ctxt->userData,
2926 "htmlParseStartTag: invalid element name\n");
2927 ctxt->wellFormed = 0;
Daniel Veillard126f2792000-10-24 17:10:12 +00002928 /* Dump the bogus tag like browsers do */
2929 while ((IS_CHAR(CUR)) && (CUR != '>'))
2930 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002931 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002932 }
Daniel Veillard8b5dd832000-10-01 20:28:44 +00002933 if (xmlStrEqual(name, BAD_CAST"meta"))
Daniel Veillard365e13b2000-07-02 07:56:37 +00002934 meta = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002935
2936 /*
2937 * Check for auto-closure of HTML elements.
2938 */
2939 htmlAutoClose(ctxt, name);
2940
2941 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00002942 * Check for implied HTML elements.
2943 */
2944 htmlCheckImplied(ctxt, name);
2945
2946 /*
Daniel Veillardf62ceff2000-11-24 23:36:01 +00002947 * Avoid html at any level > 0, head at any level != 1
2948 * or any attempt to recurse body
2949 */
2950 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2951 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2952 ctxt->sax->error(ctxt->userData,
2953 "htmlParseStartTag: misplaced <html> tag\n");
2954 ctxt->wellFormed = 0;
2955 xmlFree(name);
2956 return;
2957 }
2958 if ((ctxt->nameNr != 1) &&
2959 (xmlStrEqual(name, BAD_CAST"head"))) {
2960 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2961 ctxt->sax->error(ctxt->userData,
2962 "htmlParseStartTag: misplaced <head> tag\n");
2963 ctxt->wellFormed = 0;
2964 xmlFree(name);
2965 return;
2966 }
2967 if (xmlStrEqual(name, BAD_CAST"body")) {
2968 int i;
2969 for (i = 0;i < ctxt->nameNr;i++) {
2970 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
2971 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2972 ctxt->sax->error(ctxt->userData,
2973 "htmlParseStartTag: misplaced <body> tag\n");
2974 ctxt->wellFormed = 0;
2975 xmlFree(name);
2976 return;
2977 }
2978 }
2979 }
2980
2981 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002982 * Now parse the attributes, it ends up with the ending
2983 *
2984 * (S Attribute)* S?
2985 */
2986 SKIP_BLANKS;
2987 while ((IS_CHAR(CUR)) &&
2988 (CUR != '>') &&
2989 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002990 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002991
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002992 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002993 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002994 if (attname != NULL) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00002995
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002996 /*
2997 * Well formedness requires at most one declaration of an attribute
2998 */
2999 for (i = 0; i < nbatts;i += 2) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003000 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003001 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003002 ctxt->sax->error(ctxt->userData,
3003 "Attribute %s redefined\n",
3004 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003005 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00003006 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003007 if (attvalue != NULL)
3008 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003009 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003010 }
3011 }
3012
3013 /*
3014 * Add the pair to atts
3015 */
3016 if (atts == NULL) {
3017 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003018 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003019 if (atts == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003020 xmlGenericError(xmlGenericErrorContext,
3021 "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003022 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003023 if (name != NULL) xmlFree(name);
3024 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003025 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00003026 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003027 maxatts *= 2;
Daniel Veillard4b0755c2000-09-25 14:26:28 +00003028 atts = (const xmlChar **) xmlRealloc((void *) atts,
3029 maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003030 if (atts == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003031 xmlGenericError(xmlGenericErrorContext,
3032 "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003033 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003034 if (name != NULL) xmlFree(name);
3035 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003036 }
3037 }
3038 atts[nbatts++] = attname;
3039 atts[nbatts++] = attvalue;
3040 atts[nbatts] = NULL;
3041 atts[nbatts + 1] = NULL;
3042 }
Daniel Veillard126f2792000-10-24 17:10:12 +00003043 else {
3044 /* Dump the bogus attribute string up to the next blank or
3045 * the end of the tag. */
3046 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3047 && ((CUR != '/') || (NXT(1) != '>')))
3048 NEXT;
3049 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003050
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003051failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003052 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003053 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003054 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3055 ctxt->sax->error(ctxt->userData,
3056 "htmlParseStartTag: problem parsing attributes\n");
3057 ctxt->wellFormed = 0;
3058 break;
3059 }
3060 }
3061
3062 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003063 * Handle specific association to the META tag
3064 */
3065 if (meta)
3066 htmlCheckMeta(ctxt, atts);
3067
3068 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003069 * SAX: Start of Element !
3070 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003071 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003072#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003073 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003074#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003075 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3076 ctxt->sax->startElement(ctxt->userData, name, atts);
3077
3078 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003079 for (i = 0;i < nbatts;i++) {
3080 if (atts[i] != NULL)
3081 xmlFree((xmlChar *) atts[i]);
3082 }
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00003083 xmlFree((void *) atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003084 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003085 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003086}
3087
3088/**
3089 * htmlParseEndTag:
3090 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003091 *
3092 * parse an end of tag
3093 *
3094 * [42] ETag ::= '</' Name S? '>'
3095 *
3096 * With namespace
3097 *
3098 * [NS 9] ETag ::= '</' QName S? '>'
3099 */
3100
3101void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003102htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003103 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003104 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003105 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003106
3107 if ((CUR != '<') || (NXT(1) != '/')) {
3108 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3109 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3110 ctxt->wellFormed = 0;
3111 return;
3112 }
3113 SKIP(2);
3114
3115 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003116 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003117
3118 /*
3119 * We should definitely be at the ending "S? '>'" part
3120 */
3121 SKIP_BLANKS;
3122 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3123 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3124 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3125 ctxt->wellFormed = 0;
3126 } else
3127 NEXT;
3128
3129 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003130 * If the name read is not one of the element in the parsing stack
3131 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003132 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003133 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003134 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003135 }
3136 if (i < 0) {
3137 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003138 ctxt->sax->error(ctxt->userData,
3139 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003140 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003141 ctxt->wellFormed = 0;
3142 return;
3143 }
3144
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003145
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003146 /*
3147 * Check for auto-closure of HTML elements.
3148 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003149
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003150 htmlAutoCloseOnClose(ctxt, name);
3151
3152 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003153 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003154 * With the exception that the autoclose may have popped stuff out
3155 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003156 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003157 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003158#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003159 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003160#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003161 if ((ctxt->name != NULL) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003162 (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003163 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3164 ctxt->sax->error(ctxt->userData,
3165 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003166 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003167 ctxt->wellFormed = 0;
3168 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003169 }
3170
3171 /*
3172 * SAX: End of Tag
3173 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003174 oldname = ctxt->name;
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003175 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003176 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3177 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003178 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003179 if (oldname != NULL) {
3180#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003181 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003182#endif
3183 xmlFree(oldname);
3184#ifdef DEBUG
3185 } else {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003186 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003187#endif
3188 }
3189 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003190
3191 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00003192 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003193
3194 return;
3195}
3196
3197
3198/**
3199 * htmlParseReference:
3200 * @ctxt: an HTML parser context
3201 *
3202 * parse and handle entity references in content,
3203 * this will end-up in a call to character() since this is either a
3204 * CharRef, or a predefined entity.
3205 */
3206void
3207htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003208 htmlEntityDescPtr ent;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003209 xmlChar out[6];
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003210 xmlChar *name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003211 if (CUR != '&') return;
3212
3213 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003214 unsigned int c;
3215 int bits, i = 0;
3216
3217 c = htmlParseCharRef(ctxt);
Daniel Veillard748e45d2000-11-17 16:36:08 +00003218 if (c == 0)
3219 return;
3220
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003221 if (c < 0x80) { out[i++]= c; bits= -6; }
3222 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3223 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3224 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3225
3226 for ( ; bits >= 0; bits-= 6) {
3227 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3228 }
3229 out[i] = 0;
3230
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003231 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003232 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003233 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003234 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003235 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003236 if (name == NULL) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003237 htmlCheckParagraph(ctxt);
Daniel Veillard1255ab72000-08-14 15:13:33 +00003238 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3239 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard71b656e2000-01-05 14:46:17 +00003240 return;
3241 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003242 if ((ent == NULL) || (ent->value <= 0)) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003243 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003244 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00003245 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003246 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00003247 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003248 }
3249 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003250 unsigned int c;
3251 int bits, i = 0;
3252
3253 c = ent->value;
3254 if (c < 0x80)
3255 { out[i++]= c; bits= -6; }
3256 else if (c < 0x800)
3257 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3258 else if (c < 0x10000)
3259 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3260 else
3261 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3262
3263 for ( ; bits >= 0; bits-= 6) {
3264 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3265 }
3266 out[i] = 0;
3267
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003268 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003269 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003270 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003271 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00003272 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003273 }
3274}
3275
3276/**
3277 * htmlParseContent:
3278 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003279 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003280 *
3281 * Parse a content: comment, sub-element, reference or text.
3282 *
3283 */
3284
3285void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003286htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003287 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003288 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003289
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003290 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003291 depth = ctxt->nameNr;
3292 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003293 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003294
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003295 GROW;
3296 /*
3297 * Our tag or one of it's parent or children is ending.
3298 */
3299 if ((CUR == '<') && (NXT(1) == '/')) {
3300 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003301 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003302 return;
3303 }
3304
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003305 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003306 * Has this node been popped out during parsing of
3307 * the next element
3308 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003309 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003310 (depth >= ctxt->nameNr)) {
3311 if (currentNode != NULL) xmlFree(currentNode);
3312 return;
3313 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003314
Daniel Veillard7eda8452000-10-14 23:38:43 +00003315 if ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3316 (xmlStrEqual(currentNode, BAD_CAST"style"))) {
3317 /*
3318 * Handle SCRIPT/STYLE separately
3319 */
3320 htmlParseScript(ctxt);
3321 } else {
3322 /*
3323 * Sometimes DOCTYPE arrives in the middle of the document
3324 */
3325 if ((CUR == '<') && (NXT(1) == '!') &&
3326 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3327 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3328 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3329 (UPP(8) == 'E')) {
Daniel Veillard35008381999-10-25 13:15:52 +00003330 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3331 ctxt->sax->error(ctxt->userData,
Daniel Veillard7eda8452000-10-14 23:38:43 +00003332 "Misplaced DOCTYPE declaration\n");
Daniel Veillard35008381999-10-25 13:15:52 +00003333 ctxt->wellFormed = 0;
Daniel Veillard7eda8452000-10-14 23:38:43 +00003334 htmlParseDocTypeDecl(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003335 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003336
Daniel Veillard7eda8452000-10-14 23:38:43 +00003337 /*
3338 * First case : a comment
3339 */
3340 if ((CUR == '<') && (NXT(1) == '!') &&
3341 (NXT(2) == '-') && (NXT(3) == '-')) {
3342 htmlParseComment(ctxt);
3343 }
3344
3345 /*
3346 * Second case : a sub-element.
3347 */
3348 else if (CUR == '<') {
3349 htmlParseElement(ctxt);
3350 }
3351
3352 /*
3353 * Third case : a reference. If if has not been resolved,
3354 * parsing returns it's Name, create the node
3355 */
3356 else if (CUR == '&') {
3357 htmlParseReference(ctxt);
3358 }
3359
3360 /*
3361 * Fourth : end of the resource
3362 */
3363 else if (CUR == 0) {
3364 htmlAutoClose(ctxt, NULL);
3365 }
3366
3367 /*
3368 * Last case, text. Note that References are handled directly.
3369 */
3370 else {
3371 htmlParseCharData(ctxt, 0);
3372 }
3373
3374 if (cons == ctxt->nbChars) {
3375 if (ctxt->node != NULL) {
3376 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3377 ctxt->sax->error(ctxt->userData,
3378 "detected an error in element content\n");
3379 ctxt->wellFormed = 0;
3380 }
3381 break;
3382 }
3383 }
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003384 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003385 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003386 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003387}
3388
3389/**
3390 * htmlParseElement:
3391 * @ctxt: an HTML parser context
3392 *
3393 * parse an HTML element, this is highly recursive
3394 *
3395 * [39] element ::= EmptyElemTag | STag content ETag
3396 *
3397 * [41] Attribute ::= Name Eq AttValue
3398 */
3399
3400void
3401htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003402 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00003403 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003404 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003405 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003406 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003407 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003408
3409 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003410 if (ctxt->record_info) {
3411 node_info.begin_pos = ctxt->input->consumed +
3412 (CUR_PTR - ctxt->input->base);
3413 node_info.begin_line = ctxt->input->line;
3414 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003415
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003416 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003417 htmlParseStartTag(ctxt);
3418 name = ctxt->name;
3419#ifdef DEBUG
3420 if (oldname == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003421 xmlGenericError(xmlGenericErrorContext,
3422 "Start of element %s\n", name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003423 else if (name == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003424 xmlGenericError(xmlGenericErrorContext,
3425 "Start of element failed, was %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003426 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003427 xmlGenericError(xmlGenericErrorContext,
3428 "Start of element %s, was %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003429#endif
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003430 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003431 (name == NULL)) {
3432 if (CUR == '>')
3433 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003434 if (oldname != NULL)
3435 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003436 return;
3437 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003438 if (oldname != NULL)
3439 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003440
3441 /*
3442 * Lookup the info for that element.
3443 */
3444 info = htmlTagLookup(name);
3445 if (info == NULL) {
3446 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3447 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3448 name);
3449 ctxt->wellFormed = 0;
3450 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003451/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003452 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3453 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3454 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003455 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003456 }
3457
3458 /*
3459 * Check for an Empty Element labelled the XML/SGML way
3460 */
3461 if ((CUR == '/') && (NXT(1) == '>')) {
3462 SKIP(2);
3463 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3464 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003465 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003466#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003467 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003468#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003469 if (oldname != NULL)
3470 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003471 return;
3472 }
3473
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003474 if (CUR == '>') {
3475 NEXT;
3476 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003477 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard87b95392000-08-12 21:12:04 +00003478 ctxt->sax->error(ctxt->userData,
3479 "Couldn't find end of Start Tag %s\n",
3480 name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003481 ctxt->wellFormed = 0;
3482
3483 /*
3484 * end of parsing of this node.
3485 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00003486 if (xmlStrEqual(name, ctxt->name)) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003487 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003488 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003489#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003490 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003491#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003492 if (oldname != NULL)
3493 xmlFree(oldname);
3494 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003495
3496 /*
3497 * Capture end position and add node
3498 */
3499 if ( currentNode != NULL && ctxt->record_info ) {
3500 node_info.end_pos = ctxt->input->consumed +
3501 (CUR_PTR - ctxt->input->base);
3502 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003503 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003504 xmlParserAddNodeInfo(ctxt, &node_info);
3505 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003506 return;
3507 }
3508
3509 /*
3510 * Check for an Empty Element from DTD definition
3511 */
3512 if ((info != NULL) && (info->empty)) {
3513 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3514 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003515 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003516#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003517 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003518#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003519 if (oldname != NULL)
3520 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003521 return;
3522 }
3523
3524 /*
3525 * Parse the content of the element:
3526 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003527 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003528 depth = ctxt->nameNr;
3529 while (IS_CHAR(CUR)) {
3530 htmlParseContent(ctxt);
3531 if (ctxt->nameNr < depth) break;
3532 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003533
3534 if (!IS_CHAR(CUR)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003535 /************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003536 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3537 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003538 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003539 ctxt->wellFormed = 0;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003540 *************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003541
3542 /*
3543 * end of parsing of this node.
3544 */
3545 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003546 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003547#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003548 xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003549#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003550 if (oldname != NULL)
3551 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003552 if (currentNode != NULL)
3553 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003554 return;
3555 }
3556
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003557 /*
3558 * Capture end position and add node
3559 */
3560 if ( currentNode != NULL && ctxt->record_info ) {
3561 node_info.end_pos = ctxt->input->consumed +
3562 (CUR_PTR - ctxt->input->base);
3563 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003564 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003565 xmlParserAddNodeInfo(ctxt, &node_info);
3566 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003567 if (currentNode != NULL)
3568 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003569}
3570
3571/**
3572 * htmlParseDocument :
3573 * @ctxt: an HTML parser context
3574 *
3575 * parse an HTML document (and build a tree if using the standard SAX
3576 * interface).
3577 *
3578 * Returns 0, -1 in case of error. the parser context is augmented
3579 * as a result of the parsing.
3580 */
3581
3582int
3583htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003584 xmlDtdPtr dtd;
3585
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003586 htmlDefaultSAXHandlerInit();
3587 ctxt->html = 1;
3588
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003589 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003590 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00003591 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003592 */
3593 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3594 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3595
3596 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003597 * Wipe out everything which is before the first '<'
3598 */
Daniel Veillard35008381999-10-25 13:15:52 +00003599 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003600 if (CUR == 0) {
3601 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3602 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3603 ctxt->wellFormed = 0;
3604 }
3605
Daniel Veillardbe803962000-06-28 23:40:59 +00003606 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3607 ctxt->sax->startDocument(ctxt->userData);
3608
3609
Daniel Veillard35008381999-10-25 13:15:52 +00003610 /*
3611 * Parse possible comments before any content
3612 */
3613 while ((CUR == '<') && (NXT(1) == '!') &&
3614 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003615 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003616 SKIP_BLANKS;
3617 }
3618
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003619
3620 /*
3621 * Then possibly doc type declaration(s) and more Misc
3622 * (doctypedecl Misc*)?
3623 */
3624 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003625 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3626 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3627 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3628 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003629 htmlParseDocTypeDecl(ctxt);
3630 }
3631 SKIP_BLANKS;
3632
3633 /*
Daniel Veillard87b95392000-08-12 21:12:04 +00003634 * Parse possible comments before any content
3635 */
3636 while ((CUR == '<') && (NXT(1) == '!') &&
3637 (NXT(2) == '-') && (NXT(3) == '-')) {
3638 htmlParseComment(ctxt);
3639 SKIP_BLANKS;
3640 }
3641
3642 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003643 * Time to start parsing the tree itself
3644 */
Daniel Veillard35008381999-10-25 13:15:52 +00003645 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003646
3647 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003648 * autoclose
3649 */
3650 if (CUR == 0)
3651 htmlAutoClose(ctxt, NULL);
3652
3653
3654 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003655 * SAX: end of the document processing.
3656 */
3657 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3658 ctxt->sax->endDocument(ctxt->userData);
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003659
3660 if (ctxt->myDoc != NULL) {
3661 dtd = xmlGetIntSubset(ctxt->myDoc);
3662 if (dtd == NULL)
3663 ctxt->myDoc->intSubset =
3664 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3665 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3666 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3667 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003668 if (! ctxt->wellFormed) return(-1);
3669 return(0);
3670}
3671
3672
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003673/************************************************************************
3674 * *
3675 * Parser contexts handling *
3676 * *
3677 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003678
3679/**
3680 * xmlInitParserCtxt:
3681 * @ctxt: an HTML parser context
3682 *
3683 * Initialize a parser context
3684 */
3685
3686void
3687htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3688{
3689 htmlSAXHandler *sax;
3690
Daniel Veillard35008381999-10-25 13:15:52 +00003691 if (ctxt == NULL) return;
3692 memset(ctxt, 0, sizeof(htmlParserCtxt));
3693
Daniel Veillard6454aec1999-09-02 22:04:43 +00003694 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003695 if (sax == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003696 xmlGenericError(xmlGenericErrorContext,
3697 "htmlInitParserCtxt: out of memory\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003698 }
Daniel Veillard4fb87ee2000-09-19 12:25:59 +00003699 else
3700 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003701
3702 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003703 ctxt->inputTab = (htmlParserInputPtr *)
3704 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3705 if (ctxt->inputTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003706 xmlGenericError(xmlGenericErrorContext,
3707 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003708 ctxt->inputNr = 0;
3709 ctxt->inputMax = 0;
3710 ctxt->input = NULL;
3711 return;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003712 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003713 ctxt->inputNr = 0;
3714 ctxt->inputMax = 5;
3715 ctxt->input = NULL;
3716 ctxt->version = NULL;
3717 ctxt->encoding = NULL;
3718 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003719 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003720
3721 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00003722 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillard39c7d712000-09-10 16:14:55 +00003723 if (ctxt->nodeTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003724 xmlGenericError(xmlGenericErrorContext,
3725 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003726 ctxt->nodeNr = 0;
3727 ctxt->nodeMax = 0;
3728 ctxt->node = NULL;
3729 ctxt->inputNr = 0;
3730 ctxt->inputMax = 0;
3731 ctxt->input = NULL;
3732 return;
3733 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003734 ctxt->nodeNr = 0;
3735 ctxt->nodeMax = 10;
3736 ctxt->node = NULL;
3737
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003738 /* Allocate the Name stack */
3739 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Daniel Veillard39c7d712000-09-10 16:14:55 +00003740 if (ctxt->nameTab == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003741 xmlGenericError(xmlGenericErrorContext,
3742 "htmlInitParserCtxt: out of memory\n");
Daniel Veillard39c7d712000-09-10 16:14:55 +00003743 ctxt->nameNr = 0;
3744 ctxt->nameMax = 10;
3745 ctxt->name = NULL;
3746 ctxt->nodeNr = 0;
3747 ctxt->nodeMax = 0;
3748 ctxt->node = NULL;
3749 ctxt->inputNr = 0;
3750 ctxt->inputMax = 0;
3751 ctxt->input = NULL;
3752 return;
3753 }
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003754 ctxt->nameNr = 0;
3755 ctxt->nameMax = 10;
3756 ctxt->name = NULL;
3757
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003758 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3759 else {
3760 ctxt->sax = sax;
3761 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3762 }
3763 ctxt->userData = ctxt;
3764 ctxt->myDoc = NULL;
3765 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003766 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003767 ctxt->html = 1;
3768 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00003769 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003770 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003771 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003772 xmlInitNodeInfoSeq(&ctxt->node_seq);
3773}
3774
3775/**
3776 * htmlFreeParserCtxt:
3777 * @ctxt: an HTML parser context
3778 *
3779 * Free all the memory used by a parser context. However the parsed
3780 * document in ctxt->myDoc is not freed.
3781 */
3782
3783void
3784htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3785{
Daniel Veillard365e13b2000-07-02 07:56:37 +00003786 xmlFreeParserCtxt(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003787}
3788
3789/**
3790 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003791 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003792 * @encoding: a free form C string describing the HTML document encoding, or NULL
3793 *
3794 * Create a parser context for an HTML document.
3795 *
3796 * Returns the new parser context or NULL
3797 */
3798htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003799htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003800 htmlParserCtxtPtr ctxt;
3801 htmlParserInputPtr input;
3802 /* htmlCharEncoding enc; */
3803
Daniel Veillard6454aec1999-09-02 22:04:43 +00003804 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003805 if (ctxt == NULL) {
3806 perror("malloc");
3807 return(NULL);
3808 }
3809 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003810 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003811 if (input == NULL) {
3812 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003813 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003814 return(NULL);
3815 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003816 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003817
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003818 input->line = 1;
3819 input->col = 1;
3820 input->base = cur;
3821 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003822
3823 inputPush(ctxt, input);
3824 return(ctxt);
3825}
3826
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003827/************************************************************************
3828 * *
3829 * Progressive parsing interfaces *
3830 * *
3831 ************************************************************************/
3832
3833/**
3834 * htmlParseLookupSequence:
3835 * @ctxt: an HTML parser context
3836 * @first: the first char to lookup
3837 * @next: the next char to lookup or zero
3838 * @third: the next char to lookup or zero
3839 *
3840 * Try to find if a sequence (first, next, third) or just (first next) or
3841 * (first) is available in the input stream.
3842 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3843 * to avoid rescanning sequences of bytes, it DOES change the state of the
3844 * parser, do not use liberally.
3845 * This is basically similar to xmlParseLookupSequence()
3846 *
3847 * Returns the index to the current parsing point if the full sequence
3848 * is available, -1 otherwise.
3849 */
3850int
3851htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3852 xmlChar next, xmlChar third) {
3853 int base, len;
3854 htmlParserInputPtr in;
3855 const xmlChar *buf;
3856
3857 in = ctxt->input;
3858 if (in == NULL) return(-1);
3859 base = in->cur - in->base;
3860 if (base < 0) return(-1);
3861 if (ctxt->checkIndex > base)
3862 base = ctxt->checkIndex;
3863 if (in->buf == NULL) {
3864 buf = in->base;
3865 len = in->length;
3866 } else {
3867 buf = in->buf->buffer->content;
3868 len = in->buf->buffer->use;
3869 }
3870 /* take into account the sequence length */
3871 if (third) len -= 2;
3872 else if (next) len --;
3873 for (;base < len;base++) {
3874 if (buf[base] == first) {
3875 if (third != 0) {
3876 if ((buf[base + 1] != next) ||
3877 (buf[base + 2] != third)) continue;
3878 } else if (next != 0) {
3879 if (buf[base + 1] != next) continue;
3880 }
3881 ctxt->checkIndex = 0;
3882#ifdef DEBUG_PUSH
3883 if (next == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003884 xmlGenericError(xmlGenericErrorContext,
3885 "HPP: lookup '%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003886 first, base);
3887 else if (third == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003888 xmlGenericError(xmlGenericErrorContext,
3889 "HPP: lookup '%c%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003890 first, next, base);
3891 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003892 xmlGenericError(xmlGenericErrorContext,
3893 "HPP: lookup '%c%c%c' found at %d\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003894 first, next, third, base);
3895#endif
3896 return(base - (in->cur - in->base));
3897 }
3898 }
3899 ctxt->checkIndex = base;
3900#ifdef DEBUG_PUSH
3901 if (next == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003902 xmlGenericError(xmlGenericErrorContext,
3903 "HPP: lookup '%c' failed\n", first);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003904 else if (third == 0)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003905 xmlGenericError(xmlGenericErrorContext,
3906 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003907 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003908 xmlGenericError(xmlGenericErrorContext,
3909 "HPP: lookup '%c%c%c' failed\n", first, next, third);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003910#endif
3911 return(-1);
3912}
3913
3914/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003915 * htmlParseTryOrFinish:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003916 * @ctxt: an HTML parser context
Daniel Veillard71b656e2000-01-05 14:46:17 +00003917 * @terminate: last chunk indicator
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003918 *
3919 * Try to progress on parsing
3920 *
3921 * Returns zero if no parsing was possible
3922 */
3923int
Daniel Veillard71b656e2000-01-05 14:46:17 +00003924htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003925 int ret = 0;
3926 htmlParserInputPtr in;
Daniel Veillard365e13b2000-07-02 07:56:37 +00003927 int avail = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003928 xmlChar cur, next;
3929
3930#ifdef DEBUG_PUSH
3931 switch (ctxt->instate) {
3932 case XML_PARSER_EOF:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003933 xmlGenericError(xmlGenericErrorContext,
3934 "HPP: try EOF\n"); break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003935 case XML_PARSER_START:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003936 xmlGenericError(xmlGenericErrorContext,
3937 "HPP: try START\n"); break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003938 case XML_PARSER_MISC:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003939 xmlGenericError(xmlGenericErrorContext,
3940 "HPP: try MISC\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003941 case XML_PARSER_COMMENT:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003942 xmlGenericError(xmlGenericErrorContext,
3943 "HPP: try COMMENT\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003944 case XML_PARSER_PROLOG:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003945 xmlGenericError(xmlGenericErrorContext,
3946 "HPP: try PROLOG\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003947 case XML_PARSER_START_TAG:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003948 xmlGenericError(xmlGenericErrorContext,
3949 "HPP: try START_TAG\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003950 case XML_PARSER_CONTENT:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003951 xmlGenericError(xmlGenericErrorContext,
3952 "HPP: try CONTENT\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003953 case XML_PARSER_CDATA_SECTION:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003954 xmlGenericError(xmlGenericErrorContext,
3955 "HPP: try CDATA_SECTION\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003956 case XML_PARSER_END_TAG:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003957 xmlGenericError(xmlGenericErrorContext,
3958 "HPP: try END_TAG\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003959 case XML_PARSER_ENTITY_DECL:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003960 xmlGenericError(xmlGenericErrorContext,
3961 "HPP: try ENTITY_DECL\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003962 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003963 xmlGenericError(xmlGenericErrorContext,
3964 "HPP: try ENTITY_VALUE\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003965 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003966 xmlGenericError(xmlGenericErrorContext,
3967 "HPP: try ATTRIBUTE_VALUE\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003968 case XML_PARSER_DTD:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003969 xmlGenericError(xmlGenericErrorContext,
3970 "HPP: try DTD\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003971 case XML_PARSER_EPILOG:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003972 xmlGenericError(xmlGenericErrorContext,
3973 "HPP: try EPILOG\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003974 case XML_PARSER_PI:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003975 xmlGenericError(xmlGenericErrorContext,
3976 "HPP: try PI\n");break;
Daniel Veillard7eda8452000-10-14 23:38:43 +00003977 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00003978 xmlGenericError(xmlGenericErrorContext,
3979 "HPP: try SYSTEM_LITERAL\n");break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003980 }
3981#endif
3982
3983 while (1) {
3984
3985 in = ctxt->input;
3986 if (in == NULL) break;
3987 if (in->buf == NULL)
3988 avail = in->length - (in->cur - in->base);
3989 else
3990 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard365e13b2000-07-02 07:56:37 +00003991 if ((avail == 0) && (terminate)) {
3992 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00003993 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3994 /*
3995 * SAX: end of the document processing.
3996 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00003997 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00003998 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3999 ctxt->sax->endDocument(ctxt->userData);
4000 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004001 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004002 if (avail < 1)
4003 goto done;
4004 switch (ctxt->instate) {
4005 case XML_PARSER_EOF:
4006 /*
4007 * Document parsing is done !
4008 */
4009 goto done;
4010 case XML_PARSER_START:
4011 /*
4012 * Very first chars read from the document flow.
4013 */
4014 cur = in->cur[0];
4015 if (IS_BLANK(cur)) {
4016 SKIP_BLANKS;
4017 if (in->buf == NULL)
4018 avail = in->length - (in->cur - in->base);
4019 else
4020 avail = in->buf->buffer->use - (in->cur - in->base);
4021 }
4022 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4023 ctxt->sax->setDocumentLocator(ctxt->userData,
4024 &xmlDefaultSAXLocator);
Daniel Veillardd83eb822000-06-30 18:39:56 +00004025 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4026 (!ctxt->disableSAX))
4027 ctxt->sax->startDocument(ctxt->userData);
4028
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004029 cur = in->cur[0];
4030 next = in->cur[1];
4031 if ((cur == '<') && (next == '!') &&
4032 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4033 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4034 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4035 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004036 if ((!terminate) &&
4037 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004038 goto done;
4039#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004040 xmlGenericError(xmlGenericErrorContext,
4041 "HPP: Parsing internal subset\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004042#endif
4043 htmlParseDocTypeDecl(ctxt);
4044 ctxt->instate = XML_PARSER_PROLOG;
4045#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004046 xmlGenericError(xmlGenericErrorContext,
4047 "HPP: entering PROLOG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004048#endif
4049 } else {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004050 ctxt->instate = XML_PARSER_MISC;
4051 }
4052#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004053 xmlGenericError(xmlGenericErrorContext,
4054 "HPP: entering MISC\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004055#endif
4056 break;
4057 case XML_PARSER_MISC:
4058 SKIP_BLANKS;
4059 if (in->buf == NULL)
4060 avail = in->length - (in->cur - in->base);
4061 else
4062 avail = in->buf->buffer->use - (in->cur - in->base);
4063 if (avail < 2)
4064 goto done;
4065 cur = in->cur[0];
4066 next = in->cur[1];
4067 if ((cur == '<') && (next == '!') &&
4068 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004069 if ((!terminate) &&
4070 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004071 goto done;
4072#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004073 xmlGenericError(xmlGenericErrorContext,
4074 "HPP: Parsing Comment\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004075#endif
4076 htmlParseComment(ctxt);
4077 ctxt->instate = XML_PARSER_MISC;
4078 } else if ((cur == '<') && (next == '!') &&
4079 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4080 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4081 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4082 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004083 if ((!terminate) &&
4084 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004085 goto done;
4086#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004087 xmlGenericError(xmlGenericErrorContext,
4088 "HPP: Parsing internal subset\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004089#endif
4090 htmlParseDocTypeDecl(ctxt);
4091 ctxt->instate = XML_PARSER_PROLOG;
4092#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004093 xmlGenericError(xmlGenericErrorContext,
4094 "HPP: entering PROLOG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004095#endif
4096 } else if ((cur == '<') && (next == '!') &&
4097 (avail < 9)) {
4098 goto done;
4099 } else {
4100 ctxt->instate = XML_PARSER_START_TAG;
4101#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004102 xmlGenericError(xmlGenericErrorContext,
4103 "HPP: entering START_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004104#endif
4105 }
4106 break;
4107 case XML_PARSER_PROLOG:
4108 SKIP_BLANKS;
4109 if (in->buf == NULL)
4110 avail = in->length - (in->cur - in->base);
4111 else
4112 avail = in->buf->buffer->use - (in->cur - in->base);
4113 if (avail < 2)
4114 goto done;
4115 cur = in->cur[0];
4116 next = in->cur[1];
4117 if ((cur == '<') && (next == '!') &&
4118 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004119 if ((!terminate) &&
4120 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004121 goto done;
4122#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004123 xmlGenericError(xmlGenericErrorContext,
4124 "HPP: Parsing Comment\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004125#endif
4126 htmlParseComment(ctxt);
4127 ctxt->instate = XML_PARSER_PROLOG;
4128 } else if ((cur == '<') && (next == '!') &&
4129 (avail < 4)) {
4130 goto done;
4131 } else {
4132 ctxt->instate = XML_PARSER_START_TAG;
4133#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004134 xmlGenericError(xmlGenericErrorContext,
4135 "HPP: entering START_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004136#endif
4137 }
4138 break;
4139 case XML_PARSER_EPILOG:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004140 if (in->buf == NULL)
4141 avail = in->length - (in->cur - in->base);
4142 else
4143 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard87b95392000-08-12 21:12:04 +00004144 if (avail < 1)
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004145 goto done;
4146 cur = in->cur[0];
Daniel Veillard87b95392000-08-12 21:12:04 +00004147 if (IS_BLANK(cur)) {
4148 htmlParseCharData(ctxt, 0);
4149 goto done;
4150 }
4151 if (avail < 2)
4152 goto done;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004153 next = in->cur[1];
4154 if ((cur == '<') && (next == '!') &&
4155 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004156 if ((!terminate) &&
4157 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004158 goto done;
4159#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004160 xmlGenericError(xmlGenericErrorContext,
4161 "HPP: Parsing Comment\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004162#endif
4163 htmlParseComment(ctxt);
4164 ctxt->instate = XML_PARSER_EPILOG;
4165 } else if ((cur == '<') && (next == '!') &&
4166 (avail < 4)) {
4167 goto done;
4168 } else {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00004169 ctxt->errNo = XML_ERR_DOCUMENT_END;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004170 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4171 ctxt->sax->error(ctxt->userData,
4172 "Extra content at the end of the document\n");
4173 ctxt->wellFormed = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004174 ctxt->instate = XML_PARSER_EOF;
4175#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004176 xmlGenericError(xmlGenericErrorContext,
4177 "HPP: entering EOF\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004178#endif
4179 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4180 ctxt->sax->endDocument(ctxt->userData);
4181 goto done;
4182 }
4183 break;
4184 case XML_PARSER_START_TAG: {
4185 xmlChar *name, *oldname;
4186 int depth = ctxt->nameNr;
4187 htmlElemDescPtr info;
4188
4189 if (avail < 2)
4190 goto done;
4191 cur = in->cur[0];
4192 if (cur != '<') {
4193 ctxt->instate = XML_PARSER_CONTENT;
4194#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004195 xmlGenericError(xmlGenericErrorContext,
4196 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004197#endif
4198 break;
4199 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00004200 if ((!terminate) &&
4201 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004202 goto done;
4203
4204 oldname = xmlStrdup(ctxt->name);
4205 htmlParseStartTag(ctxt);
4206 name = ctxt->name;
4207#ifdef DEBUG
4208 if (oldname == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004209 xmlGenericError(xmlGenericErrorContext,
4210 "Start of element %s\n", name);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004211 else if (name == NULL)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004212 xmlGenericError(xmlGenericErrorContext,
4213 "Start of element failed, was %s\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004214 oldname);
4215 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004216 xmlGenericError(xmlGenericErrorContext,
4217 "Start of element %s, was %s\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004218 name, oldname);
4219#endif
4220 if (((depth == ctxt->nameNr) &&
Daniel Veillard8b5dd832000-10-01 20:28:44 +00004221 (xmlStrEqual(oldname, ctxt->name))) ||
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004222 (name == NULL)) {
4223 if (CUR == '>')
4224 NEXT;
4225 if (oldname != NULL)
4226 xmlFree(oldname);
4227 break;
4228 }
4229 if (oldname != NULL)
4230 xmlFree(oldname);
4231
4232 /*
4233 * Lookup the info for that element.
4234 */
4235 info = htmlTagLookup(name);
4236 if (info == NULL) {
4237 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4238 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4239 name);
4240 ctxt->wellFormed = 0;
4241 } else if (info->depr) {
4242 /***************************
4243 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4244 ctxt->sax->warning(ctxt->userData,
4245 "Tag %s is deprecated\n",
4246 name);
4247 ***************************/
4248 }
4249
4250 /*
4251 * Check for an Empty Element labelled the XML/SGML way
4252 */
4253 if ((CUR == '/') && (NXT(1) == '>')) {
4254 SKIP(2);
4255 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4256 ctxt->sax->endElement(ctxt->userData, name);
4257 oldname = htmlnamePop(ctxt);
4258#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004259 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004260 oldname);
4261#endif
4262 if (oldname != NULL)
4263 xmlFree(oldname);
4264 ctxt->instate = XML_PARSER_CONTENT;
4265#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004266 xmlGenericError(xmlGenericErrorContext,
4267 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004268#endif
4269 break;
4270 }
4271
4272 if (CUR == '>') {
4273 NEXT;
4274 } else {
4275 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4276 ctxt->sax->error(ctxt->userData,
4277 "Couldn't find end of Start Tag %s\n",
4278 name);
4279 ctxt->wellFormed = 0;
4280
4281 /*
4282 * end of parsing of this node.
4283 */
Daniel Veillard8b5dd832000-10-01 20:28:44 +00004284 if (xmlStrEqual(name, ctxt->name)) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004285 nodePop(ctxt);
4286 oldname = htmlnamePop(ctxt);
4287#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004288 xmlGenericError(xmlGenericErrorContext,
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004289 "End of start tag problem: popping out %s\n", oldname);
4290#endif
4291 if (oldname != NULL)
4292 xmlFree(oldname);
4293 }
4294
4295 ctxt->instate = XML_PARSER_CONTENT;
4296#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004297 xmlGenericError(xmlGenericErrorContext,
4298 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004299#endif
4300 break;
4301 }
4302
4303 /*
4304 * Check for an Empty Element from DTD definition
4305 */
4306 if ((info != NULL) && (info->empty)) {
4307 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4308 ctxt->sax->endElement(ctxt->userData, name);
4309 oldname = htmlnamePop(ctxt);
4310#ifdef DEBUG
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004311 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004312#endif
4313 if (oldname != NULL)
4314 xmlFree(oldname);
4315 }
4316 ctxt->instate = XML_PARSER_CONTENT;
4317#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004318 xmlGenericError(xmlGenericErrorContext,
4319 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004320#endif
4321 break;
4322 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004323 case XML_PARSER_CONTENT: {
4324 long cons;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004325 /*
4326 * Handle preparsed entities and charRef
4327 */
4328 if (ctxt->token != 0) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00004329 xmlChar chr[2] = { 0 , 0 } ;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004330
Daniel Veillard365e13b2000-07-02 07:56:37 +00004331 chr[0] = (xmlChar) ctxt->token;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004332 htmlCheckParagraph(ctxt);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004333 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard365e13b2000-07-02 07:56:37 +00004334 ctxt->sax->characters(ctxt->userData, chr, 1);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004335 ctxt->token = 0;
4336 ctxt->checkIndex = 0;
4337 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004338 if ((avail == 1) && (terminate)) {
4339 cur = in->cur[0];
4340 if ((cur != '<') && (cur != '&')) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004341 if (ctxt->sax != NULL) {
4342 if (IS_BLANK(cur)) {
4343 if (ctxt->sax->ignorableWhitespace != NULL)
4344 ctxt->sax->ignorableWhitespace(
4345 ctxt->userData, &cur, 1);
4346 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004347 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004348 if (ctxt->sax->characters != NULL)
4349 ctxt->sax->characters(
4350 ctxt->userData, &cur, 1);
4351 }
4352 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004353 ctxt->token = 0;
4354 ctxt->checkIndex = 0;
4355 NEXT;
4356 }
4357 break;
4358 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004359 if (avail < 2)
4360 goto done;
4361 cur = in->cur[0];
4362 next = in->cur[1];
Daniel Veillard87b95392000-08-12 21:12:04 +00004363 cons = ctxt->nbChars;
Daniel Veillard7eda8452000-10-14 23:38:43 +00004364 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4365 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004366 /*
Daniel Veillard7eda8452000-10-14 23:38:43 +00004367 * Handle SCRIPT/STYLE separately
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004368 */
Daniel Veillard7eda8452000-10-14 23:38:43 +00004369 if ((!terminate) &&
4370 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4371 goto done;
4372 htmlParseScript(ctxt);
4373 if ((cur == '<') && (next == '/')) {
4374 ctxt->instate = XML_PARSER_END_TAG;
4375 ctxt->checkIndex = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004376#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004377 xmlGenericError(xmlGenericErrorContext,
4378 "HPP: entering END_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004379#endif
Daniel Veillard7eda8452000-10-14 23:38:43 +00004380 break;
4381 }
4382 } else {
4383 /*
4384 * Sometimes DOCTYPE arrives in the middle of the document
4385 */
4386 if ((cur == '<') && (next == '!') &&
4387 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4388 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4389 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4390 (UPP(8) == 'E')) {
4391 if ((!terminate) &&
4392 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4393 goto done;
4394 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4395 ctxt->sax->error(ctxt->userData,
4396 "Misplaced DOCTYPE declaration\n");
4397 ctxt->wellFormed = 0;
4398 htmlParseDocTypeDecl(ctxt);
4399 } else if ((cur == '<') && (next == '!') &&
4400 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4401 if ((!terminate) &&
4402 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4403 goto done;
4404#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004405 xmlGenericError(xmlGenericErrorContext,
4406 "HPP: Parsing Comment\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004407#endif
4408 htmlParseComment(ctxt);
4409 ctxt->instate = XML_PARSER_CONTENT;
4410 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4411 goto done;
4412 } else if ((cur == '<') && (next == '/')) {
4413 ctxt->instate = XML_PARSER_END_TAG;
4414 ctxt->checkIndex = 0;
4415#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004416 xmlGenericError(xmlGenericErrorContext,
4417 "HPP: entering END_TAG\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004418#endif
4419 break;
4420 } else if (cur == '<') {
4421 ctxt->instate = XML_PARSER_START_TAG;
4422 ctxt->checkIndex = 0;
4423#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004424 xmlGenericError(xmlGenericErrorContext,
4425 "HPP: entering START_TAG\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004426#endif
4427 break;
4428 } else if (cur == '&') {
4429 if ((!terminate) &&
4430 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4431 goto done;
4432#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004433 xmlGenericError(xmlGenericErrorContext,
4434 "HPP: Parsing Reference\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004435#endif
4436 /* TODO: check generation of subtrees if noent !!! */
4437 htmlParseReference(ctxt);
4438 } else {
4439 /* TODO Avoid the extra copy, handle directly !!!!!! */
4440 /*
4441 * Goal of the following test is :
4442 * - minimize calls to the SAX 'character' callback
4443 * when they are mergeable
4444 */
4445 if ((ctxt->inputNr == 1) &&
4446 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4447 if ((!terminate) &&
4448 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4449 goto done;
4450 }
4451 ctxt->checkIndex = 0;
4452#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004453 xmlGenericError(xmlGenericErrorContext,
4454 "HPP: Parsing char data\n");
Daniel Veillard7eda8452000-10-14 23:38:43 +00004455#endif
4456 htmlParseCharData(ctxt, 0);
4457 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004458 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004459 if (cons == ctxt->nbChars) {
4460 if (ctxt->node != NULL) {
4461 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4462 ctxt->sax->error(ctxt->userData,
4463 "detected an error in element content\n");
4464 ctxt->wellFormed = 0;
Daniel Veillard87b95392000-08-12 21:12:04 +00004465 }
Daniel Veillard8ddb5a72000-09-23 10:28:52 +00004466 NEXT;
Daniel Veillard87b95392000-08-12 21:12:04 +00004467 break;
4468 }
4469
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004470 break;
Daniel Veillard87b95392000-08-12 21:12:04 +00004471 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004472 case XML_PARSER_END_TAG:
4473 if (avail < 2)
4474 goto done;
Daniel Veillard71b656e2000-01-05 14:46:17 +00004475 if ((!terminate) &&
4476 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004477 goto done;
4478 htmlParseEndTag(ctxt);
4479 if (ctxt->nameNr == 0) {
4480 ctxt->instate = XML_PARSER_EPILOG;
4481 } else {
4482 ctxt->instate = XML_PARSER_CONTENT;
4483 }
4484 ctxt->checkIndex = 0;
4485#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004486 xmlGenericError(xmlGenericErrorContext,
4487 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004488#endif
4489 break;
4490 case XML_PARSER_CDATA_SECTION:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004491 xmlGenericError(xmlGenericErrorContext,
4492 "HPP: internal error, state == CDATA\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004493 ctxt->instate = XML_PARSER_CONTENT;
4494 ctxt->checkIndex = 0;
4495#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004496 xmlGenericError(xmlGenericErrorContext,
4497 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004498#endif
4499 break;
4500 case XML_PARSER_DTD:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004501 xmlGenericError(xmlGenericErrorContext,
4502 "HPP: internal error, state == DTD\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004503 ctxt->instate = XML_PARSER_CONTENT;
4504 ctxt->checkIndex = 0;
4505#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004506 xmlGenericError(xmlGenericErrorContext,
4507 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004508#endif
4509 break;
4510 case XML_PARSER_COMMENT:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004511 xmlGenericError(xmlGenericErrorContext,
4512 "HPP: internal error, state == COMMENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004513 ctxt->instate = XML_PARSER_CONTENT;
4514 ctxt->checkIndex = 0;
4515#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004516 xmlGenericError(xmlGenericErrorContext,
4517 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004518#endif
4519 break;
4520 case XML_PARSER_PI:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004521 xmlGenericError(xmlGenericErrorContext,
4522 "HPP: internal error, state == PI\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004523 ctxt->instate = XML_PARSER_CONTENT;
4524 ctxt->checkIndex = 0;
4525#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004528#endif
4529 break;
4530 case XML_PARSER_ENTITY_DECL:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004531 xmlGenericError(xmlGenericErrorContext,
4532 "HPP: internal error, state == ENTITY_DECL\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004533 ctxt->instate = XML_PARSER_CONTENT;
4534 ctxt->checkIndex = 0;
4535#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004536 xmlGenericError(xmlGenericErrorContext,
4537 "HPP: entering CONTENT\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004538#endif
4539 break;
4540 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: internal error, state == ENTITY_VALUE\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004543 ctxt->instate = XML_PARSER_CONTENT;
4544 ctxt->checkIndex = 0;
4545#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004546 xmlGenericError(xmlGenericErrorContext,
4547 "HPP: entering DTD\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004548#endif
4549 break;
4550 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004551 xmlGenericError(xmlGenericErrorContext,
4552 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004553 ctxt->instate = XML_PARSER_START_TAG;
4554 ctxt->checkIndex = 0;
4555#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004556 xmlGenericError(xmlGenericErrorContext,
4557 "HPP: entering START_TAG\n");
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004558#endif
4559 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004560 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004561 xmlGenericError(xmlGenericErrorContext,
4562 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004563 ctxt->instate = XML_PARSER_CONTENT;
4564 ctxt->checkIndex = 0;
4565#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004566 xmlGenericError(xmlGenericErrorContext,
4567 "HPP: entering CONTENT\n");
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004568#endif
4569 break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004570 }
4571 }
4572done:
Daniel Veillard365e13b2000-07-02 07:56:37 +00004573 if ((avail == 0) && (terminate)) {
4574 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00004575 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4576 /*
4577 * SAX: end of the document processing.
4578 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00004579 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00004580 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4581 ctxt->sax->endDocument(ctxt->userData);
4582 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004583 }
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004584 if ((ctxt->myDoc != NULL) &&
4585 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4586 (ctxt->instate == XML_PARSER_EPILOG))) {
4587 xmlDtdPtr dtd;
4588 dtd = xmlGetIntSubset(ctxt->myDoc);
4589 if (dtd == NULL)
4590 ctxt->myDoc->intSubset =
4591 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4592 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4593 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4594 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004595#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004596 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004597#endif
4598 return(ret);
4599}
4600
4601/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00004602 * htmlParseTry:
4603 * @ctxt: an HTML parser context
4604 *
4605 * Try to progress on parsing
4606 *
4607 * Returns zero if no parsing was possible
4608 */
4609int
4610htmlParseTry(htmlParserCtxtPtr ctxt) {
4611 return(htmlParseTryOrFinish(ctxt, 0));
4612}
4613
4614/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004615 * htmlParseChunk:
4616 * @ctxt: an XML parser context
4617 * @chunk: an char array
4618 * @size: the size in byte of the chunk
4619 * @terminate: last chunk indicator
4620 *
4621 * Parse a Chunk of memory
4622 *
4623 * Returns zero if no error, the xmlParserErrors otherwise.
4624 */
4625int
4626htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4627 int terminate) {
4628 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4629 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4630 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4631 int cur = ctxt->input->cur - ctxt->input->base;
4632
4633 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4634 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4635 ctxt->input->cur = ctxt->input->base + cur;
4636#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004637 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004638#endif
4639
Daniel Veillardd0f7f742000-02-02 17:42:48 +00004640 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4641 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004642 } else if (ctxt->instate != XML_PARSER_EOF) {
4643 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
Daniel Veillard71b656e2000-01-05 14:46:17 +00004644 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004645 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004646 if (terminate) {
4647 if ((ctxt->instate != XML_PARSER_EOF) &&
4648 (ctxt->instate != XML_PARSER_EPILOG) &&
4649 (ctxt->instate != XML_PARSER_MISC)) {
Daniel Veillarda2c6da92000-09-16 18:15:00 +00004650 ctxt->errNo = XML_ERR_DOCUMENT_END;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004651 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4652 ctxt->sax->error(ctxt->userData,
4653 "Extra content at the end of the document\n");
4654 ctxt->wellFormed = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004655 }
4656 if (ctxt->instate != XML_PARSER_EOF) {
4657 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4658 ctxt->sax->endDocument(ctxt->userData);
4659 }
4660 ctxt->instate = XML_PARSER_EOF;
4661 }
4662 return((xmlParserErrors) ctxt->errNo);
4663}
4664
4665/************************************************************************
4666 * *
4667 * User entry points *
4668 * *
4669 ************************************************************************/
4670
4671/**
4672 * htmlCreatePushParserCtxt :
4673 * @sax: a SAX handler
4674 * @user_data: The user data returned on SAX callbacks
4675 * @chunk: a pointer to an array of chars
4676 * @size: number of chars in the array
4677 * @filename: an optional file name or URI
4678 * @enc: an optional encoding
4679 *
4680 * Create a parser context for using the HTML parser in push mode
4681 * To allow content encoding detection, @size should be >= 4
4682 * The value of @filename is used for fetching external entities
4683 * and error/warning reports.
4684 *
4685 * Returns the new parser context or NULL
4686 */
4687htmlParserCtxtPtr
4688htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4689 const char *chunk, int size, const char *filename,
4690 xmlCharEncoding enc) {
4691 htmlParserCtxtPtr ctxt;
4692 htmlParserInputPtr inputStream;
4693 xmlParserInputBufferPtr buf;
4694
4695 buf = xmlAllocParserInputBuffer(enc);
4696 if (buf == NULL) return(NULL);
4697
4698 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4699 if (ctxt == NULL) {
4700 xmlFree(buf);
4701 return(NULL);
4702 }
4703 memset(ctxt, 0, sizeof(htmlParserCtxt));
4704 htmlInitParserCtxt(ctxt);
4705 if (sax != NULL) {
4706 if (ctxt->sax != &htmlDefaultSAXHandler)
4707 xmlFree(ctxt->sax);
4708 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4709 if (ctxt->sax == NULL) {
4710 xmlFree(buf);
4711 xmlFree(ctxt);
4712 return(NULL);
4713 }
4714 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4715 if (user_data != NULL)
4716 ctxt->userData = user_data;
4717 }
4718 if (filename == NULL) {
4719 ctxt->directory = NULL;
4720 } else {
4721 ctxt->directory = xmlParserGetDirectory(filename);
4722 }
4723
4724 inputStream = htmlNewInputStream(ctxt);
4725 if (inputStream == NULL) {
4726 xmlFreeParserCtxt(ctxt);
4727 return(NULL);
4728 }
4729
4730 if (filename == NULL)
4731 inputStream->filename = NULL;
4732 else
4733 inputStream->filename = xmlMemStrdup(filename);
4734 inputStream->buf = buf;
4735 inputStream->base = inputStream->buf->buffer->content;
4736 inputStream->cur = inputStream->buf->buffer->content;
4737
4738 inputPush(ctxt, inputStream);
4739
4740 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4741 (ctxt->input->buf != NULL)) {
4742 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4743#ifdef DEBUG_PUSH
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00004744 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004745#endif
4746 }
4747
4748 return(ctxt);
4749}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004750
4751/**
4752 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004753 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004754 * @encoding: a free form C string describing the HTML document encoding, or NULL
4755 * @sax: the SAX handler block
4756 * @userData: if using SAX, this pointer will be provided on callbacks.
4757 *
4758 * parse an HTML in-memory document and build a tree.
4759 * It use the given SAX function block to handle the parsing callback.
4760 * If sax is NULL, fallback to the default DOM tree building routines.
4761 *
4762 * Returns the resulting document tree
4763 */
4764
4765htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004766htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004767 htmlDocPtr ret;
4768 htmlParserCtxtPtr ctxt;
4769
4770 if (cur == NULL) return(NULL);
4771
4772
4773 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4774 if (ctxt == NULL) return(NULL);
4775 if (sax != NULL) {
4776 ctxt->sax = sax;
4777 ctxt->userData = userData;
4778 }
4779
4780 htmlParseDocument(ctxt);
4781 ret = ctxt->myDoc;
4782 if (sax != NULL) {
4783 ctxt->sax = NULL;
4784 ctxt->userData = NULL;
4785 }
4786 htmlFreeParserCtxt(ctxt);
4787
4788 return(ret);
4789}
4790
4791/**
4792 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004793 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004794 * @encoding: a free form C string describing the HTML document encoding, or NULL
4795 *
4796 * parse an HTML in-memory document and build a tree.
4797 *
4798 * Returns the resulting document tree
4799 */
4800
4801htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004802htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004803 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4804}
4805
4806
4807/**
4808 * htmlCreateFileParserCtxt :
4809 * @filename: the filename
4810 * @encoding: a free form C string describing the HTML document encoding, or NULL
4811 *
4812 * Create a parser context for a file content.
4813 * Automatic support for ZLIB/Compress compressed document is provided
4814 * by default if found at compile-time.
4815 *
4816 * Returns the new parser context or NULL
4817 */
4818htmlParserCtxtPtr
4819htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4820{
4821 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004822 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004823 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004824 /* htmlCharEncoding enc; */
4825
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004826 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4827 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004828
Daniel Veillard6454aec1999-09-02 22:04:43 +00004829 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004830 if (ctxt == NULL) {
4831 perror("malloc");
4832 return(NULL);
4833 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004834 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004835 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00004836 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004837 if (inputStream == NULL) {
4838 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00004839 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004840 return(NULL);
4841 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004842 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004843
Daniel Veillard6454aec1999-09-02 22:04:43 +00004844 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004845 inputStream->line = 1;
4846 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004847 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00004848 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004849
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004850 inputStream->base = inputStream->buf->buffer->content;
4851 inputStream->cur = inputStream->buf->buffer->content;
4852 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004853
4854 inputPush(ctxt, inputStream);
4855 return(ctxt);
4856}
4857
4858/**
4859 * htmlSAXParseFile :
4860 * @filename: the filename
4861 * @encoding: a free form C string describing the HTML document encoding, or NULL
4862 * @sax: the SAX handler block
4863 * @userData: if using SAX, this pointer will be provided on callbacks.
4864 *
4865 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4866 * compressed document is provided by default if found at compile-time.
4867 * It use the given SAX function block to handle the parsing callback.
4868 * If sax is NULL, fallback to the default DOM tree building routines.
4869 *
4870 * Returns the resulting document tree
4871 */
4872
4873htmlDocPtr
4874htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4875 void *userData) {
4876 htmlDocPtr ret;
4877 htmlParserCtxtPtr ctxt;
Daniel Veillard87b95392000-08-12 21:12:04 +00004878 htmlSAXHandlerPtr oldsax = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004879
4880 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4881 if (ctxt == NULL) return(NULL);
4882 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004883 oldsax = ctxt->sax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004884 ctxt->sax = sax;
4885 ctxt->userData = userData;
4886 }
4887
4888 htmlParseDocument(ctxt);
4889
4890 ret = ctxt->myDoc;
4891 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004892 ctxt->sax = oldsax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004893 ctxt->userData = NULL;
4894 }
4895 htmlFreeParserCtxt(ctxt);
4896
4897 return(ret);
4898}
4899
4900/**
4901 * htmlParseFile :
4902 * @filename: the filename
4903 * @encoding: a free form C string describing the HTML document encoding, or NULL
4904 *
4905 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4906 * compressed document is provided by default if found at compile-time.
4907 *
4908 * Returns the resulting document tree
4909 */
4910
4911htmlDocPtr
4912htmlParseFile(const char *filename, const char *encoding) {
4913 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4914}
Daniel Veillard361d8452000-04-03 19:48:13 +00004915
4916#endif /* LIBXML_HTML_ENABLED */