blob: e0f762064acb7a239a99b12ced17dd53cf5eb17d [file] [log] [blame]
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
46#define HTML_MAX_NAMELEN 1000
47#define HTML_PARSER_BIG_BUFFER_SIZE 1000
48#define HTML_PARSER_BUFFER_SIZE 100
49
50/* #define DEBUG */
51/* #define DEBUG_PUSH */
52
Daniel Veillard22090732001-07-16 00:06:07 +000053static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000054
Daniel Veillard56a4cb82001-03-24 17:00:36 +000055xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000057static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
59/************************************************************************
60 * *
Owen Taylor3473f882001-02-23 17:55:21 +000061 * Parser stacks related functions and macros *
62 * *
63 ************************************************************************/
64
Daniel Veillard1c732d22002-11-30 11:22:59 +000065/**
66 * htmlnamePush:
67 * @ctxt: an HTML parser context
68 * @value: the element name
69 *
70 * Pushes a new element name on top of the name stack
71 *
72 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000073 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000074static int
75htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
76{
77 if (ctxt->nameNr >= ctxt->nameMax) {
78 ctxt->nameMax *= 2;
79 ctxt->nameTab =
80 (xmlChar * *)xmlRealloc(ctxt->nameTab,
81 ctxt->nameMax *
82 sizeof(ctxt->nameTab[0]));
83 if (ctxt->nameTab == NULL) {
84 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
85 return (0);
86 }
87 }
88 ctxt->nameTab[ctxt->nameNr] = value;
89 ctxt->name = value;
90 return (ctxt->nameNr++);
91}
92/**
93 * htmlnamePop:
94 * @ctxt: an HTML parser context
95 *
96 * Pops the top element name from the name stack
97 *
98 * Returns the name just removed
99 */
100static xmlChar *
101htmlnamePop(htmlParserCtxtPtr ctxt)
102{
103 xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000104
Daniel Veillard1c732d22002-11-30 11:22:59 +0000105 if (ctxt->nameNr <= 0)
106 return (0);
107 ctxt->nameNr--;
108 if (ctxt->nameNr < 0)
109 return (0);
110 if (ctxt->nameNr > 0)
111 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
112 else
113 ctxt->name = NULL;
114 ret = ctxt->nameTab[ctxt->nameNr];
115 ctxt->nameTab[ctxt->nameNr] = 0;
116 return (ret);
117}
Owen Taylor3473f882001-02-23 17:55:21 +0000118
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  All of them expect a variable named "ctxt" (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they look at raw xmlChar values and should only be used to compare
 * against or skip over ASCII content:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value, as an int.
 *   CURRENT  same value as CUR.
 *   RAW      -1 when ctxt->token is set, the current xmlChar otherwise.
 *   UPPER    the current xmlChar converted to uppercase.
 *   NXT(n)   the n'th next xmlChar.
 *   UPP(n)   the n'th next xmlChar converted to uppercase.
 *   SKIP(n)  advance the input pointer and ctxt->nbChars by n.
 *
 * Input management and character level helpers:
 *
 *   SHRINK / GROW   call xmlParserInputShrink() / xmlParserInputGrow()
 *                   on the current input.
 *   SKIP_BLANKS     calls htmlSkipBlankChars().
 *   NEXT            calls xmlNextChar() and increments ctxt->nbChars.
 *   NEXTL(l)        advance by l bytes, counted as one char, keeping the
 *                   line/column of the input up to date and clearing
 *                   ctxt->token.
 *   CUR_CHAR(l)     htmlCurrentChar(), storing the char length in l.
 *   CUR_SCHAR(s, l) xmlStringCurrentChar() on string s.
 *   COPY_BUF(l,b,i,v) append char v of length l to buffer b at index i,
 *                   advancing i.
 *
 * The statement-like macros are wrapped so that they behave as a single
 * expression or statement (safe in unbraced if/else).
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
195
196/**
197 * htmlCurrentChar:
198 * @ctxt: the HTML parser context
199 * @len: pointer to the length of the char read
200 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000201 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000202 * bytes in the input buffer. Implement the end of line normalization:
203 * 2.11 End-of-Line Handling
204 * If the encoding is unspecified, in the case we find an ISO-Latin-1
205 * char, then the encoding converter is plugged in automatically.
206 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000207 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000208 */
209
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000210static int
Owen Taylor3473f882001-02-23 17:55:21 +0000211htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
212 if (ctxt->instate == XML_PARSER_EOF)
213 return(0);
214
215 if (ctxt->token != 0) {
216 *len = 0;
217 return(ctxt->token);
218 }
219 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
220 /*
221 * We are supposed to handle UTF8, check it's valid
222 * From rfc2044: encoding of the Unicode values on UTF-8:
223 *
224 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
225 * 0000 0000-0000 007F 0xxxxxxx
226 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
227 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
228 *
229 * Check for the 0x110000 limit too
230 */
231 const unsigned char *cur = ctxt->input->cur;
232 unsigned char c;
233 unsigned int val;
234
235 c = *cur;
236 if (c & 0x80) {
237 if (cur[1] == 0)
238 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
239 if ((cur[1] & 0xc0) != 0x80)
240 goto encoding_error;
241 if ((c & 0xe0) == 0xe0) {
242
243 if (cur[2] == 0)
244 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
245 if ((cur[2] & 0xc0) != 0x80)
246 goto encoding_error;
247 if ((c & 0xf0) == 0xf0) {
248 if (cur[3] == 0)
249 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
250 if (((c & 0xf8) != 0xf0) ||
251 ((cur[3] & 0xc0) != 0x80))
252 goto encoding_error;
253 /* 4-byte code */
254 *len = 4;
255 val = (cur[0] & 0x7) << 18;
256 val |= (cur[1] & 0x3f) << 12;
257 val |= (cur[2] & 0x3f) << 6;
258 val |= cur[3] & 0x3f;
259 } else {
260 /* 3-byte code */
261 *len = 3;
262 val = (cur[0] & 0xf) << 12;
263 val |= (cur[1] & 0x3f) << 6;
264 val |= cur[2] & 0x3f;
265 }
266 } else {
267 /* 2-byte code */
268 *len = 2;
269 val = (cur[0] & 0x1f) << 6;
270 val |= cur[1] & 0x3f;
271 }
272 if (!IS_CHAR(val)) {
273 ctxt->errNo = XML_ERR_INVALID_ENCODING;
274 if ((ctxt->sax != NULL) &&
275 (ctxt->sax->error != NULL))
276 ctxt->sax->error(ctxt->userData,
277 "Char 0x%X out of allowed range\n", val);
278 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000279 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000280 }
281 return(val);
282 } else {
283 /* 1-byte code */
284 *len = 1;
285 return((int) *ctxt->input->cur);
286 }
287 }
288 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000289 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000290 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000291 * XML constructs only use < 128 chars
292 */
293 *len = 1;
294 if ((int) *ctxt->input->cur < 0x80)
295 return((int) *ctxt->input->cur);
296
297 /*
298 * Humm this is bad, do an automatic flow conversion
299 */
300 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
301 ctxt->charset = XML_CHAR_ENCODING_UTF8;
302 return(xmlCurrentChar(ctxt, len));
303
304encoding_error:
305 /*
306 * If we detect an UTF8 error that probably mean that the
307 * input encoding didn't get properly advertized in the
308 * declaration header. Report the error and switch the encoding
309 * to ISO-Latin-1 (if you don't like this policy, just declare the
310 * encoding !)
311 */
312 ctxt->errNo = XML_ERR_INVALID_ENCODING;
313 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
314 ctxt->sax->error(ctxt->userData,
315 "Input is not proper UTF-8, indicate encoding !\n");
316 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
317 ctxt->input->cur[0], ctxt->input->cur[1],
318 ctxt->input->cur[2], ctxt->input->cur[3]);
319 }
320
321 ctxt->charset = XML_CHAR_ENCODING_8859_1;
322 *len = 1;
323 return((int) *ctxt->input->cur);
324}
325
326/**
Owen Taylor3473f882001-02-23 17:55:21 +0000327 * htmlSkipBlankChars:
328 * @ctxt: the HTML parser context
329 *
330 * skip all blanks character found at that point in the input streams.
331 *
332 * Returns the number of space chars skipped
333 */
334
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000335static int
Owen Taylor3473f882001-02-23 17:55:21 +0000336htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
337 int res = 0;
338
339 while (IS_BLANK(*(ctxt->input->cur))) {
340 if ((*ctxt->input->cur == 0) &&
341 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
342 xmlPopInput(ctxt);
343 } else {
344 if (*(ctxt->input->cur) == '\n') {
345 ctxt->input->line++; ctxt->input->col = 1;
346 } else ctxt->input->col++;
347 ctxt->input->cur++;
348 ctxt->nbChars++;
349 if (*ctxt->input->cur == 0)
350 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
351 }
352 res++;
353 }
354 return(res);
355}
356
357
358
359/************************************************************************
360 * *
361 * The list of HTML elements and their properties *
362 * *
363 ************************************************************************/
364
365/*
366 * Start Tag: 1 means the start tag can be ommited
367 * End Tag: 1 means the end tag can be ommited
368 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000369 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000370 * Depr: this element is deprecated
371 * DTD: 1 means that this element is valid only in the Loose DTD
372 * 2 means that this element is valid only in the Frameset DTD
373 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000374 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000375 */
Daniel Veillard22090732001-07-16 00:06:07 +0000376static const htmlElemDesc
377html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000378{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
379{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
380{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
381{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
382{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
383{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
384{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
385{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
386{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
387{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
388{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
389{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
390{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
391{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
392{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
393{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
394{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
395{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
396{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
397{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
398{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
399{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
400{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
401{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
402{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
403{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
404{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
405{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
406{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
407{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
408{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
409{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
410{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
411{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
412{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
413{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
414{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
415{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
416{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
417{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
418{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
419{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
420{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
421{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
422{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
423{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
424{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
425{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
426{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
427{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
428{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
429{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
430{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
431{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
432{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
433{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
434{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
435{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
436{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
437{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
438{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
439{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
440{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
Daniel Veillardfee408f2002-11-22 13:18:30 +0000441{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph " },
Daniel Veillard02bb1702001-06-13 21:11:59 +0000442{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
443{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
444{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
445{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
446{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
447{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
448{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
449{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
450{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
451{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
452{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
453{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
454{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
455{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
456{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
457{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
458{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
459{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
460{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
461{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
462{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
463{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
464{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
465{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
466{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
467{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
468{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000469};
470
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups: the first string of a
 * group is the tag being opened, the following strings are the open
 * elements that this start tag implicitly closes.  An additional NULL
 * marks the end of the whole list.
 */
static const char *htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};
530
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html",
    "head",
    "body",
    NULL
};
544
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", which never matched */
    "onchange",
    "onselect"
};
570
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The table is terminated by an entry with a NULL name, whose priority
 * is the one used for every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};

/*
 * Index of the start of each group of htmlStartClose, built once by
 * htmlInitAutoClose(); unused slots are NULL.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
601
602/************************************************************************
603 * *
604 * functions to handle HTML specific data *
605 * *
606 ************************************************************************/
607
608/**
609 * htmlInitAutoClose:
610 *
611 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
612 * This is not reentrant. Call xmlInitParser() once before processing in
613 * case of use in multithreaded programs.
614 */
615void
616htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000617 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000618
619 if (htmlStartCloseIndexinitialized) return;
620
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000621 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
622 indx = 0;
623 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
624 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000625 while (htmlStartClose[i] != NULL) i++;
626 i++;
627 }
628 htmlStartCloseIndexinitialized = 1;
629}
630
631/**
632 * htmlTagLookup:
633 * @tag: The tag name in lowercase
634 *
635 * Lookup the HTML tag in the ElementTable
636 *
637 * Returns the related htmlElemDescPtr or NULL if not found.
638 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000639const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000640htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000641 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000642
643 for (i = 0; i < (sizeof(html40ElementTable) /
644 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000645 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000646 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000647 }
648 return(NULL);
649}
650
651/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000652 * htmlGetEndPriority:
653 * @name: The name of the element to look up the priority for.
654 *
655 * Return value: The "endtag" priority.
656 **/
657static int
658htmlGetEndPriority (const xmlChar *name) {
659 int i = 0;
660
661 while ((htmlEndPriority[i].name != NULL) &&
662 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
663 i++;
664
665 return(htmlEndPriority[i].priority);
666}
667
668/**
Owen Taylor3473f882001-02-23 17:55:21 +0000669 * htmlCheckAutoClose:
670 * @newtag: The new tag name
671 * @oldtag: The old tag name
672 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000673 * Checks whether the new tag is one of the registered valid tags for
674 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000675 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
676 *
677 * Returns 0 if no, 1 if yes.
678 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000679static int
Owen Taylor3473f882001-02-23 17:55:21 +0000680htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000681 int i, indx;
682 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000683
684 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
685
686 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000687 for (indx = 0; indx < 100;indx++) {
688 closed = htmlStartCloseIndex[indx];
689 if (closed == NULL) return(0);
690 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000691 }
692
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000693 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000694 i++;
695 while (htmlStartClose[i] != NULL) {
696 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
697 return(1);
698 }
699 i++;
700 }
701 return(0);
702}
703
704/**
705 * htmlAutoCloseOnClose:
706 * @ctxt: an HTML parser context
707 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000708 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000709 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000710 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000711 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000712static void
Owen Taylor3473f882001-02-23 17:55:21 +0000713htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000714 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000715 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000716 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000717
718#ifdef DEBUG
719 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
720 for (i = 0;i < ctxt->nameNr;i++)
721 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
722#endif
723
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000724 priority = htmlGetEndPriority (newtag);
725
Owen Taylor3473f882001-02-23 17:55:21 +0000726 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000727
Owen Taylor3473f882001-02-23 17:55:21 +0000728 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000729 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000730 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000731 * or equal priority, so if we find an element with higher
732 * priority before we find an element with
733 * matching name, we just ignore this endtag
734 */
735 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000736 }
737 if (i < 0) return;
738
739 while (!xmlStrEqual(newtag, ctxt->name)) {
740 info = htmlTagLookup(ctxt->name);
741 if ((info == NULL) || (info->endTag == 1)) {
742#ifdef DEBUG
743 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
744#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000745 } else if (info->endTag == 3) {
746#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000747 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000748
Daniel Veillard56098d42001-04-24 12:51:09 +0000749#endif
750 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
751 ctxt->sax->error(ctxt->userData,
752 "Opening and ending tag mismatch: %s and %s\n",
753 newtag, ctxt->name);
754 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000755 }
756 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
757 ctxt->sax->endElement(ctxt->userData, ctxt->name);
758 oldname = htmlnamePop(ctxt);
759 if (oldname != NULL) {
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
762#endif
763 xmlFree(oldname);
764 }
765 }
766}
767
768/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000769 * htmlAutoCloseOnEnd:
770 * @ctxt: an HTML parser context
771 *
772 * Close all remaining tags at the end of the stream
773 */
774static void
775htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
776 xmlChar *oldname;
777 int i;
778
779 if (ctxt->nameNr == 0)
780 return;
781#ifdef DEBUG
782 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
783#endif
784
785 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
786#ifdef DEBUG
787 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
788#endif
789 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
790 ctxt->sax->endElement(ctxt->userData, ctxt->name);
791 oldname = htmlnamePop(ctxt);
792 if (oldname != NULL) {
793#ifdef DEBUG
794 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
795#endif
796 xmlFree(oldname);
797 }
798 }
799}
800
801/**
Owen Taylor3473f882001-02-23 17:55:21 +0000802 * htmlAutoClose:
803 * @ctxt: an HTML parser context
804 * @newtag: The new tag name or NULL
805 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000806 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000807 * The list is kept in htmlStartClose array. This function is
808 * called when a new tag has been detected and generates the
809 * appropriates closes if possible/needed.
810 * If newtag is NULL this mean we are at the end of the resource
811 * and we should check
812 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000813static void
Owen Taylor3473f882001-02-23 17:55:21 +0000814htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
815 xmlChar *oldname;
816 while ((newtag != NULL) && (ctxt->name != NULL) &&
817 (htmlCheckAutoClose(newtag, ctxt->name))) {
818#ifdef DEBUG
819 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
820#endif
821 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
822 ctxt->sax->endElement(ctxt->userData, ctxt->name);
823 oldname = htmlnamePop(ctxt);
824 if (oldname != NULL) {
825#ifdef DEBUG
826 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
827#endif
828 xmlFree(oldname);
829 }
830 }
831 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000832 htmlAutoCloseOnEnd(ctxt);
833 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000834 }
835 while ((newtag == NULL) && (ctxt->name != NULL) &&
836 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
837 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
838 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
839#ifdef DEBUG
840 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
841#endif
842 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
843 ctxt->sax->endElement(ctxt->userData, ctxt->name);
844 oldname = htmlnamePop(ctxt);
845 if (oldname != NULL) {
846#ifdef DEBUG
847 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
848#endif
849 xmlFree(oldname);
850 }
851 }
852
853}
854
855/**
856 * htmlAutoCloseTag:
857 * @doc: the HTML document
858 * @name: The tag name
859 * @elem: the HTML element
860 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000861 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000862 * The list is kept in htmlStartClose array. This function checks
863 * if the element or one of it's children would autoclose the
864 * given tag.
865 *
866 * Returns 1 if autoclose, 0 otherwise
867 */
868int
869htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
870 htmlNodePtr child;
871
872 if (elem == NULL) return(1);
873 if (xmlStrEqual(name, elem->name)) return(0);
874 if (htmlCheckAutoClose(elem->name, name)) return(1);
875 child = elem->children;
876 while (child != NULL) {
877 if (htmlAutoCloseTag(doc, name, child)) return(1);
878 child = child->next;
879 }
880 return(0);
881}
882
883/**
884 * htmlIsAutoClosed:
885 * @doc: the HTML document
886 * @elem: the HTML element
887 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000888 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000889 * The list is kept in htmlStartClose array. This function checks
890 * if a tag is autoclosed by one of it's child
891 *
892 * Returns 1 if autoclosed, 0 otherwise
893 */
894int
895htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
896 htmlNodePtr child;
897
898 if (elem == NULL) return(1);
899 child = elem->children;
900 while (child != NULL) {
901 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
902 child = child->next;
903 }
904 return(0);
905}
906
907/**
908 * htmlCheckImplied:
909 * @ctxt: an HTML parser context
910 * @newtag: The new tag name
911 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000912 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000913 * called when a new tag has been detected and generates the
914 * appropriates implicit tags if missing
915 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000916static void
Owen Taylor3473f882001-02-23 17:55:21 +0000917htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
918 if (!htmlOmittedDefaultValue)
919 return;
920 if (xmlStrEqual(newtag, BAD_CAST"html"))
921 return;
922 if (ctxt->nameNr <= 0) {
923#ifdef DEBUG
924 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
925#endif
926 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
927 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
928 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
929 }
930 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
931 return;
932 if ((ctxt->nameNr <= 1) &&
933 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
934 (xmlStrEqual(newtag, BAD_CAST"style")) ||
935 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
936 (xmlStrEqual(newtag, BAD_CAST"link")) ||
937 (xmlStrEqual(newtag, BAD_CAST"title")) ||
938 (xmlStrEqual(newtag, BAD_CAST"base")))) {
939 /*
940 * dropped OBJECT ... i you put it first BODY will be
941 * assumed !
942 */
943#ifdef DEBUG
944 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
945#endif
946 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
947 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
948 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
949 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
950 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
951 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
952 int i;
953 for (i = 0;i < ctxt->nameNr;i++) {
954 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
955 return;
956 }
957 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
958 return;
959 }
960 }
961
962#ifdef DEBUG
963 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
964#endif
965 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
966 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
967 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
968 }
969}
970
971/**
972 * htmlCheckParagraph
973 * @ctxt: an HTML parser context
974 *
975 * Check whether a p element need to be implied before inserting
976 * characters in the current element.
977 *
978 * Returns 1 if a paragraph has been inserted, 0 if not and -1
979 * in case of error.
980 */
981
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000982static int
Owen Taylor3473f882001-02-23 17:55:21 +0000983htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
984 const xmlChar *tag;
985 int i;
986
987 if (ctxt == NULL)
988 return(-1);
989 tag = ctxt->name;
990 if (tag == NULL) {
991 htmlAutoClose(ctxt, BAD_CAST"p");
992 htmlCheckImplied(ctxt, BAD_CAST"p");
993 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
994 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
995 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
996 return(1);
997 }
998 if (!htmlOmittedDefaultValue)
999 return(0);
1000 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1001 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1002#ifdef DEBUG
1003 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1004#endif
1005 htmlAutoClose(ctxt, BAD_CAST"p");
1006 htmlCheckImplied(ctxt, BAD_CAST"p");
1007 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1008 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1009 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1010 return(1);
1011 }
1012 }
1013 return(0);
1014}
1015
1016/**
1017 * htmlIsScriptAttribute:
1018 * @name: an attribute name
1019 *
1020 * Check if an attribute is of content type Script
1021 *
1022 * Returns 1 is the attribute is a script 0 otherwise
1023 */
1024int
1025htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001026 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001027
1028 if (name == NULL)
1029 return(0);
1030 /*
1031 * all script attributes start with 'on'
1032 */
1033 if ((name[0] != 'o') || (name[1] != 'n'))
1034 return(0);
1035 for (i = 0;
1036 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1037 i++) {
1038 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1039 return(1);
1040 }
1041 return(0);
1042}
1043
1044/************************************************************************
1045 * *
1046 * The list of HTML predefined entities *
1047 * *
1048 ************************************************************************/
1049
1050
Daniel Veillard22090732001-07-16 00:06:07 +00001051static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001052/*
1053 * the 4 absolute ones, plus apostrophe.
1054 */
1055{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1056{ 38, "amp", "ampersand, U+0026 ISOnum" },
1057{ 39, "apos", "single quote" },
1058{ 60, "lt", "less-than sign, U+003C ISOnum" },
1059{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1060
1061/*
1062 * A bunch still in the 128-255 range
1063 * Replacing them depend really on the charset used.
1064 */
1065{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1066{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1067{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1068{ 163, "pound","pound sign, U+00A3 ISOnum" },
1069{ 164, "curren","currency sign, U+00A4 ISOnum" },
1070{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1071{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1072{ 167, "sect", "section sign, U+00A7 ISOnum" },
1073{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1074{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1075{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1076{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1077{ 172, "not", "not sign, U+00AC ISOnum" },
1078{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1079{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1080{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1081{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1082{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1083{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1084{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1085{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1086{ 181, "micro","micro sign, U+00B5 ISOnum" },
1087{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1088{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1089{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1090{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1091{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1092{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1093{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1094{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1095{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1096{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1097{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1098{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1099{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1100{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1101{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1102{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1103{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1104{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1105{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1106{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1107{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1108{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1109{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1110{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1111{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1112{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1113{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1114{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1115{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1116{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1117{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1118{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1119{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1120{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1121{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1122{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1123{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1124{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1125{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1126{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1127{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1128{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1129{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1130{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1131{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1132{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1133{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1134{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1135{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1136{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1137{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1138{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1139{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1140{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1141{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1142{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1143{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1144{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1145{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1146{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1147{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1148{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1149{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1150{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1151{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1152{ 247, "divide","division sign, U+00F7 ISOnum" },
1153{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1154{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1155{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1156{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1157{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1158{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1159{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1160{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1161
1162{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1163{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1164{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1165{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1166{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1167
1168/*
1169 * Anything below should really be kept as entities references
1170 */
1171{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1172
1173{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1174{ 732, "tilde","small tilde, U+02DC ISOdia" },
1175
1176{ 913, "Alpha","greek capital letter alpha, U+0391" },
1177{ 914, "Beta", "greek capital letter beta, U+0392" },
1178{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1179{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1180{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1181{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1182{ 919, "Eta", "greek capital letter eta, U+0397" },
1183{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1184{ 921, "Iota", "greek capital letter iota, U+0399" },
1185{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001186{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001187{ 924, "Mu", "greek capital letter mu, U+039C" },
1188{ 925, "Nu", "greek capital letter nu, U+039D" },
1189{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1190{ 927, "Omicron","greek capital letter omicron, U+039F" },
1191{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1192{ 929, "Rho", "greek capital letter rho, U+03A1" },
1193{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1194{ 932, "Tau", "greek capital letter tau, U+03A4" },
1195{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1196{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1197{ 935, "Chi", "greek capital letter chi, U+03A7" },
1198{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1199{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1200
1201{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1202{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1203{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1204{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1205{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1206{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1207{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1208{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1209{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1210{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1211{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1212{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1213{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1214{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1215{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1216{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1217{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1218{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1219{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1220{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1221{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1222{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1223{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1224{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1225{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1226{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1227{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1228{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1229
1230{ 8194, "ensp", "en space, U+2002 ISOpub" },
1231{ 8195, "emsp", "em space, U+2003 ISOpub" },
1232{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1233{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1234{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1235{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1236{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1237{ 8211, "ndash","en dash, U+2013 ISOpub" },
1238{ 8212, "mdash","em dash, U+2014 ISOpub" },
1239{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1240{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1241{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1242{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1243{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1244{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1245{ 8224, "dagger","dagger, U+2020 ISOpub" },
1246{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1247
1248{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1249{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1250
1251{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1252
1253{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1254{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1255
1256{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1257{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1258
1259{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1260{ 8260, "frasl","fraction slash, U+2044 NEW" },
1261
1262{ 8364, "euro", "euro sign, U+20AC NEW" },
1263
1264{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1265{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1266{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1267{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1268{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1269{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1270{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1271{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1272{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1273{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1274{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1275{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1276{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1277{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1278{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1279{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1280
1281{ 8704, "forall","for all, U+2200 ISOtech" },
1282{ 8706, "part", "partial differential, U+2202 ISOtech" },
1283{ 8707, "exist","there exists, U+2203 ISOtech" },
1284{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1285{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1286{ 8712, "isin", "element of, U+2208 ISOtech" },
1287{ 8713, "notin","not an element of, U+2209 ISOtech" },
1288{ 8715, "ni", "contains as member, U+220B ISOtech" },
1289{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001290{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001291{ 8722, "minus","minus sign, U+2212 ISOtech" },
1292{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1293{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1294{ 8733, "prop", "proportional to, U+221D ISOtech" },
1295{ 8734, "infin","infinity, U+221E ISOtech" },
1296{ 8736, "ang", "angle, U+2220 ISOamso" },
1297{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1298{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1299{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1300{ 8746, "cup", "union = cup, U+222A ISOtech" },
1301{ 8747, "int", "integral, U+222B ISOtech" },
1302{ 8756, "there4","therefore, U+2234 ISOtech" },
1303{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1304{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1305{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1306{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1307{ 8801, "equiv","identical to, U+2261 ISOtech" },
1308{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1309{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1310{ 8834, "sub", "subset of, U+2282 ISOtech" },
1311{ 8835, "sup", "superset of, U+2283 ISOtech" },
1312{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1313{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1314{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1315{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1316{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1317{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1318{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1319{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1320{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1321{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1322{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1323{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1324{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1325{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1326
1327{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1328{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1329{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1330{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1331
1332};
1333
1334/************************************************************************
1335 * *
1336 * Commodity functions to handle entities *
1337 * *
1338 ************************************************************************/
1339
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to hold that many xmlChar.
 * The macro expands inline in its caller: when the reallocation fails it
 * reports the error, frees the original block (which would otherwise be
 * leaked, since the caller gets no chance to clean up) and makes the
 * *calling function* return NULL. It can therefore only be used inside
 * functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
				buffer##_size * sizeof(xmlChar));	\
    if (growBuffer_tmp == NULL) {					\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1351
1352/**
1353 * htmlEntityLookup:
1354 * @name: the entity name
1355 *
1356 * Lookup the given entity in EntitiesTable
1357 *
1358 * TODO: the linear scan is really ugly, an hash table is really needed.
1359 *
1360 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1361 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001362const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001363htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001364 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001365
1366 for (i = 0;i < (sizeof(html40EntitiesTable)/
1367 sizeof(html40EntitiesTable[0]));i++) {
1368 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1369#ifdef DEBUG
1370 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1371#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001372 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001373 }
1374 }
1375 return(NULL);
1376}
1377
1378/**
1379 * htmlEntityValueLookup:
1380 * @value: the entity's unicode value
1381 *
1382 * Lookup the given entity in EntitiesTable
1383 *
1384 * TODO: the linear scan is really ugly, an hash table is really needed.
1385 *
1386 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1387 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001388const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001389htmlEntityValueLookup(unsigned int value) {
1390 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001391#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001392 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001393#endif
1394
1395 for (i = 0;i < (sizeof(html40EntitiesTable)/
1396 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001397 if (html40EntitiesTable[i].value >= value) {
1398 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001399 break;
1400#ifdef DEBUG
1401 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1402#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001403 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001404 }
1405#ifdef DEBUG
1406 if (lv > html40EntitiesTable[i].value) {
1407 xmlGenericError(xmlGenericErrorContext,
1408 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1409 lv, html40EntitiesTable[i].value);
1410 }
1411 lv = html40EntitiesTable[i].value;
1412#endif
1413 }
1414 return(NULL);
1415}
1416
1417/**
1418 * UTF8ToHtml:
1419 * @out: a pointer to an array of bytes to store the result
1420 * @outlen: the length of @out
1421 * @in: a pointer to an array of UTF-8 chars
1422 * @inlen: the length of @in
1423 *
1424 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1425 * plus HTML entities block of chars out.
1426 *
1427 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1428 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001429 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001430 * The value of @outlen after return is the number of octets consumed.
1431 */
1432int
1433UTF8ToHtml(unsigned char* out, int *outlen,
1434 const unsigned char* in, int *inlen) {
1435 const unsigned char* processed = in;
1436 const unsigned char* outend;
1437 const unsigned char* outstart = out;
1438 const unsigned char* instart = in;
1439 const unsigned char* inend;
1440 unsigned int c, d;
1441 int trailing;
1442
1443 if (in == NULL) {
1444 /*
1445 * initialization nothing to do
1446 */
1447 *outlen = 0;
1448 *inlen = 0;
1449 return(0);
1450 }
1451 inend = in + (*inlen);
1452 outend = out + (*outlen);
1453 while (in < inend) {
1454 d = *in++;
1455 if (d < 0x80) { c= d; trailing= 0; }
1456 else if (d < 0xC0) {
1457 /* trailing byte in leading position */
1458 *outlen = out - outstart;
1459 *inlen = processed - instart;
1460 return(-2);
1461 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1462 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1463 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1464 else {
1465 /* no chance for this in Ascii */
1466 *outlen = out - outstart;
1467 *inlen = processed - instart;
1468 return(-2);
1469 }
1470
1471 if (inend - in < trailing) {
1472 break;
1473 }
1474
1475 for ( ; trailing; trailing--) {
1476 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1477 break;
1478 c <<= 6;
1479 c |= d & 0x3F;
1480 }
1481
1482 /* assertion: c is a single UTF-4 value */
1483 if (c < 0x80) {
1484 if (out + 1 >= outend)
1485 break;
1486 *out++ = c;
1487 } else {
1488 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001489 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001490
1491 /*
1492 * Try to lookup a predefined HTML entity for it
1493 */
1494
1495 ent = htmlEntityValueLookup(c);
1496 if (ent == NULL) {
1497 /* no chance for this in Ascii */
1498 *outlen = out - outstart;
1499 *inlen = processed - instart;
1500 return(-2);
1501 }
1502 len = strlen(ent->name);
1503 if (out + 2 + len >= outend)
1504 break;
1505 *out++ = '&';
1506 memcpy(out, ent->name, len);
1507 out += len;
1508 *out++ = ';';
1509 }
1510 processed = in;
1511 }
1512 *outlen = out - outstart;
1513 *inlen = processed - instart;
1514 return(0);
1515}
1516
1517/**
1518 * htmlEncodeEntities:
1519 * @out: a pointer to an array of bytes to store the result
1520 * @outlen: the length of @out
1521 * @in: a pointer to an array of UTF-8 chars
1522 * @inlen: the length of @in
1523 * @quoteChar: the quote character to escape (' or ") or zero.
1524 *
1525 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1526 * plus HTML entities block of chars out.
1527 *
1528 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1529 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001530 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001531 * The value of @outlen after return is the number of octets consumed.
1532 */
1533int
1534htmlEncodeEntities(unsigned char* out, int *outlen,
1535 const unsigned char* in, int *inlen, int quoteChar) {
1536 const unsigned char* processed = in;
1537 const unsigned char* outend = out + (*outlen);
1538 const unsigned char* outstart = out;
1539 const unsigned char* instart = in;
1540 const unsigned char* inend = in + (*inlen);
1541 unsigned int c, d;
1542 int trailing;
1543
1544 while (in < inend) {
1545 d = *in++;
1546 if (d < 0x80) { c= d; trailing= 0; }
1547 else if (d < 0xC0) {
1548 /* trailing byte in leading position */
1549 *outlen = out - outstart;
1550 *inlen = processed - instart;
1551 return(-2);
1552 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1553 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1554 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1555 else {
1556 /* no chance for this in Ascii */
1557 *outlen = out - outstart;
1558 *inlen = processed - instart;
1559 return(-2);
1560 }
1561
1562 if (inend - in < trailing)
1563 break;
1564
1565 while (trailing--) {
1566 if (((d= *in++) & 0xC0) != 0x80) {
1567 *outlen = out - outstart;
1568 *inlen = processed - instart;
1569 return(-2);
1570 }
1571 c <<= 6;
1572 c |= d & 0x3F;
1573 }
1574
1575 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001576 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1577 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001578 if (out >= outend)
1579 break;
1580 *out++ = c;
1581 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001582 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001583 const char *cp;
1584 char nbuf[16];
1585 int len;
1586
1587 /*
1588 * Try to lookup a predefined HTML entity for it
1589 */
1590 ent = htmlEntityValueLookup(c);
1591 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001592 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001593 cp = nbuf;
1594 }
1595 else
1596 cp = ent->name;
1597 len = strlen(cp);
1598 if (out + 2 + len > outend)
1599 break;
1600 *out++ = '&';
1601 memcpy(out, cp, len);
1602 out += len;
1603 *out++ = ';';
1604 }
1605 processed = in;
1606 }
1607 *outlen = out - outstart;
1608 *inlen = processed - instart;
1609 return(0);
1610}
1611
1612/**
1613 * htmlDecodeEntities:
1614 * @ctxt: the parser context
1615 * @len: the len to decode (in bytes !), -1 for no size limit
1616 * @end: an end marker xmlChar, 0 if none
1617 * @end2: an end marker xmlChar, 0 if none
1618 * @end3: an end marker xmlChar, 0 if none
1619 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001620 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001621 *
1622 * DEPRECATED !!!!
1623 *
1624 * Returns A newly allocated string with the substitution done. The caller
1625 * must deallocate it !
1626 */
1627xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001628htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1629 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001630 static int deprecated = 0;
1631 if (!deprecated) {
1632 xmlGenericError(xmlGenericErrorContext,
1633 "htmlDecodeEntities() deprecated function reached\n");
1634 deprecated = 1;
1635 }
1636 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001637}
1638
1639/************************************************************************
1640 * *
1641 * Commodity functions to handle streams *
1642 * *
1643 ************************************************************************/
1644
1645/**
Owen Taylor3473f882001-02-23 17:55:21 +00001646 * htmlNewInputStream:
1647 * @ctxt: an HTML parser context
1648 *
1649 * Create a new input stream structure
1650 * Returns the new input stream or NULL
1651 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001652static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001653htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1654 htmlParserInputPtr input;
1655
1656 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1657 if (input == NULL) {
1658 ctxt->errNo = XML_ERR_NO_MEMORY;
1659 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1660 ctxt->sax->error(ctxt->userData,
1661 "malloc: couldn't allocate a new input stream\n");
1662 return(NULL);
1663 }
1664 memset(input, 0, sizeof(htmlParserInput));
1665 input->filename = NULL;
1666 input->directory = NULL;
1667 input->base = NULL;
1668 input->cur = NULL;
1669 input->buf = NULL;
1670 input->line = 1;
1671 input->col = 1;
1672 input->buf = NULL;
1673 input->free = NULL;
1674 input->version = NULL;
1675 input->consumed = 0;
1676 input->length = 0;
1677 return(input);
1678}
1679
1680
1681/************************************************************************
1682 * *
1683 * Commodity functions, cleanup needed ? *
1684 * *
1685 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is only ever read, so both the strings and the pointer
 * slots are const.
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001700
1701/**
1702 * areBlanks:
1703 * @ctxt: an HTML parser context
1704 * @str: a xmlChar *
1705 * @len: the size of @str
1706 *
1707 * Is this a sequence of blank chars that one can ignore ?
1708 *
1709 * Returns 1 if ignorable 0 otherwise.
1710 */
1711
1712static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001713 unsigned int i;
1714 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001715 xmlNodePtr lastChild;
1716
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001717 for (j = 0;j < len;j++)
1718 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001719
1720 if (CUR == 0) return(1);
1721 if (CUR != '<') return(0);
1722 if (ctxt->name == NULL)
1723 return(1);
1724 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1725 return(1);
1726 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1727 return(1);
1728 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1729 return(1);
1730 if (ctxt->node == NULL) return(0);
1731 lastChild = xmlGetLastChild(ctxt->node);
1732 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001733 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1734 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001735 /* keep ws in constructs like ...<b> </b>...
1736 for all tags "b" allowing PCDATA */
1737 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1738 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1739 return(0);
1740 }
1741 }
Owen Taylor3473f882001-02-23 17:55:21 +00001742 } else if (xmlNodeIsText(lastChild)) {
1743 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001744 } else {
1745 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1746 for all tags "p" allowing PCDATA */
1747 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1748 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1749 return(0);
1750 }
1751 }
Owen Taylor3473f882001-02-23 17:55:21 +00001752 }
1753 return(1);
1754}
1755
1756/**
Owen Taylor3473f882001-02-23 17:55:21 +00001757 * htmlNewDocNoDtD:
1758 * @URI: URI for the dtd, or NULL
1759 * @ExternalID: the external ID of the DTD, or NULL
1760 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001761 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1762 * are NULL
1763 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001764 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001765 */
1766htmlDocPtr
1767htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1768 xmlDocPtr cur;
1769
1770 /*
1771 * Allocate a new document and fill the fields.
1772 */
1773 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1774 if (cur == NULL) {
1775 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001776 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001777 return(NULL);
1778 }
1779 memset(cur, 0, sizeof(xmlDoc));
1780
1781 cur->type = XML_HTML_DOCUMENT_NODE;
1782 cur->version = NULL;
1783 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001784 cur->doc = cur;
1785 cur->name = NULL;
1786 cur->children = NULL;
1787 cur->extSubset = NULL;
1788 cur->oldNs = NULL;
1789 cur->encoding = NULL;
1790 cur->standalone = 1;
1791 cur->compression = 0;
1792 cur->ids = NULL;
1793 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001794 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001795 if ((ExternalID != NULL) ||
1796 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001797 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 return(cur);
1799}
1800
1801/**
1802 * htmlNewDoc:
1803 * @URI: URI for the dtd, or NULL
1804 * @ExternalID: the external ID of the DTD, or NULL
1805 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001806 * Creates a new HTML document
1807 *
Owen Taylor3473f882001-02-23 17:55:21 +00001808 * Returns a new document
1809 */
1810htmlDocPtr
1811htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1812 if ((URI == NULL) && (ExternalID == NULL))
1813 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001814 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1815 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001816
1817 return(htmlNewDocNoDtD(URI, ExternalID));
1818}
1819
1820
1821/************************************************************************
1822 * *
1823 * The parser itself *
1824 * Relates to http://www.w3.org/TR/html40 *
1825 * *
1826 ************************************************************************/
1827
1828/************************************************************************
1829 * *
1830 * The parser itself *
1831 * *
1832 ************************************************************************/
1833
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001834static xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
1835
Owen Taylor3473f882001-02-23 17:55:21 +00001836/**
1837 * htmlParseHTMLName:
1838 * @ctxt: an HTML parser context
1839 *
1840 * parse an HTML tag or attribute name, note that we convert it to lowercase
1841 * since HTML names are not case-sensitive.
1842 *
1843 * Returns the Tag Name parsed or NULL
1844 */
1845
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001846static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001847htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1848 xmlChar *ret = NULL;
1849 int i = 0;
1850 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1851
1852 if (!IS_LETTER(CUR) && (CUR != '_') &&
1853 (CUR != ':')) return(NULL);
1854
1855 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1856 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1857 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1858 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1859 else loc[i] = CUR;
1860 i++;
1861
1862 NEXT;
1863 }
1864
1865 ret = xmlStrndup(loc, i);
1866
1867 return(ret);
1868}
1869
1870/**
1871 * htmlParseName:
1872 * @ctxt: an HTML parser context
1873 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001874 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001875 *
1876 * Returns the Name parsed or NULL
1877 */
1878
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001879static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001880htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001881 const xmlChar *in;
1882 xmlChar *ret;
1883 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001884
1885 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001886
1887 /*
1888 * Accelerator for simple ASCII names
1889 */
1890 in = ctxt->input->cur;
1891 if (((*in >= 0x61) && (*in <= 0x7A)) ||
1892 ((*in >= 0x41) && (*in <= 0x5A)) ||
1893 (*in == '_') || (*in == ':')) {
1894 in++;
1895 while (((*in >= 0x61) && (*in <= 0x7A)) ||
1896 ((*in >= 0x41) && (*in <= 0x5A)) ||
1897 ((*in >= 0x30) && (*in <= 0x39)) ||
1898 (*in == '_') || (*in == '-') ||
1899 (*in == ':') || (*in == '.'))
1900 in++;
1901 if ((*in > 0) && (*in < 0x80)) {
1902 count = in - ctxt->input->cur;
1903 ret = xmlStrndup(ctxt->input->cur, count);
1904 ctxt->input->cur = in;
1905 return(ret);
1906 }
1907 }
1908 return(htmlParseNameComplex(ctxt));
1909}
1910
1911static xmlChar *
1912htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
1913 xmlChar buf[XML_MAX_NAMELEN + 5];
1914 int len = 0, l;
1915 int c;
1916 int count = 0;
1917
1918 /*
1919 * Handler for more complex cases
1920 */
1921 GROW;
1922 c = CUR_CHAR(l);
1923 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
1924 (!IS_LETTER(c) && (c != '_') &&
1925 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00001926 return(NULL);
1927 }
1928
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001929 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
1930 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
1931 (c == '.') || (c == '-') ||
1932 (c == '_') || (c == ':') ||
1933 (IS_COMBINING(c)) ||
1934 (IS_EXTENDER(c)))) {
1935 if (count++ > 100) {
1936 count = 0;
1937 GROW;
1938 }
1939 COPY_BUF(l,buf,len,c);
1940 NEXTL(l);
1941 c = CUR_CHAR(l);
1942 if (len >= XML_MAX_NAMELEN) {
1943 /*
1944 * Okay someone managed to make a huge name, so he's ready to pay
1945 * for the processing speed.
1946 */
1947 xmlChar *buffer;
1948 int max = len * 2;
1949
1950 buffer = (xmlChar *) xmlMalloc(max * sizeof(xmlChar));
1951 if (buffer == NULL) {
1952 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1953 ctxt->sax->error(ctxt->userData,
1954 "htmlParseNameComplex: out of memory\n");
1955 return(NULL);
1956 }
1957 memcpy(buffer, buf, len);
1958 while ((IS_LETTER(c)) || (IS_DIGIT(c)) || /* test bigname.xml */
1959 (c == '.') || (c == '-') ||
1960 (c == '_') || (c == ':') ||
1961 (IS_COMBINING(c)) ||
1962 (IS_EXTENDER(c))) {
1963 if (count++ > 100) {
1964 count = 0;
1965 GROW;
1966 }
1967 if (len + 10 > max) {
1968 max *= 2;
1969 buffer = (xmlChar *) xmlRealloc(buffer,
1970 max * sizeof(xmlChar));
1971 if (buffer == NULL) {
1972 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1973 ctxt->sax->error(ctxt->userData,
1974 "htmlParseNameComplex: out of memory\n");
1975 return(NULL);
1976 }
1977 }
1978 COPY_BUF(l,buffer,len,c);
1979 NEXTL(l);
1980 c = CUR_CHAR(l);
1981 }
1982 buffer[len] = 0;
1983 return(buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00001984 }
1985 }
1986 return(xmlStrndup(buf, len));
1987}
1988
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001989
Owen Taylor3473f882001-02-23 17:55:21 +00001990/**
1991 * htmlParseHTMLAttribute:
1992 * @ctxt: an HTML parser context
1993 * @stop: a char stop value
1994 *
1995 * parse an HTML attribute value till the stop (quote), if
1996 * stop is 0 then it stops at the first space
1997 *
1998 * Returns the attribute parsed or NULL
1999 */
2000
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002001static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002002htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2003 xmlChar *buffer = NULL;
2004 int buffer_size = 0;
2005 xmlChar *out = NULL;
2006 xmlChar *name = NULL;
2007
2008 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002009 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002010
2011 /*
2012 * allocate a translation buffer.
2013 */
2014 buffer_size = HTML_PARSER_BUFFER_SIZE;
2015 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2016 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002017 xmlGenericError(xmlGenericErrorContext,
2018 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002019 return(NULL);
2020 }
2021 out = buffer;
2022
2023 /*
2024 * Ok loop until we reach one of the ending chars
2025 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002026 while ((CUR != 0) && (CUR != stop)) {
2027 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002028 if ((stop == 0) && (IS_BLANK(CUR))) break;
2029 if (CUR == '&') {
2030 if (NXT(1) == '#') {
2031 unsigned int c;
2032 int bits;
2033
2034 c = htmlParseCharRef(ctxt);
2035 if (c < 0x80)
2036 { *out++ = c; bits= -6; }
2037 else if (c < 0x800)
2038 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2039 else if (c < 0x10000)
2040 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2041 else
2042 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2043
2044 for ( ; bits >= 0; bits-= 6) {
2045 *out++ = ((c >> bits) & 0x3F) | 0x80;
2046 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002047
2048 if (out - buffer > buffer_size - 100) {
2049 int indx = out - buffer;
2050
2051 growBuffer(buffer);
2052 out = &buffer[indx];
2053 }
Owen Taylor3473f882001-02-23 17:55:21 +00002054 } else {
2055 ent = htmlParseEntityRef(ctxt, &name);
2056 if (name == NULL) {
2057 *out++ = '&';
2058 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002059 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002060
2061 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002062 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002063 }
2064 } else if (ent == NULL) {
2065 *out++ = '&';
2066 cur = name;
2067 while (*cur != 0) {
2068 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002069 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002070
2071 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002072 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002073 }
2074 *out++ = *cur++;
2075 }
2076 xmlFree(name);
2077 } else {
2078 unsigned int c;
2079 int bits;
2080
2081 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002082 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002083
2084 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002085 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002086 }
2087 c = (xmlChar)ent->value;
2088 if (c < 0x80)
2089 { *out++ = c; bits= -6; }
2090 else if (c < 0x800)
2091 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2092 else if (c < 0x10000)
2093 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2094 else
2095 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2096
2097 for ( ; bits >= 0; bits-= 6) {
2098 *out++ = ((c >> bits) & 0x3F) | 0x80;
2099 }
2100 xmlFree(name);
2101 }
2102 }
2103 } else {
2104 unsigned int c;
2105 int bits, l;
2106
2107 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002108 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002109
2110 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002111 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002112 }
2113 c = CUR_CHAR(l);
2114 if (c < 0x80)
2115 { *out++ = c; bits= -6; }
2116 else if (c < 0x800)
2117 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2118 else if (c < 0x10000)
2119 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2120 else
2121 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2122
2123 for ( ; bits >= 0; bits-= 6) {
2124 *out++ = ((c >> bits) & 0x3F) | 0x80;
2125 }
2126 NEXT;
2127 }
2128 }
2129 *out++ = 0;
2130 return(buffer);
2131}
2132
2133/**
Owen Taylor3473f882001-02-23 17:55:21 +00002134 * htmlParseEntityRef:
2135 * @ctxt: an HTML parser context
2136 * @str: location to store the entity name
2137 *
2138 * parse an HTML ENTITY references
2139 *
2140 * [68] EntityRef ::= '&' Name ';'
2141 *
2142 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2143 * if non-NULL *str will have to be freed by the caller.
2144 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002145const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002146htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2147 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002148 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002149 *str = NULL;
2150
2151 if (CUR == '&') {
2152 NEXT;
2153 name = htmlParseName(ctxt);
2154 if (name == NULL) {
2155 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2156 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2157 ctxt->wellFormed = 0;
2158 } else {
2159 GROW;
2160 if (CUR == ';') {
2161 *str = name;
2162
2163 /*
2164 * Lookup the entity in the table.
2165 */
2166 ent = htmlEntityLookup(name);
2167 if (ent != NULL) /* OK that's ugly !!! */
2168 NEXT;
2169 } else {
2170 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2171 ctxt->sax->error(ctxt->userData,
2172 "htmlParseEntityRef: expecting ';'\n");
2173 *str = name;
2174 }
2175 }
2176 }
2177 return(ent);
2178}
2179
2180/**
2181 * htmlParseAttValue:
2182 * @ctxt: an HTML parser context
2183 *
2184 * parse a value for an attribute
2185 * Note: the parser won't do substitution of entities here, this
2186 * will be handled later in xmlStringGetNodeList, unless it was
2187 * asked for ctxt->replaceEntities != 0
2188 *
2189 * Returns the AttValue parsed or NULL.
2190 */
2191
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002192static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002193htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2194 xmlChar *ret = NULL;
2195
2196 if (CUR == '"') {
2197 NEXT;
2198 ret = htmlParseHTMLAttribute(ctxt, '"');
2199 if (CUR != '"') {
2200 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2201 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2202 ctxt->wellFormed = 0;
2203 } else
2204 NEXT;
2205 } else if (CUR == '\'') {
2206 NEXT;
2207 ret = htmlParseHTMLAttribute(ctxt, '\'');
2208 if (CUR != '\'') {
2209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2210 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2211 ctxt->wellFormed = 0;
2212 } else
2213 NEXT;
2214 } else {
2215 /*
2216 * That's an HTMLism, the attribute value may not be quoted
2217 */
2218 ret = htmlParseHTMLAttribute(ctxt, 0);
2219 if (ret == NULL) {
2220 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2221 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2222 ctxt->wellFormed = 0;
2223 }
2224 }
2225 return(ret);
2226}
2227
2228/**
2229 * htmlParseSystemLiteral:
2230 * @ctxt: an HTML parser context
2231 *
2232 * parse an HTML Literal
2233 *
2234 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2235 *
2236 * Returns the SystemLiteral parsed or NULL
2237 */
2238
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002239static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002240htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2241 const xmlChar *q;
2242 xmlChar *ret = NULL;
2243
2244 if (CUR == '"') {
2245 NEXT;
2246 q = CUR_PTR;
2247 while ((IS_CHAR(CUR)) && (CUR != '"'))
2248 NEXT;
2249 if (!IS_CHAR(CUR)) {
2250 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2251 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2252 ctxt->wellFormed = 0;
2253 } else {
2254 ret = xmlStrndup(q, CUR_PTR - q);
2255 NEXT;
2256 }
2257 } else if (CUR == '\'') {
2258 NEXT;
2259 q = CUR_PTR;
2260 while ((IS_CHAR(CUR)) && (CUR != '\''))
2261 NEXT;
2262 if (!IS_CHAR(CUR)) {
2263 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2264 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2265 ctxt->wellFormed = 0;
2266 } else {
2267 ret = xmlStrndup(q, CUR_PTR - q);
2268 NEXT;
2269 }
2270 } else {
2271 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2272 ctxt->sax->error(ctxt->userData,
2273 "SystemLiteral \" or ' expected\n");
2274 ctxt->wellFormed = 0;
2275 }
2276
2277 return(ret);
2278}
2279
2280/**
2281 * htmlParsePubidLiteral:
2282 * @ctxt: an HTML parser context
2283 *
2284 * parse an HTML public literal
2285 *
2286 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2287 *
2288 * Returns the PubidLiteral parsed or NULL.
2289 */
2290
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002291static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002292htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2293 const xmlChar *q;
2294 xmlChar *ret = NULL;
2295 /*
2296 * Name ::= (Letter | '_') (NameChar)*
2297 */
2298 if (CUR == '"') {
2299 NEXT;
2300 q = CUR_PTR;
2301 while (IS_PUBIDCHAR(CUR)) NEXT;
2302 if (CUR != '"') {
2303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2304 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2305 ctxt->wellFormed = 0;
2306 } else {
2307 ret = xmlStrndup(q, CUR_PTR - q);
2308 NEXT;
2309 }
2310 } else if (CUR == '\'') {
2311 NEXT;
2312 q = CUR_PTR;
2313 while ((IS_LETTER(CUR)) && (CUR != '\''))
2314 NEXT;
2315 if (!IS_LETTER(CUR)) {
2316 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2317 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2318 ctxt->wellFormed = 0;
2319 } else {
2320 ret = xmlStrndup(q, CUR_PTR - q);
2321 NEXT;
2322 }
2323 } else {
2324 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2325 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2326 ctxt->wellFormed = 0;
2327 }
2328
2329 return(ret);
2330}
2331
2332/**
2333 * htmlParseScript:
2334 * @ctxt: an HTML parser context
2335 *
2336 * parse the content of an HTML SCRIPT or STYLE element
2337 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2338 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2339 * http://www.w3.org/TR/html4/types.html#type-script
2340 * http://www.w3.org/TR/html4/types.html#h-6.15
2341 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2342 *
2343 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2344 * element and the value of intrinsic event attributes. User agents must
2345 * not evaluate script data as HTML markup but instead must pass it on as
2346 * data to a script engine.
2347 * NOTES:
2348 * - The content is passed like CDATA
2349 * - the attributes for style and scripting "onXXX" are also described
2350 * as CDATA but SGML allows entities references in attributes so their
2351 * processing is identical as other attributes
2352 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353static void
Owen Taylor3473f882001-02-23 17:55:21 +00002354htmlParseScript(htmlParserCtxtPtr ctxt) {
2355 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2356 int nbchar = 0;
2357 xmlChar cur;
2358
2359 SHRINK;
2360 cur = CUR;
2361 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002362 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2363 (NXT(3) == '-')) {
2364 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2365 if (ctxt->sax->cdataBlock!= NULL) {
2366 /*
2367 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2368 */
2369 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2370 }
2371 }
2372 nbchar = 0;
2373 htmlParseComment(ctxt);
2374 cur = CUR;
2375 continue;
2376 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002377 /*
2378 * One should break here, the specification is clear:
2379 * Authors should therefore escape "</" within the content.
2380 * Escape mechanisms are specific to each scripting or
2381 * style sheet language.
2382 */
2383 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2384 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2385 break; /* while */
2386 }
2387 buf[nbchar++] = cur;
2388 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2389 if (ctxt->sax->cdataBlock!= NULL) {
2390 /*
2391 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2392 */
2393 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2394 }
2395 nbchar = 0;
2396 }
2397 NEXT;
2398 cur = CUR;
2399 }
2400 if (!(IS_CHAR(cur))) {
2401 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2402 ctxt->sax->error(ctxt->userData,
2403 "Invalid char in CDATA 0x%X\n", cur);
2404 ctxt->wellFormed = 0;
2405 NEXT;
2406 }
2407
2408 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2409 if (ctxt->sax->cdataBlock!= NULL) {
2410 /*
2411 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2412 */
2413 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2414 }
2415 }
2416}
2417
2418
2419/**
2420 * htmlParseCharData:
2421 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002422 *
2423 * parse a CharData section.
2424 * if we are within a CDATA section ']]>' marks an end of section.
2425 *
2426 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2427 */
2428
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002429static void
2430htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002431 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2432 int nbchar = 0;
2433 int cur, l;
2434
2435 SHRINK;
2436 cur = CUR_CHAR(l);
2437 while (((cur != '<') || (ctxt->token == '<')) &&
2438 ((cur != '&') || (ctxt->token == '&')) &&
2439 (IS_CHAR(cur))) {
2440 COPY_BUF(l,buf,nbchar,cur);
2441 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2442 /*
2443 * Ok the segment is to be consumed as chars.
2444 */
2445 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2446 if (areBlanks(ctxt, buf, nbchar)) {
2447 if (ctxt->sax->ignorableWhitespace != NULL)
2448 ctxt->sax->ignorableWhitespace(ctxt->userData,
2449 buf, nbchar);
2450 } else {
2451 htmlCheckParagraph(ctxt);
2452 if (ctxt->sax->characters != NULL)
2453 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2454 }
2455 }
2456 nbchar = 0;
2457 }
2458 NEXTL(l);
2459 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002460 if (cur == 0) {
2461 SHRINK;
2462 GROW;
2463 cur = CUR_CHAR(l);
2464 }
Owen Taylor3473f882001-02-23 17:55:21 +00002465 }
2466 if (nbchar != 0) {
2467 /*
2468 * Ok the segment is to be consumed as chars.
2469 */
2470 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2471 if (areBlanks(ctxt, buf, nbchar)) {
2472 if (ctxt->sax->ignorableWhitespace != NULL)
2473 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2474 } else {
2475 htmlCheckParagraph(ctxt);
2476 if (ctxt->sax->characters != NULL)
2477 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2478 }
2479 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002480 } else {
2481 /*
2482 * Loop detection
2483 */
2484 if (cur == 0)
2485 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002486 }
2487}
2488
2489/**
2490 * htmlParseExternalID:
2491 * @ctxt: an HTML parser context
2492 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002493 *
2494 * Parse an External ID or a Public ID
2495 *
Owen Taylor3473f882001-02-23 17:55:21 +00002496 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2497 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2498 *
2499 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2500 *
2501 * Returns the function returns SystemLiteral and in the second
2502 * case publicID receives PubidLiteral, is strict is off
2503 * it is possible to return NULL and have publicID set.
2504 */
2505
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002506static xmlChar *
2507htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002508 xmlChar *URI = NULL;
2509
2510 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2511 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2512 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2513 SKIP(6);
2514 if (!IS_BLANK(CUR)) {
2515 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2516 ctxt->sax->error(ctxt->userData,
2517 "Space required after 'SYSTEM'\n");
2518 ctxt->wellFormed = 0;
2519 }
2520 SKIP_BLANKS;
2521 URI = htmlParseSystemLiteral(ctxt);
2522 if (URI == NULL) {
2523 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2524 ctxt->sax->error(ctxt->userData,
2525 "htmlParseExternalID: SYSTEM, no URI\n");
2526 ctxt->wellFormed = 0;
2527 }
2528 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2529 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2530 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2531 SKIP(6);
2532 if (!IS_BLANK(CUR)) {
2533 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2534 ctxt->sax->error(ctxt->userData,
2535 "Space required after 'PUBLIC'\n");
2536 ctxt->wellFormed = 0;
2537 }
2538 SKIP_BLANKS;
2539 *publicID = htmlParsePubidLiteral(ctxt);
2540 if (*publicID == NULL) {
2541 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2542 ctxt->sax->error(ctxt->userData,
2543 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2544 ctxt->wellFormed = 0;
2545 }
2546 SKIP_BLANKS;
2547 if ((CUR == '"') || (CUR == '\'')) {
2548 URI = htmlParseSystemLiteral(ctxt);
2549 }
2550 }
2551 return(URI);
2552}
2553
2554/**
2555 * htmlParseComment:
2556 * @ctxt: an HTML parser context
2557 *
2558 * Parse an XML (SGML) comment <!-- .... -->
2559 *
2560 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2561 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002562static void
Owen Taylor3473f882001-02-23 17:55:21 +00002563htmlParseComment(htmlParserCtxtPtr ctxt) {
2564 xmlChar *buf = NULL;
2565 int len;
2566 int size = HTML_PARSER_BUFFER_SIZE;
2567 int q, ql;
2568 int r, rl;
2569 int cur, l;
2570 xmlParserInputState state;
2571
2572 /*
2573 * Check that there is a comment right here.
2574 */
2575 if ((RAW != '<') || (NXT(1) != '!') ||
2576 (NXT(2) != '-') || (NXT(3) != '-')) return;
2577
2578 state = ctxt->instate;
2579 ctxt->instate = XML_PARSER_COMMENT;
2580 SHRINK;
2581 SKIP(4);
2582 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2583 if (buf == NULL) {
2584 xmlGenericError(xmlGenericErrorContext,
2585 "malloc of %d byte failed\n", size);
2586 ctxt->instate = state;
2587 return;
2588 }
2589 q = CUR_CHAR(ql);
2590 NEXTL(ql);
2591 r = CUR_CHAR(rl);
2592 NEXTL(rl);
2593 cur = CUR_CHAR(l);
2594 len = 0;
2595 while (IS_CHAR(cur) &&
2596 ((cur != '>') ||
2597 (r != '-') || (q != '-'))) {
2598 if (len + 5 >= size) {
2599 size *= 2;
2600 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2601 if (buf == NULL) {
2602 xmlGenericError(xmlGenericErrorContext,
2603 "realloc of %d byte failed\n", size);
2604 ctxt->instate = state;
2605 return;
2606 }
2607 }
2608 COPY_BUF(ql,buf,len,q);
2609 q = r;
2610 ql = rl;
2611 r = cur;
2612 rl = l;
2613 NEXTL(l);
2614 cur = CUR_CHAR(l);
2615 if (cur == 0) {
2616 SHRINK;
2617 GROW;
2618 cur = CUR_CHAR(l);
2619 }
2620 }
2621 buf[len] = 0;
2622 if (!IS_CHAR(cur)) {
2623 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2624 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2625 ctxt->sax->error(ctxt->userData,
2626 "Comment not terminated \n<!--%.50s\n", buf);
2627 ctxt->wellFormed = 0;
2628 xmlFree(buf);
2629 } else {
2630 NEXT;
2631 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2632 (!ctxt->disableSAX))
2633 ctxt->sax->comment(ctxt->userData, buf);
2634 xmlFree(buf);
2635 }
2636 ctxt->instate = state;
2637}
2638
2639/**
2640 * htmlParseCharRef:
2641 * @ctxt: an HTML parser context
2642 *
2643 * parse Reference declarations
2644 *
2645 * [66] CharRef ::= '&#' [0-9]+ ';' |
2646 * '&#x' [0-9a-fA-F]+ ';'
2647 *
2648 * Returns the value parsed (as an int)
2649 */
2650int
2651htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2652 int val = 0;
2653
2654 if ((CUR == '&') && (NXT(1) == '#') &&
2655 (NXT(2) == 'x')) {
2656 SKIP(3);
2657 while (CUR != ';') {
2658 if ((CUR >= '0') && (CUR <= '9'))
2659 val = val * 16 + (CUR - '0');
2660 else if ((CUR >= 'a') && (CUR <= 'f'))
2661 val = val * 16 + (CUR - 'a') + 10;
2662 else if ((CUR >= 'A') && (CUR <= 'F'))
2663 val = val * 16 + (CUR - 'A') + 10;
2664 else {
2665 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2666 ctxt->sax->error(ctxt->userData,
2667 "htmlParseCharRef: invalid hexadecimal value\n");
2668 ctxt->wellFormed = 0;
2669 return(0);
2670 }
2671 NEXT;
2672 }
2673 if (CUR == ';')
2674 NEXT;
2675 } else if ((CUR == '&') && (NXT(1) == '#')) {
2676 SKIP(2);
2677 while (CUR != ';') {
2678 if ((CUR >= '0') && (CUR <= '9'))
2679 val = val * 10 + (CUR - '0');
2680 else {
2681 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2682 ctxt->sax->error(ctxt->userData,
2683 "htmlParseCharRef: invalid decimal value\n");
2684 ctxt->wellFormed = 0;
2685 return(0);
2686 }
2687 NEXT;
2688 }
2689 if (CUR == ';')
2690 NEXT;
2691 } else {
2692 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2693 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2694 ctxt->wellFormed = 0;
2695 }
2696 /*
2697 * Check the value IS_CHAR ...
2698 */
2699 if (IS_CHAR(val)) {
2700 return(val);
2701 } else {
2702 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2703 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2704 val);
2705 ctxt->wellFormed = 0;
2706 }
2707 return(0);
2708}
2709
2710
2711/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002712 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002713 * @ctxt: an HTML parser context
2714 *
2715 * parse a DOCTYPE declaration
2716 *
2717 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2718 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2719 */
2720
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002721static void
Owen Taylor3473f882001-02-23 17:55:21 +00002722htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2723 xmlChar *name;
2724 xmlChar *ExternalID = NULL;
2725 xmlChar *URI = NULL;
2726
2727 /*
2728 * We know that '<!DOCTYPE' has been detected.
2729 */
2730 SKIP(9);
2731
2732 SKIP_BLANKS;
2733
2734 /*
2735 * Parse the DOCTYPE name.
2736 */
2737 name = htmlParseName(ctxt);
2738 if (name == NULL) {
2739 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2740 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2741 ctxt->wellFormed = 0;
2742 }
2743 /*
2744 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2745 */
2746
2747 SKIP_BLANKS;
2748
2749 /*
2750 * Check for SystemID and ExternalID
2751 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002752 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002753 SKIP_BLANKS;
2754
2755 /*
2756 * We should be at the end of the DOCTYPE declaration.
2757 */
2758 if (CUR != '>') {
2759 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002760 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002761 ctxt->wellFormed = 0;
2762 /* We shouldn't try to resynchronize ... */
2763 }
2764 NEXT;
2765
2766 /*
2767 * Create or update the document accordingly to the DOCTYPE
2768 */
2769 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2770 (!ctxt->disableSAX))
2771 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2772
2773 /*
2774 * Cleanup, since we don't use all those identifiers
2775 */
2776 if (URI != NULL) xmlFree(URI);
2777 if (ExternalID != NULL) xmlFree(ExternalID);
2778 if (name != NULL) xmlFree(name);
2779}
2780
2781/**
2782 * htmlParseAttribute:
2783 * @ctxt: an HTML parser context
2784 * @value: a xmlChar ** used to store the value of the attribute
2785 *
2786 * parse an attribute
2787 *
2788 * [41] Attribute ::= Name Eq AttValue
2789 *
2790 * [25] Eq ::= S? '=' S?
2791 *
2792 * With namespace:
2793 *
2794 * [NS 11] Attribute ::= QName Eq AttValue
2795 *
2796 * Also the case QName == xmlns:??? is handled independently as a namespace
2797 * definition.
2798 *
2799 * Returns the attribute name, and the value in *value.
2800 */
2801
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002802static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002803htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2804 xmlChar *name, *val = NULL;
2805
2806 *value = NULL;
2807 name = htmlParseHTMLName(ctxt);
2808 if (name == NULL) {
2809 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2810 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2811 ctxt->wellFormed = 0;
2812 return(NULL);
2813 }
2814
2815 /*
2816 * read the value
2817 */
2818 SKIP_BLANKS;
2819 if (CUR == '=') {
2820 NEXT;
2821 SKIP_BLANKS;
2822 val = htmlParseAttValue(ctxt);
2823 /******
2824 } else {
2825 * TODO : some attribute must have values, some may not
2826 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2827 ctxt->sax->warning(ctxt->userData,
2828 "No value for attribute %s\n", name); */
2829 }
2830
2831 *value = val;
2832 return(name);
2833}
2834
2835/**
2836 * htmlCheckEncoding:
2837 * @ctxt: an HTML parser context
2838 * @attvalue: the attribute value
2839 *
2840 * Checks an http-equiv attribute from a Meta tag to detect
2841 * the encoding
2842 * If a new encoding is detected the parser is switched to decode
2843 * it and pass UTF8
2844 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002845static void
Owen Taylor3473f882001-02-23 17:55:21 +00002846htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2847 const xmlChar *encoding;
2848
2849 if ((ctxt == NULL) || (attvalue == NULL))
2850 return;
2851
2852 /* do not change encoding */
2853 if (ctxt->input->encoding != NULL)
2854 return;
2855
2856 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2857 if (encoding != NULL) {
2858 encoding += 8;
2859 } else {
2860 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2861 if (encoding != NULL)
2862 encoding += 9;
2863 }
2864 if (encoding != NULL) {
2865 xmlCharEncoding enc;
2866 xmlCharEncodingHandlerPtr handler;
2867
2868 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2869
2870 if (ctxt->input->encoding != NULL)
2871 xmlFree((xmlChar *) ctxt->input->encoding);
2872 ctxt->input->encoding = xmlStrdup(encoding);
2873
2874 enc = xmlParseCharEncoding((const char *) encoding);
2875 /*
2876 * registered set of known encodings
2877 */
2878 if (enc != XML_CHAR_ENCODING_ERROR) {
2879 xmlSwitchEncoding(ctxt, enc);
2880 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2881 } else {
2882 /*
2883 * fallback for unknown encodings
2884 */
2885 handler = xmlFindCharEncodingHandler((const char *) encoding);
2886 if (handler != NULL) {
2887 xmlSwitchToEncoding(ctxt, handler);
2888 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2889 } else {
2890 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2891 }
2892 }
2893
2894 if ((ctxt->input->buf != NULL) &&
2895 (ctxt->input->buf->encoder != NULL) &&
2896 (ctxt->input->buf->raw != NULL) &&
2897 (ctxt->input->buf->buffer != NULL)) {
2898 int nbchars;
2899 int processed;
2900
2901 /*
2902 * convert as much as possible to the parser reading buffer.
2903 */
2904 processed = ctxt->input->cur - ctxt->input->base;
2905 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2906 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2907 ctxt->input->buf->buffer,
2908 ctxt->input->buf->raw);
2909 if (nbchars < 0) {
2910 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2911 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2912 ctxt->sax->error(ctxt->userData,
2913 "htmlCheckEncoding: encoder error\n");
2914 }
2915 ctxt->input->base =
2916 ctxt->input->cur = ctxt->input->buf->buffer->content;
2917 }
2918 }
2919}
2920
2921/**
2922 * htmlCheckMeta:
2923 * @ctxt: an HTML parser context
2924 * @atts: the attributes values
2925 *
2926 * Checks an attributes from a Meta tag
2927 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002928static void
Owen Taylor3473f882001-02-23 17:55:21 +00002929htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2930 int i;
2931 const xmlChar *att, *value;
2932 int http = 0;
2933 const xmlChar *content = NULL;
2934
2935 if ((ctxt == NULL) || (atts == NULL))
2936 return;
2937
2938 i = 0;
2939 att = atts[i++];
2940 while (att != NULL) {
2941 value = atts[i++];
2942 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2943 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2944 http = 1;
2945 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2946 content = value;
2947 att = atts[i++];
2948 }
2949 if ((http) && (content != NULL))
2950 htmlCheckEncoding(ctxt, content);
2951
2952}
2953
2954/**
2955 * htmlParseStartTag:
2956 * @ctxt: an HTML parser context
2957 *
2958 * parse a start of tag either for rule element or
2959 * EmptyElement. In both case we don't parse the tag closing chars.
2960 *
2961 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2962 *
2963 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2964 *
2965 * With namespace:
2966 *
2967 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2968 *
2969 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2970 *
2971 */
2972
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002973static void
Owen Taylor3473f882001-02-23 17:55:21 +00002974htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2975 xmlChar *name;
2976 xmlChar *attname;
2977 xmlChar *attvalue;
2978 const xmlChar **atts = NULL;
2979 int nbatts = 0;
2980 int maxatts = 0;
2981 int meta = 0;
2982 int i;
2983
2984 if (CUR != '<') return;
2985 NEXT;
2986
2987 GROW;
2988 name = htmlParseHTMLName(ctxt);
2989 if (name == NULL) {
2990 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2991 ctxt->sax->error(ctxt->userData,
2992 "htmlParseStartTag: invalid element name\n");
2993 ctxt->wellFormed = 0;
2994 /* Dump the bogus tag like browsers do */
2995 while ((IS_CHAR(CUR)) && (CUR != '>'))
2996 NEXT;
2997 return;
2998 }
2999 if (xmlStrEqual(name, BAD_CAST"meta"))
3000 meta = 1;
3001
3002 /*
3003 * Check for auto-closure of HTML elements.
3004 */
3005 htmlAutoClose(ctxt, name);
3006
3007 /*
3008 * Check for implied HTML elements.
3009 */
3010 htmlCheckImplied(ctxt, name);
3011
3012 /*
3013 * Avoid html at any level > 0, head at any level != 1
3014 * or any attempt to recurse body
3015 */
3016 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3017 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3018 ctxt->sax->error(ctxt->userData,
3019 "htmlParseStartTag: misplaced <html> tag\n");
3020 ctxt->wellFormed = 0;
3021 xmlFree(name);
3022 return;
3023 }
3024 if ((ctxt->nameNr != 1) &&
3025 (xmlStrEqual(name, BAD_CAST"head"))) {
3026 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3027 ctxt->sax->error(ctxt->userData,
3028 "htmlParseStartTag: misplaced <head> tag\n");
3029 ctxt->wellFormed = 0;
3030 xmlFree(name);
3031 return;
3032 }
3033 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003034 int indx;
3035 for (indx = 0;indx < ctxt->nameNr;indx++) {
3036 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003037 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3038 ctxt->sax->error(ctxt->userData,
3039 "htmlParseStartTag: misplaced <body> tag\n");
3040 ctxt->wellFormed = 0;
3041 xmlFree(name);
3042 return;
3043 }
3044 }
3045 }
3046
3047 /*
3048 * Now parse the attributes, it ends up with the ending
3049 *
3050 * (S Attribute)* S?
3051 */
3052 SKIP_BLANKS;
3053 while ((IS_CHAR(CUR)) &&
3054 (CUR != '>') &&
3055 ((CUR != '/') || (NXT(1) != '>'))) {
3056 long cons = ctxt->nbChars;
3057
3058 GROW;
3059 attname = htmlParseAttribute(ctxt, &attvalue);
3060 if (attname != NULL) {
3061
3062 /*
3063 * Well formedness requires at most one declaration of an attribute
3064 */
3065 for (i = 0; i < nbatts;i += 2) {
3066 if (xmlStrEqual(atts[i], attname)) {
3067 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3068 ctxt->sax->error(ctxt->userData,
3069 "Attribute %s redefined\n",
3070 attname);
3071 ctxt->wellFormed = 0;
3072 xmlFree(attname);
3073 if (attvalue != NULL)
3074 xmlFree(attvalue);
3075 goto failed;
3076 }
3077 }
3078
3079 /*
3080 * Add the pair to atts
3081 */
3082 if (atts == NULL) {
3083 maxatts = 10;
3084 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3085 if (atts == NULL) {
3086 xmlGenericError(xmlGenericErrorContext,
3087 "malloc of %ld byte failed\n",
3088 maxatts * (long)sizeof(xmlChar *));
3089 if (name != NULL) xmlFree(name);
3090 return;
3091 }
3092 } else if (nbatts + 4 > maxatts) {
3093 maxatts *= 2;
3094 atts = (const xmlChar **) xmlRealloc((void *) atts,
3095 maxatts * sizeof(xmlChar *));
3096 if (atts == NULL) {
3097 xmlGenericError(xmlGenericErrorContext,
3098 "realloc of %ld byte failed\n",
3099 maxatts * (long)sizeof(xmlChar *));
3100 if (name != NULL) xmlFree(name);
3101 return;
3102 }
3103 }
3104 atts[nbatts++] = attname;
3105 atts[nbatts++] = attvalue;
3106 atts[nbatts] = NULL;
3107 atts[nbatts + 1] = NULL;
3108 }
3109 else {
3110 /* Dump the bogus attribute string up to the next blank or
3111 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003112 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3113 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003114 NEXT;
3115 }
3116
3117failed:
3118 SKIP_BLANKS;
3119 if (cons == ctxt->nbChars) {
3120 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3121 ctxt->sax->error(ctxt->userData,
3122 "htmlParseStartTag: problem parsing attributes\n");
3123 ctxt->wellFormed = 0;
3124 break;
3125 }
3126 }
3127
3128 /*
3129 * Handle specific association to the META tag
3130 */
3131 if (meta)
3132 htmlCheckMeta(ctxt, atts);
3133
3134 /*
3135 * SAX: Start of Element !
3136 */
3137 htmlnamePush(ctxt, xmlStrdup(name));
3138#ifdef DEBUG
3139 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3140#endif
3141 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3142 ctxt->sax->startElement(ctxt->userData, name, atts);
3143
3144 if (atts != NULL) {
3145 for (i = 0;i < nbatts;i++) {
3146 if (atts[i] != NULL)
3147 xmlFree((xmlChar *) atts[i]);
3148 }
3149 xmlFree((void *) atts);
3150 }
3151 if (name != NULL) xmlFree(name);
3152}
3153
3154/**
3155 * htmlParseEndTag:
3156 * @ctxt: an HTML parser context
3157 *
3158 * parse an end of tag
3159 *
3160 * [42] ETag ::= '</' Name S? '>'
3161 *
3162 * With namespace
3163 *
3164 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003165 *
3166 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003167 */
3168
Daniel Veillardf420ac52001-07-04 16:04:09 +00003169static int
Owen Taylor3473f882001-02-23 17:55:21 +00003170htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3171 xmlChar *name;
3172 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003173 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003174
3175 if ((CUR != '<') || (NXT(1) != '/')) {
3176 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3177 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3178 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003179 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003180 }
3181 SKIP(2);
3182
3183 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003184 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003185
3186 /*
3187 * We should definitely be at the ending "S? '>'" part
3188 */
3189 SKIP_BLANKS;
3190 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3191 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3192 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3193 ctxt->wellFormed = 0;
3194 } else
3195 NEXT;
3196
3197 /*
3198 * If the name read is not one of the element in the parsing stack
3199 * then return, it's just an error.
3200 */
3201 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3202 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3203 }
3204 if (i < 0) {
3205 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3206 ctxt->sax->error(ctxt->userData,
3207 "Unexpected end tag : %s\n", name);
3208 xmlFree(name);
3209 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003210 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003211 }
3212
3213
3214 /*
3215 * Check for auto-closure of HTML elements.
3216 */
3217
3218 htmlAutoCloseOnClose(ctxt, name);
3219
3220 /*
3221 * Well formedness constraints, opening and closing must match.
3222 * With the exception that the autoclose may have popped stuff out
3223 * of the stack.
3224 */
3225 if (!xmlStrEqual(name, ctxt->name)) {
3226#ifdef DEBUG
3227 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3228#endif
3229 if ((ctxt->name != NULL) &&
3230 (!xmlStrEqual(ctxt->name, name))) {
3231 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3232 ctxt->sax->error(ctxt->userData,
3233 "Opening and ending tag mismatch: %s and %s\n",
3234 name, ctxt->name);
3235 ctxt->wellFormed = 0;
3236 }
3237 }
3238
3239 /*
3240 * SAX: End of Tag
3241 */
3242 oldname = ctxt->name;
3243 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3244 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3245 ctxt->sax->endElement(ctxt->userData, name);
3246 oldname = htmlnamePop(ctxt);
3247 if (oldname != NULL) {
3248#ifdef DEBUG
3249 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3250#endif
3251 xmlFree(oldname);
3252#ifdef DEBUG
3253 } else {
3254 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3255#endif
3256 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003257 ret = 1;
3258 } else {
3259 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003260 }
3261
3262 if (name != NULL)
3263 xmlFree(name);
3264
Daniel Veillardf420ac52001-07-04 16:04:09 +00003265 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003266}
3267
3268
3269/**
3270 * htmlParseReference:
3271 * @ctxt: an HTML parser context
3272 *
3273 * parse and handle entity references in content,
3274 * this will end-up in a call to character() since this is either a
3275 * CharRef, or a predefined entity.
3276 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003277static void
Owen Taylor3473f882001-02-23 17:55:21 +00003278htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003279 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003280 xmlChar out[6];
3281 xmlChar *name;
3282 if (CUR != '&') return;
3283
3284 if (NXT(1) == '#') {
3285 unsigned int c;
3286 int bits, i = 0;
3287
3288 c = htmlParseCharRef(ctxt);
3289 if (c == 0)
3290 return;
3291
3292 if (c < 0x80) { out[i++]= c; bits= -6; }
3293 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3294 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3295 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3296
3297 for ( ; bits >= 0; bits-= 6) {
3298 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3299 }
3300 out[i] = 0;
3301
3302 htmlCheckParagraph(ctxt);
3303 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3304 ctxt->sax->characters(ctxt->userData, out, i);
3305 } else {
3306 ent = htmlParseEntityRef(ctxt, &name);
3307 if (name == NULL) {
3308 htmlCheckParagraph(ctxt);
3309 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3310 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3311 return;
3312 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003313 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003314 htmlCheckParagraph(ctxt);
3315 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3316 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3317 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3318 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3319 }
3320 } else {
3321 unsigned int c;
3322 int bits, i = 0;
3323
3324 c = ent->value;
3325 if (c < 0x80)
3326 { out[i++]= c; bits= -6; }
3327 else if (c < 0x800)
3328 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3329 else if (c < 0x10000)
3330 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3331 else
3332 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3333
3334 for ( ; bits >= 0; bits-= 6) {
3335 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3336 }
3337 out[i] = 0;
3338
3339 htmlCheckParagraph(ctxt);
3340 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3341 ctxt->sax->characters(ctxt->userData, out, i);
3342 }
3343 xmlFree(name);
3344 }
3345}
3346
3347/**
3348 * htmlParseContent:
3349 * @ctxt: an HTML parser context
3350 * @name: the node name
3351 *
3352 * Parse a content: comment, sub-element, reference or text.
3353 *
3354 */
3355
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003356static void
Owen Taylor3473f882001-02-23 17:55:21 +00003357htmlParseContent(htmlParserCtxtPtr ctxt) {
3358 xmlChar *currentNode;
3359 int depth;
3360
3361 currentNode = xmlStrdup(ctxt->name);
3362 depth = ctxt->nameNr;
3363 while (1) {
3364 long cons = ctxt->nbChars;
3365
3366 GROW;
3367 /*
3368 * Our tag or one of it's parent or children is ending.
3369 */
3370 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003371 if (htmlParseEndTag(ctxt) &&
3372 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3373 if (currentNode != NULL)
3374 xmlFree(currentNode);
3375 return;
3376 }
3377 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003378 }
3379
3380 /*
3381 * Has this node been popped out during parsing of
3382 * the next element
3383 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003384 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3385 (!xmlStrEqual(currentNode, ctxt->name)))
3386 {
Owen Taylor3473f882001-02-23 17:55:21 +00003387 if (currentNode != NULL) xmlFree(currentNode);
3388 return;
3389 }
3390
Daniel Veillardf9533d12001-03-03 10:04:57 +00003391 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3392 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003393 /*
3394 * Handle SCRIPT/STYLE separately
3395 */
3396 htmlParseScript(ctxt);
3397 } else {
3398 /*
3399 * Sometimes DOCTYPE arrives in the middle of the document
3400 */
3401 if ((CUR == '<') && (NXT(1) == '!') &&
3402 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3403 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3404 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3405 (UPP(8) == 'E')) {
3406 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3407 ctxt->sax->error(ctxt->userData,
3408 "Misplaced DOCTYPE declaration\n");
3409 ctxt->wellFormed = 0;
3410 htmlParseDocTypeDecl(ctxt);
3411 }
3412
3413 /*
3414 * First case : a comment
3415 */
3416 if ((CUR == '<') && (NXT(1) == '!') &&
3417 (NXT(2) == '-') && (NXT(3) == '-')) {
3418 htmlParseComment(ctxt);
3419 }
3420
3421 /*
3422 * Second case : a sub-element.
3423 */
3424 else if (CUR == '<') {
3425 htmlParseElement(ctxt);
3426 }
3427
3428 /*
3429 * Third case : a reference. If if has not been resolved,
3430 * parsing returns it's Name, create the node
3431 */
3432 else if (CUR == '&') {
3433 htmlParseReference(ctxt);
3434 }
3435
3436 /*
3437 * Fourth : end of the resource
3438 */
3439 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003440 htmlAutoCloseOnEnd(ctxt);
3441 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003442 }
3443
3444 /*
3445 * Last case, text. Note that References are handled directly.
3446 */
3447 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003448 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003449 }
3450
3451 if (cons == ctxt->nbChars) {
3452 if (ctxt->node != NULL) {
3453 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3454 ctxt->sax->error(ctxt->userData,
3455 "detected an error in element content\n");
3456 ctxt->wellFormed = 0;
3457 }
3458 break;
3459 }
3460 }
3461 GROW;
3462 }
3463 if (currentNode != NULL) xmlFree(currentNode);
3464}
3465
3466/**
3467 * htmlParseElement:
3468 * @ctxt: an HTML parser context
3469 *
3470 * parse an HTML element, this is highly recursive
3471 *
3472 * [39] element ::= EmptyElemTag | STag content ETag
3473 *
3474 * [41] Attribute ::= Name Eq AttValue
3475 */
3476
3477void
3478htmlParseElement(htmlParserCtxtPtr ctxt) {
3479 xmlChar *name;
3480 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003481 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003482 htmlParserNodeInfo node_info;
3483 xmlChar *oldname;
3484 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003485 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003486
3487 /* Capture start position */
3488 if (ctxt->record_info) {
3489 node_info.begin_pos = ctxt->input->consumed +
3490 (CUR_PTR - ctxt->input->base);
3491 node_info.begin_line = ctxt->input->line;
3492 }
3493
3494 oldname = xmlStrdup(ctxt->name);
3495 htmlParseStartTag(ctxt);
3496 name = ctxt->name;
3497#ifdef DEBUG
3498 if (oldname == NULL)
3499 xmlGenericError(xmlGenericErrorContext,
3500 "Start of element %s\n", name);
3501 else if (name == NULL)
3502 xmlGenericError(xmlGenericErrorContext,
3503 "Start of element failed, was %s\n", oldname);
3504 else
3505 xmlGenericError(xmlGenericErrorContext,
3506 "Start of element %s, was %s\n", name, oldname);
3507#endif
3508 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3509 (name == NULL)) {
3510 if (CUR == '>')
3511 NEXT;
3512 if (oldname != NULL)
3513 xmlFree(oldname);
3514 return;
3515 }
3516 if (oldname != NULL)
3517 xmlFree(oldname);
3518
3519 /*
3520 * Lookup the info for that element.
3521 */
3522 info = htmlTagLookup(name);
3523 if (info == NULL) {
3524 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3525 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3526 name);
3527 ctxt->wellFormed = 0;
3528 } else if (info->depr) {
3529/***************************
3530 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3531 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3532 name);
3533 ***************************/
3534 }
3535
3536 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003537 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003538 */
3539 if ((CUR == '/') && (NXT(1) == '>')) {
3540 SKIP(2);
3541 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3542 ctxt->sax->endElement(ctxt->userData, name);
3543 oldname = htmlnamePop(ctxt);
3544#ifdef DEBUG
3545 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3546#endif
3547 if (oldname != NULL)
3548 xmlFree(oldname);
3549 return;
3550 }
3551
3552 if (CUR == '>') {
3553 NEXT;
3554 } else {
3555 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3556 ctxt->sax->error(ctxt->userData,
3557 "Couldn't find end of Start Tag %s\n",
3558 name);
3559 ctxt->wellFormed = 0;
3560
3561 /*
3562 * end of parsing of this node.
3563 */
3564 if (xmlStrEqual(name, ctxt->name)) {
3565 nodePop(ctxt);
3566 oldname = htmlnamePop(ctxt);
3567#ifdef DEBUG
3568 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3569#endif
3570 if (oldname != NULL)
3571 xmlFree(oldname);
3572 }
3573
3574 /*
3575 * Capture end position and add node
3576 */
3577 if ( currentNode != NULL && ctxt->record_info ) {
3578 node_info.end_pos = ctxt->input->consumed +
3579 (CUR_PTR - ctxt->input->base);
3580 node_info.end_line = ctxt->input->line;
3581 node_info.node = ctxt->node;
3582 xmlParserAddNodeInfo(ctxt, &node_info);
3583 }
3584 return;
3585 }
3586
3587 /*
3588 * Check for an Empty Element from DTD definition
3589 */
3590 if ((info != NULL) && (info->empty)) {
3591 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3592 ctxt->sax->endElement(ctxt->userData, name);
3593 oldname = htmlnamePop(ctxt);
3594#ifdef DEBUG
3595 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3596#endif
3597 if (oldname != NULL)
3598 xmlFree(oldname);
3599 return;
3600 }
3601
3602 /*
3603 * Parse the content of the element:
3604 */
3605 currentNode = xmlStrdup(ctxt->name);
3606 depth = ctxt->nameNr;
3607 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003608 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003609 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003610 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003611 if (ctxt->nameNr < depth) break;
3612 }
3613
Owen Taylor3473f882001-02-23 17:55:21 +00003614 /*
3615 * Capture end position and add node
3616 */
3617 if ( currentNode != NULL && ctxt->record_info ) {
3618 node_info.end_pos = ctxt->input->consumed +
3619 (CUR_PTR - ctxt->input->base);
3620 node_info.end_line = ctxt->input->line;
3621 node_info.node = ctxt->node;
3622 xmlParserAddNodeInfo(ctxt, &node_info);
3623 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003624 if (!IS_CHAR(CUR)) {
3625 htmlAutoCloseOnEnd(ctxt);
3626 }
3627
Owen Taylor3473f882001-02-23 17:55:21 +00003628 if (currentNode != NULL)
3629 xmlFree(currentNode);
3630}
3631
3632/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003633 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003634 * @ctxt: an HTML parser context
3635 *
3636 * parse an HTML document (and build a tree if using the standard SAX
3637 * interface).
3638 *
3639 * Returns 0, -1 in case of error. the parser context is augmented
3640 * as a result of the parsing.
3641 */
3642
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003643int
Owen Taylor3473f882001-02-23 17:55:21 +00003644htmlParseDocument(htmlParserCtxtPtr ctxt) {
3645 xmlDtdPtr dtd;
3646
Daniel Veillardd0463562001-10-13 09:15:48 +00003647 xmlInitParser();
3648
Owen Taylor3473f882001-02-23 17:55:21 +00003649 htmlDefaultSAXHandlerInit();
3650 ctxt->html = 1;
3651
3652 GROW;
3653 /*
3654 * SAX: beginning of the document processing.
3655 */
3656 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3657 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3658
3659 /*
3660 * Wipe out everything which is before the first '<'
3661 */
3662 SKIP_BLANKS;
3663 if (CUR == 0) {
3664 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3665 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3666 ctxt->wellFormed = 0;
3667 }
3668
3669 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3670 ctxt->sax->startDocument(ctxt->userData);
3671
3672
3673 /*
3674 * Parse possible comments before any content
3675 */
3676 while ((CUR == '<') && (NXT(1) == '!') &&
3677 (NXT(2) == '-') && (NXT(3) == '-')) {
3678 htmlParseComment(ctxt);
3679 SKIP_BLANKS;
3680 }
3681
3682
3683 /*
3684 * Then possibly doc type declaration(s) and more Misc
3685 * (doctypedecl Misc*)?
3686 */
3687 if ((CUR == '<') && (NXT(1) == '!') &&
3688 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3689 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3690 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3691 (UPP(8) == 'E')) {
3692 htmlParseDocTypeDecl(ctxt);
3693 }
3694 SKIP_BLANKS;
3695
3696 /*
3697 * Parse possible comments before any content
3698 */
3699 while ((CUR == '<') && (NXT(1) == '!') &&
3700 (NXT(2) == '-') && (NXT(3) == '-')) {
3701 htmlParseComment(ctxt);
3702 SKIP_BLANKS;
3703 }
3704
3705 /*
3706 * Time to start parsing the tree itself
3707 */
3708 htmlParseContent(ctxt);
3709
3710 /*
3711 * autoclose
3712 */
3713 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003714 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003715
3716
3717 /*
3718 * SAX: end of the document processing.
3719 */
3720 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3721 ctxt->sax->endDocument(ctxt->userData);
3722
3723 if (ctxt->myDoc != NULL) {
3724 dtd = xmlGetIntSubset(ctxt->myDoc);
3725 if (dtd == NULL)
3726 ctxt->myDoc->intSubset =
3727 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3728 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3729 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3730 }
3731 if (! ctxt->wellFormed) return(-1);
3732 return(0);
3733}
3734
3735
3736/************************************************************************
3737 * *
3738 * Parser contexts handling *
3739 * *
3740 ************************************************************************/
3741
3742/**
3743 * xmlInitParserCtxt:
3744 * @ctxt: an HTML parser context
3745 *
3746 * Initialize a parser context
3747 */
3748
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003749static void
Owen Taylor3473f882001-02-23 17:55:21 +00003750htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3751{
3752 htmlSAXHandler *sax;
3753
3754 if (ctxt == NULL) return;
3755 memset(ctxt, 0, sizeof(htmlParserCtxt));
3756
3757 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3758 if (sax == NULL) {
3759 xmlGenericError(xmlGenericErrorContext,
3760 "htmlInitParserCtxt: out of memory\n");
3761 }
3762 else
3763 memset(sax, 0, sizeof(htmlSAXHandler));
3764
3765 /* Allocate the Input stack */
3766 ctxt->inputTab = (htmlParserInputPtr *)
3767 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3768 if (ctxt->inputTab == NULL) {
3769 xmlGenericError(xmlGenericErrorContext,
3770 "htmlInitParserCtxt: out of memory\n");
3771 ctxt->inputNr = 0;
3772 ctxt->inputMax = 0;
3773 ctxt->input = NULL;
3774 return;
3775 }
3776 ctxt->inputNr = 0;
3777 ctxt->inputMax = 5;
3778 ctxt->input = NULL;
3779 ctxt->version = NULL;
3780 ctxt->encoding = NULL;
3781 ctxt->standalone = -1;
3782 ctxt->instate = XML_PARSER_START;
3783
3784 /* Allocate the Node stack */
3785 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3786 if (ctxt->nodeTab == NULL) {
3787 xmlGenericError(xmlGenericErrorContext,
3788 "htmlInitParserCtxt: out of memory\n");
3789 ctxt->nodeNr = 0;
3790 ctxt->nodeMax = 0;
3791 ctxt->node = NULL;
3792 ctxt->inputNr = 0;
3793 ctxt->inputMax = 0;
3794 ctxt->input = NULL;
3795 return;
3796 }
3797 ctxt->nodeNr = 0;
3798 ctxt->nodeMax = 10;
3799 ctxt->node = NULL;
3800
3801 /* Allocate the Name stack */
3802 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3803 if (ctxt->nameTab == NULL) {
3804 xmlGenericError(xmlGenericErrorContext,
3805 "htmlInitParserCtxt: out of memory\n");
3806 ctxt->nameNr = 0;
3807 ctxt->nameMax = 10;
3808 ctxt->name = NULL;
3809 ctxt->nodeNr = 0;
3810 ctxt->nodeMax = 0;
3811 ctxt->node = NULL;
3812 ctxt->inputNr = 0;
3813 ctxt->inputMax = 0;
3814 ctxt->input = NULL;
3815 return;
3816 }
3817 ctxt->nameNr = 0;
3818 ctxt->nameMax = 10;
3819 ctxt->name = NULL;
3820
3821 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3822 else {
3823 ctxt->sax = sax;
3824 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3825 }
3826 ctxt->userData = ctxt;
3827 ctxt->myDoc = NULL;
3828 ctxt->wellFormed = 1;
3829 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003830 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003831 ctxt->html = 1;
3832 ctxt->record_info = 0;
3833 ctxt->validate = 0;
3834 ctxt->nbChars = 0;
3835 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003836 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003837 xmlInitNodeInfoSeq(&ctxt->node_seq);
3838}
3839
3840/**
3841 * htmlFreeParserCtxt:
3842 * @ctxt: an HTML parser context
3843 *
3844 * Free all the memory used by a parser context. However the parsed
3845 * document in ctxt->myDoc is not freed.
3846 */
3847
3848void
3849htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3850{
3851 xmlFreeParserCtxt(ctxt);
3852}
3853
3854/**
Daniel Veillard1d995272002-07-22 16:43:32 +00003855 * htmlNewParserCtxt:
3856 *
3857 * Allocate and initialize a new parser context.
3858 *
3859 * Returns the xmlParserCtxtPtr or NULL
3860 */
3861
3862static htmlParserCtxtPtr
3863htmlNewParserCtxt(void)
3864{
3865 xmlParserCtxtPtr ctxt;
3866
3867 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3868 if (ctxt == NULL) {
3869 xmlGenericError(xmlGenericErrorContext,
3870 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00003871 return(NULL);
3872 }
3873 memset(ctxt, 0, sizeof(xmlParserCtxt));
3874 htmlInitParserCtxt(ctxt);
3875 return(ctxt);
3876}
3877
3878/**
3879 * htmlCreateMemoryParserCtxt:
3880 * @buffer: a pointer to a char array
3881 * @size: the size of the array
3882 *
3883 * Create a parser context for an HTML in-memory document.
3884 *
3885 * Returns the new parser context or NULL
3886 */
3887static htmlParserCtxtPtr
3888htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3889 xmlParserCtxtPtr ctxt;
3890 xmlParserInputPtr input;
3891 xmlParserInputBufferPtr buf;
3892
3893 if (buffer == NULL)
3894 return(NULL);
3895 if (size <= 0)
3896 return(NULL);
3897
3898 ctxt = htmlNewParserCtxt();
3899 if (ctxt == NULL)
3900 return(NULL);
3901
3902 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3903 if (buf == NULL) return(NULL);
3904
3905 input = xmlNewInputStream(ctxt);
3906 if (input == NULL) {
3907 xmlFreeParserCtxt(ctxt);
3908 return(NULL);
3909 }
3910
3911 input->filename = NULL;
3912 input->buf = buf;
3913 input->base = input->buf->buffer->content;
3914 input->cur = input->buf->buffer->content;
3915 input->end = &input->buf->buffer->content[input->buf->buffer->use];
3916
3917 inputPush(ctxt, input);
3918 return(ctxt);
3919}
3920
3921/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003922 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00003923 * @cur: a pointer to an array of xmlChar
3924 * @encoding: a free form C string describing the HTML document encoding, or NULL
3925 *
3926 * Create a parser context for an HTML document.
3927 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003928 * TODO: check the need to add encoding handling there
3929 *
Owen Taylor3473f882001-02-23 17:55:21 +00003930 * Returns the new parser context or NULL
3931 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003932static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003933htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00003934 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00003935 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00003936
Daniel Veillard1d995272002-07-22 16:43:32 +00003937 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00003938 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00003939 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00003940 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
3941
3942 if (encoding != NULL) {
3943 xmlCharEncoding enc;
3944 xmlCharEncodingHandlerPtr handler;
3945
3946 if (ctxt->input->encoding != NULL)
3947 xmlFree((xmlChar *) ctxt->input->encoding);
3948 ctxt->input->encoding = (const xmlChar *) encoding;
3949
3950 enc = xmlParseCharEncoding(encoding);
3951 /*
3952 * registered set of known encodings
3953 */
3954 if (enc != XML_CHAR_ENCODING_ERROR) {
3955 xmlSwitchEncoding(ctxt, enc);
3956 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
3957 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3958 ctxt->sax->error(ctxt->userData,
3959 "Unsupported encoding %s\n", encoding);
3960 ctxt->input->encoding = NULL;
3961 }
3962 } else {
3963 /*
3964 * fallback for unknown encodings
3965 */
3966 handler = xmlFindCharEncodingHandler((const char *) encoding);
3967 if (handler != NULL) {
3968 xmlSwitchToEncoding(ctxt, handler);
3969 } else {
3970 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3971 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3972 ctxt->sax->error(ctxt->userData,
3973 "Unsupported encoding %s\n", encoding);
3974 }
3975 }
3976 }
3977 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003978}
3979
3980/************************************************************************
3981 * *
3982 * Progressive parsing interfaces *
3983 * *
3984 ************************************************************************/
3985
3986/**
3987 * htmlParseLookupSequence:
3988 * @ctxt: an HTML parser context
3989 * @first: the first char to lookup
3990 * @next: the next char to lookup or zero
3991 * @third: the next char to lookup or zero
3992 *
3993 * Try to find if a sequence (first, next, third) or just (first next) or
3994 * (first) is available in the input stream.
3995 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3996 * to avoid rescanning sequences of bytes, it DOES change the state of the
3997 * parser, do not use liberally.
3998 * This is basically similar to xmlParseLookupSequence()
3999 *
4000 * Returns the index to the current parsing point if the full sequence
4001 * is available, -1 otherwise.
4002 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004003static int
Owen Taylor3473f882001-02-23 17:55:21 +00004004htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4005 xmlChar next, xmlChar third) {
4006 int base, len;
4007 htmlParserInputPtr in;
4008 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004009 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004010
4011 in = ctxt->input;
4012 if (in == NULL) return(-1);
4013 base = in->cur - in->base;
4014 if (base < 0) return(-1);
4015 if (ctxt->checkIndex > base)
4016 base = ctxt->checkIndex;
4017 if (in->buf == NULL) {
4018 buf = in->base;
4019 len = in->length;
4020 } else {
4021 buf = in->buf->buffer->content;
4022 len = in->buf->buffer->use;
4023 }
4024 /* take into account the sequence length */
4025 if (third) len -= 2;
4026 else if (next) len --;
4027 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004028 if (!incomment && (base + 4 < len)) {
4029 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4030 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4031 incomment = 1;
4032 }
4033 /* do not increment base, some people use <!--> */
4034 }
4035 if (incomment) {
4036 if (base + 3 < len)
4037 return(-1);
4038 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4039 (buf[base + 2] == '>')) {
4040 incomment = 0;
4041 base += 2;
4042 }
4043 continue;
4044 }
Owen Taylor3473f882001-02-23 17:55:21 +00004045 if (buf[base] == first) {
4046 if (third != 0) {
4047 if ((buf[base + 1] != next) ||
4048 (buf[base + 2] != third)) continue;
4049 } else if (next != 0) {
4050 if (buf[base + 1] != next) continue;
4051 }
4052 ctxt->checkIndex = 0;
4053#ifdef DEBUG_PUSH
4054 if (next == 0)
4055 xmlGenericError(xmlGenericErrorContext,
4056 "HPP: lookup '%c' found at %d\n",
4057 first, base);
4058 else if (third == 0)
4059 xmlGenericError(xmlGenericErrorContext,
4060 "HPP: lookup '%c%c' found at %d\n",
4061 first, next, base);
4062 else
4063 xmlGenericError(xmlGenericErrorContext,
4064 "HPP: lookup '%c%c%c' found at %d\n",
4065 first, next, third, base);
4066#endif
4067 return(base - (in->cur - in->base));
4068 }
4069 }
4070 ctxt->checkIndex = base;
4071#ifdef DEBUG_PUSH
4072 if (next == 0)
4073 xmlGenericError(xmlGenericErrorContext,
4074 "HPP: lookup '%c' failed\n", first);
4075 else if (third == 0)
4076 xmlGenericError(xmlGenericErrorContext,
4077 "HPP: lookup '%c%c' failed\n", first, next);
4078 else
4079 xmlGenericError(xmlGenericErrorContext,
4080 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4081#endif
4082 return(-1);
4083}
4084
4085/**
4086 * htmlParseTryOrFinish:
4087 * @ctxt: an HTML parser context
4088 * @terminate: last chunk indicator
4089 *
4090 * Try to progress on parsing
4091 *
4092 * Returns zero if no parsing was possible
4093 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004094static int
Owen Taylor3473f882001-02-23 17:55:21 +00004095htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4096 int ret = 0;
4097 htmlParserInputPtr in;
4098 int avail = 0;
4099 xmlChar cur, next;
4100
4101#ifdef DEBUG_PUSH
4102 switch (ctxt->instate) {
4103 case XML_PARSER_EOF:
4104 xmlGenericError(xmlGenericErrorContext,
4105 "HPP: try EOF\n"); break;
4106 case XML_PARSER_START:
4107 xmlGenericError(xmlGenericErrorContext,
4108 "HPP: try START\n"); break;
4109 case XML_PARSER_MISC:
4110 xmlGenericError(xmlGenericErrorContext,
4111 "HPP: try MISC\n");break;
4112 case XML_PARSER_COMMENT:
4113 xmlGenericError(xmlGenericErrorContext,
4114 "HPP: try COMMENT\n");break;
4115 case XML_PARSER_PROLOG:
4116 xmlGenericError(xmlGenericErrorContext,
4117 "HPP: try PROLOG\n");break;
4118 case XML_PARSER_START_TAG:
4119 xmlGenericError(xmlGenericErrorContext,
4120 "HPP: try START_TAG\n");break;
4121 case XML_PARSER_CONTENT:
4122 xmlGenericError(xmlGenericErrorContext,
4123 "HPP: try CONTENT\n");break;
4124 case XML_PARSER_CDATA_SECTION:
4125 xmlGenericError(xmlGenericErrorContext,
4126 "HPP: try CDATA_SECTION\n");break;
4127 case XML_PARSER_END_TAG:
4128 xmlGenericError(xmlGenericErrorContext,
4129 "HPP: try END_TAG\n");break;
4130 case XML_PARSER_ENTITY_DECL:
4131 xmlGenericError(xmlGenericErrorContext,
4132 "HPP: try ENTITY_DECL\n");break;
4133 case XML_PARSER_ENTITY_VALUE:
4134 xmlGenericError(xmlGenericErrorContext,
4135 "HPP: try ENTITY_VALUE\n");break;
4136 case XML_PARSER_ATTRIBUTE_VALUE:
4137 xmlGenericError(xmlGenericErrorContext,
4138 "HPP: try ATTRIBUTE_VALUE\n");break;
4139 case XML_PARSER_DTD:
4140 xmlGenericError(xmlGenericErrorContext,
4141 "HPP: try DTD\n");break;
4142 case XML_PARSER_EPILOG:
4143 xmlGenericError(xmlGenericErrorContext,
4144 "HPP: try EPILOG\n");break;
4145 case XML_PARSER_PI:
4146 xmlGenericError(xmlGenericErrorContext,
4147 "HPP: try PI\n");break;
4148 case XML_PARSER_SYSTEM_LITERAL:
4149 xmlGenericError(xmlGenericErrorContext,
4150 "HPP: try SYSTEM_LITERAL\n");break;
4151 }
4152#endif
4153
4154 while (1) {
4155
4156 in = ctxt->input;
4157 if (in == NULL) break;
4158 if (in->buf == NULL)
4159 avail = in->length - (in->cur - in->base);
4160 else
4161 avail = in->buf->buffer->use - (in->cur - in->base);
4162 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004163 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004164 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4165 /*
4166 * SAX: end of the document processing.
4167 */
4168 ctxt->instate = XML_PARSER_EOF;
4169 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4170 ctxt->sax->endDocument(ctxt->userData);
4171 }
4172 }
4173 if (avail < 1)
4174 goto done;
4175 switch (ctxt->instate) {
4176 case XML_PARSER_EOF:
4177 /*
4178 * Document parsing is done !
4179 */
4180 goto done;
4181 case XML_PARSER_START:
4182 /*
4183 * Very first chars read from the document flow.
4184 */
4185 cur = in->cur[0];
4186 if (IS_BLANK(cur)) {
4187 SKIP_BLANKS;
4188 if (in->buf == NULL)
4189 avail = in->length - (in->cur - in->base);
4190 else
4191 avail = in->buf->buffer->use - (in->cur - in->base);
4192 }
4193 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4194 ctxt->sax->setDocumentLocator(ctxt->userData,
4195 &xmlDefaultSAXLocator);
4196 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4197 (!ctxt->disableSAX))
4198 ctxt->sax->startDocument(ctxt->userData);
4199
4200 cur = in->cur[0];
4201 next = in->cur[1];
4202 if ((cur == '<') && (next == '!') &&
4203 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4204 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4205 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4206 (UPP(8) == 'E')) {
4207 if ((!terminate) &&
4208 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4209 goto done;
4210#ifdef DEBUG_PUSH
4211 xmlGenericError(xmlGenericErrorContext,
4212 "HPP: Parsing internal subset\n");
4213#endif
4214 htmlParseDocTypeDecl(ctxt);
4215 ctxt->instate = XML_PARSER_PROLOG;
4216#ifdef DEBUG_PUSH
4217 xmlGenericError(xmlGenericErrorContext,
4218 "HPP: entering PROLOG\n");
4219#endif
4220 } else {
4221 ctxt->instate = XML_PARSER_MISC;
4222 }
4223#ifdef DEBUG_PUSH
4224 xmlGenericError(xmlGenericErrorContext,
4225 "HPP: entering MISC\n");
4226#endif
4227 break;
4228 case XML_PARSER_MISC:
4229 SKIP_BLANKS;
4230 if (in->buf == NULL)
4231 avail = in->length - (in->cur - in->base);
4232 else
4233 avail = in->buf->buffer->use - (in->cur - in->base);
4234 if (avail < 2)
4235 goto done;
4236 cur = in->cur[0];
4237 next = in->cur[1];
4238 if ((cur == '<') && (next == '!') &&
4239 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4240 if ((!terminate) &&
4241 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4242 goto done;
4243#ifdef DEBUG_PUSH
4244 xmlGenericError(xmlGenericErrorContext,
4245 "HPP: Parsing Comment\n");
4246#endif
4247 htmlParseComment(ctxt);
4248 ctxt->instate = XML_PARSER_MISC;
4249 } else if ((cur == '<') && (next == '!') &&
4250 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4251 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4252 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4253 (UPP(8) == 'E')) {
4254 if ((!terminate) &&
4255 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4256 goto done;
4257#ifdef DEBUG_PUSH
4258 xmlGenericError(xmlGenericErrorContext,
4259 "HPP: Parsing internal subset\n");
4260#endif
4261 htmlParseDocTypeDecl(ctxt);
4262 ctxt->instate = XML_PARSER_PROLOG;
4263#ifdef DEBUG_PUSH
4264 xmlGenericError(xmlGenericErrorContext,
4265 "HPP: entering PROLOG\n");
4266#endif
4267 } else if ((cur == '<') && (next == '!') &&
4268 (avail < 9)) {
4269 goto done;
4270 } else {
4271 ctxt->instate = XML_PARSER_START_TAG;
4272#ifdef DEBUG_PUSH
4273 xmlGenericError(xmlGenericErrorContext,
4274 "HPP: entering START_TAG\n");
4275#endif
4276 }
4277 break;
4278 case XML_PARSER_PROLOG:
4279 SKIP_BLANKS;
4280 if (in->buf == NULL)
4281 avail = in->length - (in->cur - in->base);
4282 else
4283 avail = in->buf->buffer->use - (in->cur - in->base);
4284 if (avail < 2)
4285 goto done;
4286 cur = in->cur[0];
4287 next = in->cur[1];
4288 if ((cur == '<') && (next == '!') &&
4289 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4290 if ((!terminate) &&
4291 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4292 goto done;
4293#ifdef DEBUG_PUSH
4294 xmlGenericError(xmlGenericErrorContext,
4295 "HPP: Parsing Comment\n");
4296#endif
4297 htmlParseComment(ctxt);
4298 ctxt->instate = XML_PARSER_PROLOG;
4299 } else if ((cur == '<') && (next == '!') &&
4300 (avail < 4)) {
4301 goto done;
4302 } else {
4303 ctxt->instate = XML_PARSER_START_TAG;
4304#ifdef DEBUG_PUSH
4305 xmlGenericError(xmlGenericErrorContext,
4306 "HPP: entering START_TAG\n");
4307#endif
4308 }
4309 break;
4310 case XML_PARSER_EPILOG:
4311 if (in->buf == NULL)
4312 avail = in->length - (in->cur - in->base);
4313 else
4314 avail = in->buf->buffer->use - (in->cur - in->base);
4315 if (avail < 1)
4316 goto done;
4317 cur = in->cur[0];
4318 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004319 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004320 goto done;
4321 }
4322 if (avail < 2)
4323 goto done;
4324 next = in->cur[1];
4325 if ((cur == '<') && (next == '!') &&
4326 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4327 if ((!terminate) &&
4328 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4329 goto done;
4330#ifdef DEBUG_PUSH
4331 xmlGenericError(xmlGenericErrorContext,
4332 "HPP: Parsing Comment\n");
4333#endif
4334 htmlParseComment(ctxt);
4335 ctxt->instate = XML_PARSER_EPILOG;
4336 } else if ((cur == '<') && (next == '!') &&
4337 (avail < 4)) {
4338 goto done;
4339 } else {
4340 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004341 ctxt->wellFormed = 0;
4342 ctxt->instate = XML_PARSER_EOF;
4343#ifdef DEBUG_PUSH
4344 xmlGenericError(xmlGenericErrorContext,
4345 "HPP: entering EOF\n");
4346#endif
4347 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4348 ctxt->sax->endDocument(ctxt->userData);
4349 goto done;
4350 }
4351 break;
4352 case XML_PARSER_START_TAG: {
4353 xmlChar *name, *oldname;
4354 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004355 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004356
4357 if (avail < 2)
4358 goto done;
4359 cur = in->cur[0];
4360 if (cur != '<') {
4361 ctxt->instate = XML_PARSER_CONTENT;
4362#ifdef DEBUG_PUSH
4363 xmlGenericError(xmlGenericErrorContext,
4364 "HPP: entering CONTENT\n");
4365#endif
4366 break;
4367 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004368 if (in->cur[1] == '/') {
4369 ctxt->instate = XML_PARSER_END_TAG;
4370 ctxt->checkIndex = 0;
4371#ifdef DEBUG_PUSH
4372 xmlGenericError(xmlGenericErrorContext,
4373 "HPP: entering END_TAG\n");
4374#endif
4375 break;
4376 }
Owen Taylor3473f882001-02-23 17:55:21 +00004377 if ((!terminate) &&
4378 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4379 goto done;
4380
4381 oldname = xmlStrdup(ctxt->name);
4382 htmlParseStartTag(ctxt);
4383 name = ctxt->name;
4384#ifdef DEBUG
4385 if (oldname == NULL)
4386 xmlGenericError(xmlGenericErrorContext,
4387 "Start of element %s\n", name);
4388 else if (name == NULL)
4389 xmlGenericError(xmlGenericErrorContext,
4390 "Start of element failed, was %s\n",
4391 oldname);
4392 else
4393 xmlGenericError(xmlGenericErrorContext,
4394 "Start of element %s, was %s\n",
4395 name, oldname);
4396#endif
4397 if (((depth == ctxt->nameNr) &&
4398 (xmlStrEqual(oldname, ctxt->name))) ||
4399 (name == NULL)) {
4400 if (CUR == '>')
4401 NEXT;
4402 if (oldname != NULL)
4403 xmlFree(oldname);
4404 break;
4405 }
4406 if (oldname != NULL)
4407 xmlFree(oldname);
4408
4409 /*
4410 * Lookup the info for that element.
4411 */
4412 info = htmlTagLookup(name);
4413 if (info == NULL) {
4414 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4415 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4416 name);
4417 ctxt->wellFormed = 0;
4418 } else if (info->depr) {
4419 /***************************
4420 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4421 ctxt->sax->warning(ctxt->userData,
4422 "Tag %s is deprecated\n",
4423 name);
4424 ***************************/
4425 }
4426
4427 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004428 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004429 */
4430 if ((CUR == '/') && (NXT(1) == '>')) {
4431 SKIP(2);
4432 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4433 ctxt->sax->endElement(ctxt->userData, name);
4434 oldname = htmlnamePop(ctxt);
4435#ifdef DEBUG
4436 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4437 oldname);
4438#endif
4439 if (oldname != NULL)
4440 xmlFree(oldname);
4441 ctxt->instate = XML_PARSER_CONTENT;
4442#ifdef DEBUG_PUSH
4443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: entering CONTENT\n");
4445#endif
4446 break;
4447 }
4448
4449 if (CUR == '>') {
4450 NEXT;
4451 } else {
4452 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4453 ctxt->sax->error(ctxt->userData,
4454 "Couldn't find end of Start Tag %s\n",
4455 name);
4456 ctxt->wellFormed = 0;
4457
4458 /*
4459 * end of parsing of this node.
4460 */
4461 if (xmlStrEqual(name, ctxt->name)) {
4462 nodePop(ctxt);
4463 oldname = htmlnamePop(ctxt);
4464#ifdef DEBUG
4465 xmlGenericError(xmlGenericErrorContext,
4466 "End of start tag problem: popping out %s\n", oldname);
4467#endif
4468 if (oldname != NULL)
4469 xmlFree(oldname);
4470 }
4471
4472 ctxt->instate = XML_PARSER_CONTENT;
4473#ifdef DEBUG_PUSH
4474 xmlGenericError(xmlGenericErrorContext,
4475 "HPP: entering CONTENT\n");
4476#endif
4477 break;
4478 }
4479
4480 /*
4481 * Check for an Empty Element from DTD definition
4482 */
4483 if ((info != NULL) && (info->empty)) {
4484 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4485 ctxt->sax->endElement(ctxt->userData, name);
4486 oldname = htmlnamePop(ctxt);
4487#ifdef DEBUG
4488 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4489#endif
4490 if (oldname != NULL)
4491 xmlFree(oldname);
4492 }
4493 ctxt->instate = XML_PARSER_CONTENT;
4494#ifdef DEBUG_PUSH
4495 xmlGenericError(xmlGenericErrorContext,
4496 "HPP: entering CONTENT\n");
4497#endif
4498 break;
4499 }
4500 case XML_PARSER_CONTENT: {
4501 long cons;
4502 /*
4503 * Handle preparsed entities and charRef
4504 */
4505 if (ctxt->token != 0) {
4506 xmlChar chr[2] = { 0 , 0 } ;
4507
4508 chr[0] = (xmlChar) ctxt->token;
4509 htmlCheckParagraph(ctxt);
4510 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4511 ctxt->sax->characters(ctxt->userData, chr, 1);
4512 ctxt->token = 0;
4513 ctxt->checkIndex = 0;
4514 }
4515 if ((avail == 1) && (terminate)) {
4516 cur = in->cur[0];
4517 if ((cur != '<') && (cur != '&')) {
4518 if (ctxt->sax != NULL) {
4519 if (IS_BLANK(cur)) {
4520 if (ctxt->sax->ignorableWhitespace != NULL)
4521 ctxt->sax->ignorableWhitespace(
4522 ctxt->userData, &cur, 1);
4523 } else {
4524 htmlCheckParagraph(ctxt);
4525 if (ctxt->sax->characters != NULL)
4526 ctxt->sax->characters(
4527 ctxt->userData, &cur, 1);
4528 }
4529 }
4530 ctxt->token = 0;
4531 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004532 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004533 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004534 }
Owen Taylor3473f882001-02-23 17:55:21 +00004535 }
4536 if (avail < 2)
4537 goto done;
4538 cur = in->cur[0];
4539 next = in->cur[1];
4540 cons = ctxt->nbChars;
4541 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4542 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4543 /*
4544 * Handle SCRIPT/STYLE separately
4545 */
4546 if ((!terminate) &&
4547 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4548 goto done;
4549 htmlParseScript(ctxt);
4550 if ((cur == '<') && (next == '/')) {
4551 ctxt->instate = XML_PARSER_END_TAG;
4552 ctxt->checkIndex = 0;
4553#ifdef DEBUG_PUSH
4554 xmlGenericError(xmlGenericErrorContext,
4555 "HPP: entering END_TAG\n");
4556#endif
4557 break;
4558 }
4559 } else {
4560 /*
4561 * Sometimes DOCTYPE arrives in the middle of the document
4562 */
4563 if ((cur == '<') && (next == '!') &&
4564 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4565 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4566 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4567 (UPP(8) == 'E')) {
4568 if ((!terminate) &&
4569 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4570 goto done;
4571 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4572 ctxt->sax->error(ctxt->userData,
4573 "Misplaced DOCTYPE declaration\n");
4574 ctxt->wellFormed = 0;
4575 htmlParseDocTypeDecl(ctxt);
4576 } else if ((cur == '<') && (next == '!') &&
4577 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4578 if ((!terminate) &&
4579 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4580 goto done;
4581#ifdef DEBUG_PUSH
4582 xmlGenericError(xmlGenericErrorContext,
4583 "HPP: Parsing Comment\n");
4584#endif
4585 htmlParseComment(ctxt);
4586 ctxt->instate = XML_PARSER_CONTENT;
4587 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4588 goto done;
4589 } else if ((cur == '<') && (next == '/')) {
4590 ctxt->instate = XML_PARSER_END_TAG;
4591 ctxt->checkIndex = 0;
4592#ifdef DEBUG_PUSH
4593 xmlGenericError(xmlGenericErrorContext,
4594 "HPP: entering END_TAG\n");
4595#endif
4596 break;
4597 } else if (cur == '<') {
4598 ctxt->instate = XML_PARSER_START_TAG;
4599 ctxt->checkIndex = 0;
4600#ifdef DEBUG_PUSH
4601 xmlGenericError(xmlGenericErrorContext,
4602 "HPP: entering START_TAG\n");
4603#endif
4604 break;
4605 } else if (cur == '&') {
4606 if ((!terminate) &&
4607 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4608 goto done;
4609#ifdef DEBUG_PUSH
4610 xmlGenericError(xmlGenericErrorContext,
4611 "HPP: Parsing Reference\n");
4612#endif
4613 /* TODO: check generation of subtrees if noent !!! */
4614 htmlParseReference(ctxt);
4615 } else {
4616 /* TODO Avoid the extra copy, handle directly !!!!!! */
4617 /*
Daniel Veillard01c13b52002-12-10 15:19:08 +00004618 * Goal of the following test is:
Owen Taylor3473f882001-02-23 17:55:21 +00004619 * - minimize calls to the SAX 'character' callback
4620 * when they are mergeable
4621 */
4622 if ((ctxt->inputNr == 1) &&
4623 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4624 if ((!terminate) &&
4625 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4626 goto done;
4627 }
4628 ctxt->checkIndex = 0;
4629#ifdef DEBUG_PUSH
4630 xmlGenericError(xmlGenericErrorContext,
4631 "HPP: Parsing char data\n");
4632#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004633 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004634 }
4635 }
4636 if (cons == ctxt->nbChars) {
4637 if (ctxt->node != NULL) {
4638 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4639 ctxt->sax->error(ctxt->userData,
4640 "detected an error in element content\n");
4641 ctxt->wellFormed = 0;
4642 }
4643 NEXT;
4644 break;
4645 }
4646
4647 break;
4648 }
4649 case XML_PARSER_END_TAG:
4650 if (avail < 2)
4651 goto done;
4652 if ((!terminate) &&
4653 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4654 goto done;
4655 htmlParseEndTag(ctxt);
4656 if (ctxt->nameNr == 0) {
4657 ctxt->instate = XML_PARSER_EPILOG;
4658 } else {
4659 ctxt->instate = XML_PARSER_CONTENT;
4660 }
4661 ctxt->checkIndex = 0;
4662#ifdef DEBUG_PUSH
4663 xmlGenericError(xmlGenericErrorContext,
4664 "HPP: entering CONTENT\n");
4665#endif
4666 break;
4667 case XML_PARSER_CDATA_SECTION:
4668 xmlGenericError(xmlGenericErrorContext,
4669 "HPP: internal error, state == CDATA\n");
4670 ctxt->instate = XML_PARSER_CONTENT;
4671 ctxt->checkIndex = 0;
4672#ifdef DEBUG_PUSH
4673 xmlGenericError(xmlGenericErrorContext,
4674 "HPP: entering CONTENT\n");
4675#endif
4676 break;
4677 case XML_PARSER_DTD:
4678 xmlGenericError(xmlGenericErrorContext,
4679 "HPP: internal error, state == DTD\n");
4680 ctxt->instate = XML_PARSER_CONTENT;
4681 ctxt->checkIndex = 0;
4682#ifdef DEBUG_PUSH
4683 xmlGenericError(xmlGenericErrorContext,
4684 "HPP: entering CONTENT\n");
4685#endif
4686 break;
4687 case XML_PARSER_COMMENT:
4688 xmlGenericError(xmlGenericErrorContext,
4689 "HPP: internal error, state == COMMENT\n");
4690 ctxt->instate = XML_PARSER_CONTENT;
4691 ctxt->checkIndex = 0;
4692#ifdef DEBUG_PUSH
4693 xmlGenericError(xmlGenericErrorContext,
4694 "HPP: entering CONTENT\n");
4695#endif
4696 break;
4697 case XML_PARSER_PI:
4698 xmlGenericError(xmlGenericErrorContext,
4699 "HPP: internal error, state == PI\n");
4700 ctxt->instate = XML_PARSER_CONTENT;
4701 ctxt->checkIndex = 0;
4702#ifdef DEBUG_PUSH
4703 xmlGenericError(xmlGenericErrorContext,
4704 "HPP: entering CONTENT\n");
4705#endif
4706 break;
4707 case XML_PARSER_ENTITY_DECL:
4708 xmlGenericError(xmlGenericErrorContext,
4709 "HPP: internal error, state == ENTITY_DECL\n");
4710 ctxt->instate = XML_PARSER_CONTENT;
4711 ctxt->checkIndex = 0;
4712#ifdef DEBUG_PUSH
4713 xmlGenericError(xmlGenericErrorContext,
4714 "HPP: entering CONTENT\n");
4715#endif
4716 break;
4717 case XML_PARSER_ENTITY_VALUE:
4718 xmlGenericError(xmlGenericErrorContext,
4719 "HPP: internal error, state == ENTITY_VALUE\n");
4720 ctxt->instate = XML_PARSER_CONTENT;
4721 ctxt->checkIndex = 0;
4722#ifdef DEBUG_PUSH
4723 xmlGenericError(xmlGenericErrorContext,
4724 "HPP: entering DTD\n");
4725#endif
4726 break;
4727 case XML_PARSER_ATTRIBUTE_VALUE:
4728 xmlGenericError(xmlGenericErrorContext,
4729 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4730 ctxt->instate = XML_PARSER_START_TAG;
4731 ctxt->checkIndex = 0;
4732#ifdef DEBUG_PUSH
4733 xmlGenericError(xmlGenericErrorContext,
4734 "HPP: entering START_TAG\n");
4735#endif
4736 break;
4737 case XML_PARSER_SYSTEM_LITERAL:
4738 xmlGenericError(xmlGenericErrorContext,
4739 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4740 ctxt->instate = XML_PARSER_CONTENT;
4741 ctxt->checkIndex = 0;
4742#ifdef DEBUG_PUSH
4743 xmlGenericError(xmlGenericErrorContext,
4744 "HPP: entering CONTENT\n");
4745#endif
4746 break;
4747 case XML_PARSER_IGNORE:
4748 xmlGenericError(xmlGenericErrorContext,
4749 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4750 ctxt->instate = XML_PARSER_CONTENT;
4751 ctxt->checkIndex = 0;
4752#ifdef DEBUG_PUSH
4753 xmlGenericError(xmlGenericErrorContext,
4754 "HPP: entering CONTENT\n");
4755#endif
4756 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004757 case XML_PARSER_PUBLIC_LITERAL:
4758 xmlGenericError(xmlGenericErrorContext,
4759 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4760 ctxt->instate = XML_PARSER_CONTENT;
4761 ctxt->checkIndex = 0;
4762#ifdef DEBUG_PUSH
4763 xmlGenericError(xmlGenericErrorContext,
4764 "HPP: entering CONTENT\n");
4765#endif
4766 break;
4767
Owen Taylor3473f882001-02-23 17:55:21 +00004768 }
4769 }
4770done:
4771 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004772 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004773 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4774 /*
4775 * SAX: end of the document processing.
4776 */
4777 ctxt->instate = XML_PARSER_EOF;
4778 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4779 ctxt->sax->endDocument(ctxt->userData);
4780 }
4781 }
4782 if ((ctxt->myDoc != NULL) &&
4783 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4784 (ctxt->instate == XML_PARSER_EPILOG))) {
4785 xmlDtdPtr dtd;
4786 dtd = xmlGetIntSubset(ctxt->myDoc);
4787 if (dtd == NULL)
4788 ctxt->myDoc->intSubset =
4789 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4790 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4791 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4792 }
4793#ifdef DEBUG_PUSH
4794 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4795#endif
4796 return(ret);
4797}
4798
4799/**
Owen Taylor3473f882001-02-23 17:55:21 +00004800 * htmlParseChunk:
4801 * @ctxt: an XML parser context
4802 * @chunk: an char array
4803 * @size: the size in byte of the chunk
4804 * @terminate: last chunk indicator
4805 *
4806 * Parse a Chunk of memory
4807 *
4808 * Returns zero if no error, the xmlParserErrors otherwise.
4809 */
4810int
4811htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4812 int terminate) {
4813 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4814 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4815 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4816 int cur = ctxt->input->cur - ctxt->input->base;
4817
4818 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4819 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4820 ctxt->input->cur = ctxt->input->base + cur;
4821#ifdef DEBUG_PUSH
4822 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4823#endif
4824
4825 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4826 htmlParseTryOrFinish(ctxt, terminate);
4827 } else if (ctxt->instate != XML_PARSER_EOF) {
4828 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4829 htmlParseTryOrFinish(ctxt, terminate);
4830 }
4831 if (terminate) {
4832 if ((ctxt->instate != XML_PARSER_EOF) &&
4833 (ctxt->instate != XML_PARSER_EPILOG) &&
4834 (ctxt->instate != XML_PARSER_MISC)) {
4835 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004836 ctxt->wellFormed = 0;
4837 }
4838 if (ctxt->instate != XML_PARSER_EOF) {
4839 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4840 ctxt->sax->endDocument(ctxt->userData);
4841 }
4842 ctxt->instate = XML_PARSER_EOF;
4843 }
4844 return((xmlParserErrors) ctxt->errNo);
4845}
4846
4847/************************************************************************
4848 * *
4849 * User entry points *
4850 * *
4851 ************************************************************************/
4852
4853/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004854 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004855 * @sax: a SAX handler
4856 * @user_data: The user data returned on SAX callbacks
4857 * @chunk: a pointer to an array of chars
4858 * @size: number of chars in the array
4859 * @filename: an optional file name or URI
4860 * @enc: an optional encoding
4861 *
4862 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00004863 * The value of @filename is used for fetching external entities
4864 * and error/warning reports.
4865 *
4866 * Returns the new parser context or NULL
4867 */
4868htmlParserCtxtPtr
4869htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4870 const char *chunk, int size, const char *filename,
4871 xmlCharEncoding enc) {
4872 htmlParserCtxtPtr ctxt;
4873 htmlParserInputPtr inputStream;
4874 xmlParserInputBufferPtr buf;
4875
Daniel Veillardd0463562001-10-13 09:15:48 +00004876 xmlInitParser();
4877
Owen Taylor3473f882001-02-23 17:55:21 +00004878 buf = xmlAllocParserInputBuffer(enc);
4879 if (buf == NULL) return(NULL);
4880
4881 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4882 if (ctxt == NULL) {
4883 xmlFree(buf);
4884 return(NULL);
4885 }
4886 memset(ctxt, 0, sizeof(htmlParserCtxt));
4887 htmlInitParserCtxt(ctxt);
4888 if (sax != NULL) {
4889 if (ctxt->sax != &htmlDefaultSAXHandler)
4890 xmlFree(ctxt->sax);
4891 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4892 if (ctxt->sax == NULL) {
4893 xmlFree(buf);
4894 xmlFree(ctxt);
4895 return(NULL);
4896 }
4897 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4898 if (user_data != NULL)
4899 ctxt->userData = user_data;
4900 }
4901 if (filename == NULL) {
4902 ctxt->directory = NULL;
4903 } else {
4904 ctxt->directory = xmlParserGetDirectory(filename);
4905 }
4906
4907 inputStream = htmlNewInputStream(ctxt);
4908 if (inputStream == NULL) {
4909 xmlFreeParserCtxt(ctxt);
4910 return(NULL);
4911 }
4912
4913 if (filename == NULL)
4914 inputStream->filename = NULL;
4915 else
4916 inputStream->filename = xmlMemStrdup(filename);
4917 inputStream->buf = buf;
4918 inputStream->base = inputStream->buf->buffer->content;
4919 inputStream->cur = inputStream->buf->buffer->content;
4920
4921 inputPush(ctxt, inputStream);
4922
4923 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4924 (ctxt->input->buf != NULL)) {
4925 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4926#ifdef DEBUG_PUSH
4927 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4928#endif
4929 }
4930
4931 return(ctxt);
4932}
4933
4934/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004935 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00004936 * @cur: a pointer to an array of xmlChar
4937 * @encoding: a free form C string describing the HTML document encoding, or NULL
4938 * @sax: the SAX handler block
4939 * @userData: if using SAX, this pointer will be provided on callbacks.
4940 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004941 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4942 * to handle parse events. If sax is NULL, fallback to the default DOM
4943 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004944 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004945 * Returns the resulting document tree unless SAX is NULL or the document is
4946 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004947 */
4948
4949htmlDocPtr
4950htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4951 htmlDocPtr ret;
4952 htmlParserCtxtPtr ctxt;
4953
Daniel Veillardd0463562001-10-13 09:15:48 +00004954 xmlInitParser();
4955
Owen Taylor3473f882001-02-23 17:55:21 +00004956 if (cur == NULL) return(NULL);
4957
4958
4959 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4960 if (ctxt == NULL) return(NULL);
4961 if (sax != NULL) {
4962 ctxt->sax = sax;
4963 ctxt->userData = userData;
4964 }
4965
4966 htmlParseDocument(ctxt);
4967 ret = ctxt->myDoc;
4968 if (sax != NULL) {
4969 ctxt->sax = NULL;
4970 ctxt->userData = NULL;
4971 }
4972 htmlFreeParserCtxt(ctxt);
4973
4974 return(ret);
4975}
4976
4977/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004978 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00004979 * @cur: a pointer to an array of xmlChar
4980 * @encoding: a free form C string describing the HTML document encoding, or NULL
4981 *
4982 * parse an HTML in-memory document and build a tree.
4983 *
4984 * Returns the resulting document tree
4985 */
4986
4987htmlDocPtr
4988htmlParseDoc(xmlChar *cur, const char *encoding) {
4989 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4990}
4991
4992
4993/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004994 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004995 * @filename: the filename
4996 * @encoding: a free form C string describing the HTML document encoding, or NULL
4997 *
4998 * Create a parser context for a file content.
4999 * Automatic support for ZLIB/Compress compressed document is provided
5000 * by default if found at compile-time.
5001 *
5002 * Returns the new parser context or NULL
5003 */
5004htmlParserCtxtPtr
5005htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5006{
5007 htmlParserCtxtPtr ctxt;
5008 htmlParserInputPtr inputStream;
5009 xmlParserInputBufferPtr buf;
5010 /* htmlCharEncoding enc; */
5011 xmlChar *content, *content_line = (xmlChar *) "charset=";
5012
5013 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
5014 if (buf == NULL) return(NULL);
5015
5016 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5017 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005018 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005019 return(NULL);
5020 }
5021 memset(ctxt, 0, sizeof(htmlParserCtxt));
5022 htmlInitParserCtxt(ctxt);
5023 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
5024 if (inputStream == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005025 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005026 xmlFree(ctxt);
5027 return(NULL);
5028 }
5029 memset(inputStream, 0, sizeof(htmlParserInput));
5030
Daniel Veillarda646cfd2002-09-17 21:50:03 +00005031 inputStream->filename = (char *)
5032 xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005033 inputStream->line = 1;
5034 inputStream->col = 1;
5035 inputStream->buf = buf;
5036 inputStream->directory = NULL;
5037
5038 inputStream->base = inputStream->buf->buffer->content;
5039 inputStream->cur = inputStream->buf->buffer->content;
5040 inputStream->free = NULL;
5041
5042 inputPush(ctxt, inputStream);
5043
5044 /* set encoding */
5045 if (encoding) {
5046 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
5047 if (content) {
5048 strcpy ((char *)content, (char *)content_line);
5049 strcat ((char *)content, (char *)encoding);
5050 htmlCheckEncoding (ctxt, content);
5051 xmlFree (content);
5052 }
5053 }
5054
5055 return(ctxt);
5056}
5057
5058/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005059 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005060 * @filename: the filename
5061 * @encoding: a free form C string describing the HTML document encoding, or NULL
5062 * @sax: the SAX handler block
5063 * @userData: if using SAX, this pointer will be provided on callbacks.
5064 *
5065 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5066 * compressed document is provided by default if found at compile-time.
5067 * It use the given SAX function block to handle the parsing callback.
5068 * If sax is NULL, fallback to the default DOM tree building routines.
5069 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005070 * Returns the resulting document tree unless SAX is NULL or the document is
5071 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005072 */
5073
5074htmlDocPtr
5075htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5076 void *userData) {
5077 htmlDocPtr ret;
5078 htmlParserCtxtPtr ctxt;
5079 htmlSAXHandlerPtr oldsax = NULL;
5080
Daniel Veillardd0463562001-10-13 09:15:48 +00005081 xmlInitParser();
5082
Owen Taylor3473f882001-02-23 17:55:21 +00005083 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5084 if (ctxt == NULL) return(NULL);
5085 if (sax != NULL) {
5086 oldsax = ctxt->sax;
5087 ctxt->sax = sax;
5088 ctxt->userData = userData;
5089 }
5090
5091 htmlParseDocument(ctxt);
5092
5093 ret = ctxt->myDoc;
5094 if (sax != NULL) {
5095 ctxt->sax = oldsax;
5096 ctxt->userData = NULL;
5097 }
5098 htmlFreeParserCtxt(ctxt);
5099
5100 return(ret);
5101}
5102
5103/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005104 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005105 * @filename: the filename
5106 * @encoding: a free form C string describing the HTML document encoding, or NULL
5107 *
5108 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5109 * compressed document is provided by default if found at compile-time.
5110 *
5111 * Returns the resulting document tree
5112 */
5113
5114htmlDocPtr
5115htmlParseFile(const char *filename, const char *encoding) {
5116 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5117}
5118
5119/**
5120 * htmlHandleOmittedElem:
5121 * @val: int 0 or 1
5122 *
5123 * Set and return the previous value for handling HTML omitted tags.
5124 *
5125 * Returns the last value for 0 for no handling, 1 for auto insertion.
5126 */
5127
5128int
5129htmlHandleOmittedElem(int val) {
5130 int old = htmlOmittedDefaultValue;
5131
5132 htmlOmittedDefaultValue = val;
5133 return(old);
5134}
5135
5136#endif /* LIBXML_HTML_ENABLED */