blob: bc437ed8c7b453c3022c35f79f3562f55a154df0 [file] [log] [blame]
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
/*
 * Size constants for the HTML parser. Their use sites are outside this
 * part of the file.
 * NOTE(review): presumably a name length limit and the sizes of
 * temporary parsing buffers -- confirm against the users.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to compile in the xmlGenericError() tracing under #ifdef DEBUG */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * NOTE(review): presumably selects how omitted/implied content is handled;
 * defaults to 1 -- confirm at the use site.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations; the definitions are not in this part of the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
/************************************************************************
 *                                                                      *
 *              Parser stacks related functions and macros              *
 *                                                                      *
 ************************************************************************/
64
Daniel Veillard1c732d22002-11-30 11:22:59 +000065/**
66 * htmlnamePush:
67 * @ctxt: an HTML parser context
68 * @value: the element name
69 *
70 * Pushes a new element name on top of the name stack
71 *
72 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000073 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000074static int
75htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
76{
77 if (ctxt->nameNr >= ctxt->nameMax) {
78 ctxt->nameMax *= 2;
79 ctxt->nameTab =
80 (xmlChar * *)xmlRealloc(ctxt->nameTab,
81 ctxt->nameMax *
82 sizeof(ctxt->nameTab[0]));
83 if (ctxt->nameTab == NULL) {
84 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
85 return (0);
86 }
87 }
88 ctxt->nameTab[ctxt->nameNr] = value;
89 ctxt->name = value;
90 return (ctxt->nameNr++);
91}
92/**
93 * htmlnamePop:
94 * @ctxt: an HTML parser context
95 *
96 * Pops the top element name from the name stack
97 *
98 * Returns the name just removed
99 */
100static xmlChar *
101htmlnamePop(htmlParserCtxtPtr ctxt)
102{
103 xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000104
Daniel Veillard1c732d22002-11-30 11:22:59 +0000105 if (ctxt->nameNr <= 0)
106 return (0);
107 ctxt->nameNr--;
108 if (ctxt->nameNr < 0)
109 return (0);
110 if (ctxt->nameNr > 0)
111 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
112 else
113 ctxt->name = NULL;
114 ret = ctxt->nameTab[ctxt->nameNr];
115 ctxt->nameTab[ctxt->nameNr] = 0;
116 return (ret);
117}
Owen Taylor3473f882001-02-23 17:55:21 +0000118
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt' (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser. The argument is evaluated twice.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* Current raw byte, or -1 while a pushed-back token is pending */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Advance by l bytes, counted as a single character: updates line/col
 * from the byte being left, clears any pending token.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append char v (encoded on l bytes) to buffer b at index i, advancing i.
 * Kept as a bare if/else so the expansion contract for existing users is
 * unchanged; always terminate the use with a ';'.
 */
#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
195
196/**
197 * htmlCurrentChar:
198 * @ctxt: the HTML parser context
199 * @len: pointer to the length of the char read
200 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000201 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000202 * bytes in the input buffer. Implement the end of line normalization:
203 * 2.11 End-of-Line Handling
204 * If the encoding is unspecified, in the case we find an ISO-Latin-1
205 * char, then the encoding converter is plugged in automatically.
206 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000207 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000208 */
209
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000210static int
Owen Taylor3473f882001-02-23 17:55:21 +0000211htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
212 if (ctxt->instate == XML_PARSER_EOF)
213 return(0);
214
215 if (ctxt->token != 0) {
216 *len = 0;
217 return(ctxt->token);
218 }
219 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
220 /*
221 * We are supposed to handle UTF8, check it's valid
222 * From rfc2044: encoding of the Unicode values on UTF-8:
223 *
224 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
225 * 0000 0000-0000 007F 0xxxxxxx
226 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
227 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
228 *
229 * Check for the 0x110000 limit too
230 */
231 const unsigned char *cur = ctxt->input->cur;
232 unsigned char c;
233 unsigned int val;
234
235 c = *cur;
236 if (c & 0x80) {
237 if (cur[1] == 0)
238 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
239 if ((cur[1] & 0xc0) != 0x80)
240 goto encoding_error;
241 if ((c & 0xe0) == 0xe0) {
242
243 if (cur[2] == 0)
244 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
245 if ((cur[2] & 0xc0) != 0x80)
246 goto encoding_error;
247 if ((c & 0xf0) == 0xf0) {
248 if (cur[3] == 0)
249 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
250 if (((c & 0xf8) != 0xf0) ||
251 ((cur[3] & 0xc0) != 0x80))
252 goto encoding_error;
253 /* 4-byte code */
254 *len = 4;
255 val = (cur[0] & 0x7) << 18;
256 val |= (cur[1] & 0x3f) << 12;
257 val |= (cur[2] & 0x3f) << 6;
258 val |= cur[3] & 0x3f;
259 } else {
260 /* 3-byte code */
261 *len = 3;
262 val = (cur[0] & 0xf) << 12;
263 val |= (cur[1] & 0x3f) << 6;
264 val |= cur[2] & 0x3f;
265 }
266 } else {
267 /* 2-byte code */
268 *len = 2;
269 val = (cur[0] & 0x1f) << 6;
270 val |= cur[1] & 0x3f;
271 }
272 if (!IS_CHAR(val)) {
273 ctxt->errNo = XML_ERR_INVALID_ENCODING;
274 if ((ctxt->sax != NULL) &&
275 (ctxt->sax->error != NULL))
276 ctxt->sax->error(ctxt->userData,
277 "Char 0x%X out of allowed range\n", val);
278 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000279 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000280 }
281 return(val);
282 } else {
283 /* 1-byte code */
284 *len = 1;
285 return((int) *ctxt->input->cur);
286 }
287 }
288 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000289 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000290 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000291 * XML constructs only use < 128 chars
292 */
293 *len = 1;
294 if ((int) *ctxt->input->cur < 0x80)
295 return((int) *ctxt->input->cur);
296
297 /*
298 * Humm this is bad, do an automatic flow conversion
299 */
300 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
301 ctxt->charset = XML_CHAR_ENCODING_UTF8;
302 return(xmlCurrentChar(ctxt, len));
303
304encoding_error:
305 /*
306 * If we detect an UTF8 error that probably mean that the
307 * input encoding didn't get properly advertized in the
308 * declaration header. Report the error and switch the encoding
309 * to ISO-Latin-1 (if you don't like this policy, just declare the
310 * encoding !)
311 */
312 ctxt->errNo = XML_ERR_INVALID_ENCODING;
313 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
314 ctxt->sax->error(ctxt->userData,
315 "Input is not proper UTF-8, indicate encoding !\n");
316 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
317 ctxt->input->cur[0], ctxt->input->cur[1],
318 ctxt->input->cur[2], ctxt->input->cur[3]);
319 }
320
321 ctxt->charset = XML_CHAR_ENCODING_8859_1;
322 *len = 1;
323 return((int) *ctxt->input->cur);
324}
325
326/**
Owen Taylor3473f882001-02-23 17:55:21 +0000327 * htmlSkipBlankChars:
328 * @ctxt: the HTML parser context
329 *
330 * skip all blanks character found at that point in the input streams.
331 *
332 * Returns the number of space chars skipped
333 */
334
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000335static int
Owen Taylor3473f882001-02-23 17:55:21 +0000336htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
337 int res = 0;
338
339 while (IS_BLANK(*(ctxt->input->cur))) {
340 if ((*ctxt->input->cur == 0) &&
341 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
342 xmlPopInput(ctxt);
343 } else {
344 if (*(ctxt->input->cur) == '\n') {
345 ctxt->input->line++; ctxt->input->col = 1;
346 } else ctxt->input->col++;
347 ctxt->input->cur++;
348 ctxt->nbChars++;
349 if (*ctxt->input->cur == 0)
350 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
351 }
352 res++;
353 }
354 return(res);
355}
356
357
358
/************************************************************************
 *                                                                      *
 *      The list of HTML elements and their properties                  *
 *                                                                      *
 ************************************************************************/
364
365/*
366 * Start Tag: 1 means the start tag can be ommited
367 * End Tag: 1 means the end tag can be ommited
368 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000369 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000370 * Depr: this element is deprecated
371 * DTD: 1 means that this element is valid only in the Loose DTD
372 * 2 means that this element is valid only in the Frameset DTD
373 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000374 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000375 */
Daniel Veillard22090732001-07-16 00:06:07 +0000376static const htmlElemDesc
377html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000378{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
379{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
380{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
381{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
382{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
383{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
384{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
385{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
386{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
387{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
388{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
389{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
390{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
391{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
392{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
393{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
394{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
395{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
396{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
397{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
398{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
399{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
400{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
401{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
402{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
403{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
404{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
405{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
406{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
407{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
408{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
409{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
410{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
411{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
412{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
413{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
414{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
415{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
416{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
417{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
418{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
419{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
420{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
421{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
422{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
423{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
424{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
425{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
426{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
427{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
428{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
429{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
430{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
431{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
432{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
433{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
434{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
435{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
436{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
437{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
438{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
439{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
440{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
Daniel Veillardfee408f2002-11-22 13:18:30 +0000441{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph " },
Daniel Veillard02bb1702001-06-13 21:11:59 +0000442{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
443{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
444{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
445{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
446{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
447{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
448{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
449{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
450{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
451{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
452{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
453{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
454{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
455{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
456{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
457{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
458{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
459{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
460{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
461{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
462{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
463{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
464{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
465{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
466{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
467{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
468{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000469};
470
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups. The first string of
 * a group is a start tag; the remaining strings of the group are the
 * open elements which that start tag closes. An extra NULL ends the list.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
530
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
544
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator: iterate using its element count.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest" */
    "onchange",
    "onselect"
};
570
/*
 * End tag priorities, used by the HTML parser to cope with broken pages.
 * Each element gets a priority and the parser uses it to decide what to
 * do with an unexpected end tag: an end tag may only close elements whose
 * priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;		/* element name, NULL on the last entry */
    int priority;		/* its end tag priority */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000598
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000599static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000600static int htmlStartCloseIndexinitialized = 0;
601
602/************************************************************************
603 * *
604 * functions to handle HTML specific data *
605 * *
606 ************************************************************************/
607
608/**
609 * htmlInitAutoClose:
610 *
611 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
612 * This is not reentrant. Call xmlInitParser() once before processing in
613 * case of use in multithreaded programs.
614 */
615void
616htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000617 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000618
619 if (htmlStartCloseIndexinitialized) return;
620
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000621 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
622 indx = 0;
623 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
624 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000625 while (htmlStartClose[i] != NULL) i++;
626 i++;
627 }
628 htmlStartCloseIndexinitialized = 1;
629}
630
631/**
632 * htmlTagLookup:
633 * @tag: The tag name in lowercase
634 *
635 * Lookup the HTML tag in the ElementTable
636 *
637 * Returns the related htmlElemDescPtr or NULL if not found.
638 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000639const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000640htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000641 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000642
643 for (i = 0; i < (sizeof(html40ElementTable) /
644 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000645 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000646 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000647 }
648 return(NULL);
649}
650
651/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000652 * htmlGetEndPriority:
653 * @name: The name of the element to look up the priority for.
654 *
655 * Return value: The "endtag" priority.
656 **/
657static int
658htmlGetEndPriority (const xmlChar *name) {
659 int i = 0;
660
661 while ((htmlEndPriority[i].name != NULL) &&
662 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
663 i++;
664
665 return(htmlEndPriority[i].priority);
666}
667
668/**
Owen Taylor3473f882001-02-23 17:55:21 +0000669 * htmlCheckAutoClose:
670 * @newtag: The new tag name
671 * @oldtag: The old tag name
672 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000673 * Checks whether the new tag is one of the registered valid tags for
674 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000675 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
676 *
677 * Returns 0 if no, 1 if yes.
678 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000679static int
Owen Taylor3473f882001-02-23 17:55:21 +0000680htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000681 int i, indx;
682 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000683
684 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
685
686 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000687 for (indx = 0; indx < 100;indx++) {
688 closed = htmlStartCloseIndex[indx];
689 if (closed == NULL) return(0);
690 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000691 }
692
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000693 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000694 i++;
695 while (htmlStartClose[i] != NULL) {
696 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
697 return(1);
698 }
699 i++;
700 }
701 return(0);
702}
703
704/**
705 * htmlAutoCloseOnClose:
706 * @ctxt: an HTML parser context
707 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000708 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000709 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000710 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000711 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000712static void
Owen Taylor3473f882001-02-23 17:55:21 +0000713htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000714 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000715 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000716 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000717
718#ifdef DEBUG
719 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
720 for (i = 0;i < ctxt->nameNr;i++)
721 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
722#endif
723
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000724 priority = htmlGetEndPriority (newtag);
725
Owen Taylor3473f882001-02-23 17:55:21 +0000726 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000727
Owen Taylor3473f882001-02-23 17:55:21 +0000728 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000729 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000730 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000731 * or equal priority, so if we find an element with higher
732 * priority before we find an element with
733 * matching name, we just ignore this endtag
734 */
735 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000736 }
737 if (i < 0) return;
738
739 while (!xmlStrEqual(newtag, ctxt->name)) {
740 info = htmlTagLookup(ctxt->name);
741 if ((info == NULL) || (info->endTag == 1)) {
742#ifdef DEBUG
743 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
744#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000745 } else if (info->endTag == 3) {
746#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000747 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000748
Daniel Veillard56098d42001-04-24 12:51:09 +0000749#endif
750 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
751 ctxt->sax->error(ctxt->userData,
752 "Opening and ending tag mismatch: %s and %s\n",
753 newtag, ctxt->name);
754 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000755 }
756 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
757 ctxt->sax->endElement(ctxt->userData, ctxt->name);
758 oldname = htmlnamePop(ctxt);
759 if (oldname != NULL) {
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
762#endif
763 xmlFree(oldname);
764 }
765 }
766}
767
768/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000769 * htmlAutoCloseOnEnd:
770 * @ctxt: an HTML parser context
771 *
772 * Close all remaining tags at the end of the stream
773 */
774static void
775htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
776 xmlChar *oldname;
777 int i;
778
779 if (ctxt->nameNr == 0)
780 return;
781#ifdef DEBUG
782 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
783#endif
784
785 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
786#ifdef DEBUG
787 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
788#endif
789 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
790 ctxt->sax->endElement(ctxt->userData, ctxt->name);
791 oldname = htmlnamePop(ctxt);
792 if (oldname != NULL) {
793#ifdef DEBUG
794 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
795#endif
796 xmlFree(oldname);
797 }
798 }
799}
800
801/**
Owen Taylor3473f882001-02-23 17:55:21 +0000802 * htmlAutoClose:
803 * @ctxt: an HTML parser context
804 * @newtag: The new tag name or NULL
805 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000806 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000807 * The list is kept in htmlStartClose array. This function is
808 * called when a new tag has been detected and generates the
809 * appropriates closes if possible/needed.
810 * If newtag is NULL this mean we are at the end of the resource
811 * and we should check
812 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000813static void
Owen Taylor3473f882001-02-23 17:55:21 +0000814htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
815 xmlChar *oldname;
816 while ((newtag != NULL) && (ctxt->name != NULL) &&
817 (htmlCheckAutoClose(newtag, ctxt->name))) {
818#ifdef DEBUG
819 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
820#endif
821 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
822 ctxt->sax->endElement(ctxt->userData, ctxt->name);
823 oldname = htmlnamePop(ctxt);
824 if (oldname != NULL) {
825#ifdef DEBUG
826 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
827#endif
828 xmlFree(oldname);
829 }
830 }
831 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000832 htmlAutoCloseOnEnd(ctxt);
833 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000834 }
835 while ((newtag == NULL) && (ctxt->name != NULL) &&
836 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
837 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
838 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
839#ifdef DEBUG
840 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
841#endif
842 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
843 ctxt->sax->endElement(ctxt->userData, ctxt->name);
844 oldname = htmlnamePop(ctxt);
845 if (oldname != NULL) {
846#ifdef DEBUG
847 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
848#endif
849 xmlFree(oldname);
850 }
851 }
852
853}
854
855/**
856 * htmlAutoCloseTag:
857 * @doc: the HTML document
858 * @name: The tag name
859 * @elem: the HTML element
860 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000861 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000862 * The list is kept in htmlStartClose array. This function checks
863 * if the element or one of it's children would autoclose the
864 * given tag.
865 *
866 * Returns 1 if autoclose, 0 otherwise
867 */
868int
869htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
870 htmlNodePtr child;
871
872 if (elem == NULL) return(1);
873 if (xmlStrEqual(name, elem->name)) return(0);
874 if (htmlCheckAutoClose(elem->name, name)) return(1);
875 child = elem->children;
876 while (child != NULL) {
877 if (htmlAutoCloseTag(doc, name, child)) return(1);
878 child = child->next;
879 }
880 return(0);
881}
882
883/**
884 * htmlIsAutoClosed:
885 * @doc: the HTML document
886 * @elem: the HTML element
887 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000888 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000889 * The list is kept in htmlStartClose array. This function checks
890 * if a tag is autoclosed by one of it's child
891 *
892 * Returns 1 if autoclosed, 0 otherwise
893 */
894int
895htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
896 htmlNodePtr child;
897
898 if (elem == NULL) return(1);
899 child = elem->children;
900 while (child != NULL) {
901 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
902 child = child->next;
903 }
904 return(0);
905}
906
907/**
908 * htmlCheckImplied:
909 * @ctxt: an HTML parser context
910 * @newtag: The new tag name
911 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000912 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000913 * called when a new tag has been detected and generates the
914 * appropriates implicit tags if missing
915 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000916static void
Owen Taylor3473f882001-02-23 17:55:21 +0000917htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
918 if (!htmlOmittedDefaultValue)
919 return;
920 if (xmlStrEqual(newtag, BAD_CAST"html"))
921 return;
922 if (ctxt->nameNr <= 0) {
923#ifdef DEBUG
924 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
925#endif
926 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
927 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
928 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
929 }
930 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
931 return;
932 if ((ctxt->nameNr <= 1) &&
933 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
934 (xmlStrEqual(newtag, BAD_CAST"style")) ||
935 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
936 (xmlStrEqual(newtag, BAD_CAST"link")) ||
937 (xmlStrEqual(newtag, BAD_CAST"title")) ||
938 (xmlStrEqual(newtag, BAD_CAST"base")))) {
939 /*
940 * dropped OBJECT ... i you put it first BODY will be
941 * assumed !
942 */
943#ifdef DEBUG
944 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
945#endif
946 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
947 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
948 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
949 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
950 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
951 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
952 int i;
953 for (i = 0;i < ctxt->nameNr;i++) {
954 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
955 return;
956 }
957 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
958 return;
959 }
960 }
961
962#ifdef DEBUG
963 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
964#endif
965 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
966 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
967 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
968 }
969}
970
971/**
972 * htmlCheckParagraph
973 * @ctxt: an HTML parser context
974 *
975 * Check whether a p element need to be implied before inserting
976 * characters in the current element.
977 *
978 * Returns 1 if a paragraph has been inserted, 0 if not and -1
979 * in case of error.
980 */
981
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000982static int
Owen Taylor3473f882001-02-23 17:55:21 +0000983htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
984 const xmlChar *tag;
985 int i;
986
987 if (ctxt == NULL)
988 return(-1);
989 tag = ctxt->name;
990 if (tag == NULL) {
991 htmlAutoClose(ctxt, BAD_CAST"p");
992 htmlCheckImplied(ctxt, BAD_CAST"p");
993 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
994 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
995 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
996 return(1);
997 }
998 if (!htmlOmittedDefaultValue)
999 return(0);
1000 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1001 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1002#ifdef DEBUG
1003 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1004#endif
1005 htmlAutoClose(ctxt, BAD_CAST"p");
1006 htmlCheckImplied(ctxt, BAD_CAST"p");
1007 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1008 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1009 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1010 return(1);
1011 }
1012 }
1013 return(0);
1014}
1015
1016/**
1017 * htmlIsScriptAttribute:
1018 * @name: an attribute name
1019 *
1020 * Check if an attribute is of content type Script
1021 *
1022 * Returns 1 is the attribute is a script 0 otherwise
1023 */
1024int
1025htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001026 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001027
1028 if (name == NULL)
1029 return(0);
1030 /*
1031 * all script attributes start with 'on'
1032 */
1033 if ((name[0] != 'o') || (name[1] != 'n'))
1034 return(0);
1035 for (i = 0;
1036 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1037 i++) {
1038 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1039 return(1);
1040 }
1041 return(0);
1042}
1043
1044/************************************************************************
1045 * *
1046 * The list of HTML predefined entities *
1047 * *
1048 ************************************************************************/
1049
1050
/*
 * Table of the predefined HTML 4.0 character entities. Each entry holds
 * the numeric value of the character, the entity name and a short
 * human-readable description.
 *
 * The entries are listed by increasing value: htmlEntityValueLookup()
 * stops scanning at the first entry whose value is greater than the one
 * searched, so any new entry must be inserted at its sorted position.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1333
1334/************************************************************************
1335 * *
1336 * Commodity functions to handle entities *
1337 * *
1338 ************************************************************************/
1339
/*
 * Macro used to grow the current buffer.
 *
 * It expects a variable named <buffer>_size next to @buffer and doubles
 * it before reallocating. The macro must only be expanded in functions
 * returning a pointer: if the reallocation fails, an error is reported,
 * the old block is released (xmlRealloc leaves it allocated on failure,
 * so it would otherwise leak) and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (buffer##_tmp == NULL) {						\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1351
1352/**
1353 * htmlEntityLookup:
1354 * @name: the entity name
1355 *
1356 * Lookup the given entity in EntitiesTable
1357 *
1358 * TODO: the linear scan is really ugly, an hash table is really needed.
1359 *
1360 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1361 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001362const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001363htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001364 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001365
1366 for (i = 0;i < (sizeof(html40EntitiesTable)/
1367 sizeof(html40EntitiesTable[0]));i++) {
1368 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1369#ifdef DEBUG
1370 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1371#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001372 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001373 }
1374 }
1375 return(NULL);
1376}
1377
1378/**
1379 * htmlEntityValueLookup:
1380 * @value: the entity's unicode value
1381 *
1382 * Lookup the given entity in EntitiesTable
1383 *
1384 * TODO: the linear scan is really ugly, an hash table is really needed.
1385 *
1386 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1387 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001388const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001389htmlEntityValueLookup(unsigned int value) {
1390 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001391#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001392 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001393#endif
1394
1395 for (i = 0;i < (sizeof(html40EntitiesTable)/
1396 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001397 if (html40EntitiesTable[i].value >= value) {
1398 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001399 break;
1400#ifdef DEBUG
1401 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1402#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001403 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001404 }
1405#ifdef DEBUG
1406 if (lv > html40EntitiesTable[i].value) {
1407 xmlGenericError(xmlGenericErrorContext,
1408 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1409 lv, html40EntitiesTable[i].value);
1410 }
1411 lv = html40EntitiesTable[i].value;
1412#endif
1413 }
1414 return(NULL);
1415}
1416
1417/**
1418 * UTF8ToHtml:
1419 * @out: a pointer to an array of bytes to store the result
1420 * @outlen: the length of @out
1421 * @in: a pointer to an array of UTF-8 chars
1422 * @inlen: the length of @in
1423 *
1424 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1425 * plus HTML entities block of chars out.
1426 *
1427 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1428 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001429 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001430 * The value of @outlen after return is the number of octets consumed.
1431 */
1432int
1433UTF8ToHtml(unsigned char* out, int *outlen,
1434 const unsigned char* in, int *inlen) {
1435 const unsigned char* processed = in;
1436 const unsigned char* outend;
1437 const unsigned char* outstart = out;
1438 const unsigned char* instart = in;
1439 const unsigned char* inend;
1440 unsigned int c, d;
1441 int trailing;
1442
1443 if (in == NULL) {
1444 /*
1445 * initialization nothing to do
1446 */
1447 *outlen = 0;
1448 *inlen = 0;
1449 return(0);
1450 }
1451 inend = in + (*inlen);
1452 outend = out + (*outlen);
1453 while (in < inend) {
1454 d = *in++;
1455 if (d < 0x80) { c= d; trailing= 0; }
1456 else if (d < 0xC0) {
1457 /* trailing byte in leading position */
1458 *outlen = out - outstart;
1459 *inlen = processed - instart;
1460 return(-2);
1461 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1462 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1463 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1464 else {
1465 /* no chance for this in Ascii */
1466 *outlen = out - outstart;
1467 *inlen = processed - instart;
1468 return(-2);
1469 }
1470
1471 if (inend - in < trailing) {
1472 break;
1473 }
1474
1475 for ( ; trailing; trailing--) {
1476 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1477 break;
1478 c <<= 6;
1479 c |= d & 0x3F;
1480 }
1481
1482 /* assertion: c is a single UTF-4 value */
1483 if (c < 0x80) {
1484 if (out + 1 >= outend)
1485 break;
1486 *out++ = c;
1487 } else {
1488 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001489 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001490
1491 /*
1492 * Try to lookup a predefined HTML entity for it
1493 */
1494
1495 ent = htmlEntityValueLookup(c);
1496 if (ent == NULL) {
1497 /* no chance for this in Ascii */
1498 *outlen = out - outstart;
1499 *inlen = processed - instart;
1500 return(-2);
1501 }
1502 len = strlen(ent->name);
1503 if (out + 2 + len >= outend)
1504 break;
1505 *out++ = '&';
1506 memcpy(out, ent->name, len);
1507 out += len;
1508 *out++ = ';';
1509 }
1510 processed = in;
1511 }
1512 *outlen = out - outstart;
1513 *inlen = processed - instart;
1514 return(0);
1515}
1516
1517/**
1518 * htmlEncodeEntities:
1519 * @out: a pointer to an array of bytes to store the result
1520 * @outlen: the length of @out
1521 * @in: a pointer to an array of UTF-8 chars
1522 * @inlen: the length of @in
1523 * @quoteChar: the quote character to escape (' or ") or zero.
1524 *
1525 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1526 * plus HTML entities block of chars out.
1527 *
1528 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1529 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001530 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001531 * The value of @outlen after return is the number of octets consumed.
1532 */
1533int
1534htmlEncodeEntities(unsigned char* out, int *outlen,
1535 const unsigned char* in, int *inlen, int quoteChar) {
1536 const unsigned char* processed = in;
1537 const unsigned char* outend = out + (*outlen);
1538 const unsigned char* outstart = out;
1539 const unsigned char* instart = in;
1540 const unsigned char* inend = in + (*inlen);
1541 unsigned int c, d;
1542 int trailing;
1543
1544 while (in < inend) {
1545 d = *in++;
1546 if (d < 0x80) { c= d; trailing= 0; }
1547 else if (d < 0xC0) {
1548 /* trailing byte in leading position */
1549 *outlen = out - outstart;
1550 *inlen = processed - instart;
1551 return(-2);
1552 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1553 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1554 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1555 else {
1556 /* no chance for this in Ascii */
1557 *outlen = out - outstart;
1558 *inlen = processed - instart;
1559 return(-2);
1560 }
1561
1562 if (inend - in < trailing)
1563 break;
1564
1565 while (trailing--) {
1566 if (((d= *in++) & 0xC0) != 0x80) {
1567 *outlen = out - outstart;
1568 *inlen = processed - instart;
1569 return(-2);
1570 }
1571 c <<= 6;
1572 c |= d & 0x3F;
1573 }
1574
1575 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001576 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1577 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001578 if (out >= outend)
1579 break;
1580 *out++ = c;
1581 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001582 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001583 const char *cp;
1584 char nbuf[16];
1585 int len;
1586
1587 /*
1588 * Try to lookup a predefined HTML entity for it
1589 */
1590 ent = htmlEntityValueLookup(c);
1591 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001592 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001593 cp = nbuf;
1594 }
1595 else
1596 cp = ent->name;
1597 len = strlen(cp);
1598 if (out + 2 + len > outend)
1599 break;
1600 *out++ = '&';
1601 memcpy(out, cp, len);
1602 out += len;
1603 *out++ = ';';
1604 }
1605 processed = in;
1606 }
1607 *outlen = out - outstart;
1608 *inlen = processed - instart;
1609 return(0);
1610}
1611
1612/**
1613 * htmlDecodeEntities:
1614 * @ctxt: the parser context
1615 * @len: the len to decode (in bytes !), -1 for no size limit
1616 * @end: an end marker xmlChar, 0 if none
1617 * @end2: an end marker xmlChar, 0 if none
1618 * @end3: an end marker xmlChar, 0 if none
1619 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001620 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001621 *
1622 * DEPRECATED !!!!
1623 *
1624 * Returns A newly allocated string with the substitution done. The caller
1625 * must deallocate it !
1626 */
1627xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001628htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1629 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001630 static int deprecated = 0;
1631 if (!deprecated) {
1632 xmlGenericError(xmlGenericErrorContext,
1633 "htmlDecodeEntities() deprecated function reached\n");
1634 deprecated = 1;
1635 }
1636 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001637}
1638
1639/************************************************************************
1640 * *
1641 * Commodity functions to handle streams *
1642 * *
1643 ************************************************************************/
1644
1645/**
Owen Taylor3473f882001-02-23 17:55:21 +00001646 * htmlNewInputStream:
1647 * @ctxt: an HTML parser context
1648 *
1649 * Create a new input stream structure
1650 * Returns the new input stream or NULL
1651 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001652static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001653htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1654 htmlParserInputPtr input;
1655
1656 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1657 if (input == NULL) {
1658 ctxt->errNo = XML_ERR_NO_MEMORY;
1659 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1660 ctxt->sax->error(ctxt->userData,
1661 "malloc: couldn't allocate a new input stream\n");
1662 return(NULL);
1663 }
1664 memset(input, 0, sizeof(htmlParserInput));
1665 input->filename = NULL;
1666 input->directory = NULL;
1667 input->base = NULL;
1668 input->cur = NULL;
1669 input->buf = NULL;
1670 input->line = 1;
1671 input->col = 1;
1672 input->buf = NULL;
1673 input->free = NULL;
1674 input->version = NULL;
1675 input->consumed = 0;
1676 input->length = 0;
1677 return(input);
1678}
1679
1680
1681/************************************************************************
1682 * *
1683 * Commodity functions, cleanup needed ? *
1684 * *
1685 ************************************************************************/
/*
 * All tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but I don't want to risk any
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a",        "abbr",     "acronym",  "address",  "applet",
    "b",        "bdo",      "big",      "blockquote",
    "body",     "button",   "caption",  "center",   "cite",
    "code",     "dd",       "del",      "dfn",      "div",
    "dt",       "em",       "font",     "form",
    "h1",       "h2",       "h3",       "h4",       "h5",       "h6",
    "i",        "iframe",   "ins",      "kbd",      "label",
    "legend",   "li",       "noframes", "noscript", "object",
    "p",        "pre",      "q",        "s",        "samp",
    "small",    "span",     "strike",   "strong",
    "td",       "th",       "tt",       "u",        "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001700
1701/**
1702 * areBlanks:
1703 * @ctxt: an HTML parser context
1704 * @str: a xmlChar *
1705 * @len: the size of @str
1706 *
1707 * Is this a sequence of blank chars that one can ignore ?
1708 *
1709 * Returns 1 if ignorable 0 otherwise.
1710 */
1711
1712static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001713 unsigned int i;
1714 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001715 xmlNodePtr lastChild;
1716
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001717 for (j = 0;j < len;j++)
1718 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001719
1720 if (CUR == 0) return(1);
1721 if (CUR != '<') return(0);
1722 if (ctxt->name == NULL)
1723 return(1);
1724 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1725 return(1);
1726 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1727 return(1);
1728 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1729 return(1);
1730 if (ctxt->node == NULL) return(0);
1731 lastChild = xmlGetLastChild(ctxt->node);
1732 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001733 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1734 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001735 /* keep ws in constructs like ...<b> </b>...
1736 for all tags "b" allowing PCDATA */
1737 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1738 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1739 return(0);
1740 }
1741 }
Owen Taylor3473f882001-02-23 17:55:21 +00001742 } else if (xmlNodeIsText(lastChild)) {
1743 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001744 } else {
1745 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1746 for all tags "p" allowing PCDATA */
1747 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1748 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1749 return(0);
1750 }
1751 }
Owen Taylor3473f882001-02-23 17:55:21 +00001752 }
1753 return(1);
1754}
1755
1756/**
Owen Taylor3473f882001-02-23 17:55:21 +00001757 * htmlNewDocNoDtD:
1758 * @URI: URI for the dtd, or NULL
1759 * @ExternalID: the external ID of the DTD, or NULL
1760 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001761 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1762 * are NULL
1763 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001764 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001765 */
1766htmlDocPtr
1767htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1768 xmlDocPtr cur;
1769
1770 /*
1771 * Allocate a new document and fill the fields.
1772 */
1773 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1774 if (cur == NULL) {
1775 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001776 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001777 return(NULL);
1778 }
1779 memset(cur, 0, sizeof(xmlDoc));
1780
1781 cur->type = XML_HTML_DOCUMENT_NODE;
1782 cur->version = NULL;
1783 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001784 cur->doc = cur;
1785 cur->name = NULL;
1786 cur->children = NULL;
1787 cur->extSubset = NULL;
1788 cur->oldNs = NULL;
1789 cur->encoding = NULL;
1790 cur->standalone = 1;
1791 cur->compression = 0;
1792 cur->ids = NULL;
1793 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001794 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001795 if ((ExternalID != NULL) ||
1796 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001797 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 return(cur);
1799}
1800
1801/**
1802 * htmlNewDoc:
1803 * @URI: URI for the dtd, or NULL
1804 * @ExternalID: the external ID of the DTD, or NULL
1805 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001806 * Creates a new HTML document
1807 *
Owen Taylor3473f882001-02-23 17:55:21 +00001808 * Returns a new document
1809 */
1810htmlDocPtr
1811htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1812 if ((URI == NULL) && (ExternalID == NULL))
1813 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001814 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1815 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001816
1817 return(htmlNewDocNoDtD(URI, ExternalID));
1818}
1819
1820
1821/************************************************************************
1822 * *
1823 * The parser itself *
1824 * Relates to http://www.w3.org/TR/html40 *
1825 * *
1826 ************************************************************************/
1827
1828/************************************************************************
1829 * *
1830 * The parser itself *
1831 * *
1832 ************************************************************************/
1833
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001834static xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
1835
Owen Taylor3473f882001-02-23 17:55:21 +00001836/**
1837 * htmlParseHTMLName:
1838 * @ctxt: an HTML parser context
1839 *
1840 * parse an HTML tag or attribute name, note that we convert it to lowercase
1841 * since HTML names are not case-sensitive.
1842 *
1843 * Returns the Tag Name parsed or NULL
1844 */
1845
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001846static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001847htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1848 xmlChar *ret = NULL;
1849 int i = 0;
1850 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1851
1852 if (!IS_LETTER(CUR) && (CUR != '_') &&
1853 (CUR != ':')) return(NULL);
1854
1855 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1856 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1857 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1858 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1859 else loc[i] = CUR;
1860 i++;
1861
1862 NEXT;
1863 }
1864
1865 ret = xmlStrndup(loc, i);
1866
1867 return(ret);
1868}
1869
1870/**
1871 * htmlParseName:
1872 * @ctxt: an HTML parser context
1873 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001874 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001875 *
1876 * Returns the Name parsed or NULL
1877 */
1878
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001879static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001880htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001881 const xmlChar *in;
1882 xmlChar *ret;
1883 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001884
1885 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001886
1887 /*
1888 * Accelerator for simple ASCII names
1889 */
1890 in = ctxt->input->cur;
1891 if (((*in >= 0x61) && (*in <= 0x7A)) ||
1892 ((*in >= 0x41) && (*in <= 0x5A)) ||
1893 (*in == '_') || (*in == ':')) {
1894 in++;
1895 while (((*in >= 0x61) && (*in <= 0x7A)) ||
1896 ((*in >= 0x41) && (*in <= 0x5A)) ||
1897 ((*in >= 0x30) && (*in <= 0x39)) ||
1898 (*in == '_') || (*in == '-') ||
1899 (*in == ':') || (*in == '.'))
1900 in++;
1901 if ((*in > 0) && (*in < 0x80)) {
1902 count = in - ctxt->input->cur;
1903 ret = xmlStrndup(ctxt->input->cur, count);
1904 ctxt->input->cur = in;
1905 return(ret);
1906 }
1907 }
1908 return(htmlParseNameComplex(ctxt));
1909}
1910
1911static xmlChar *
1912htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
1913 xmlChar buf[XML_MAX_NAMELEN + 5];
1914 int len = 0, l;
1915 int c;
1916 int count = 0;
1917
1918 /*
1919 * Handler for more complex cases
1920 */
1921 GROW;
1922 c = CUR_CHAR(l);
1923 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
1924 (!IS_LETTER(c) && (c != '_') &&
1925 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00001926 return(NULL);
1927 }
1928
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001929 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
1930 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
1931 (c == '.') || (c == '-') ||
1932 (c == '_') || (c == ':') ||
1933 (IS_COMBINING(c)) ||
1934 (IS_EXTENDER(c)))) {
1935 if (count++ > 100) {
1936 count = 0;
1937 GROW;
1938 }
1939 COPY_BUF(l,buf,len,c);
1940 NEXTL(l);
1941 c = CUR_CHAR(l);
1942 if (len >= XML_MAX_NAMELEN) {
1943 /*
1944 * Okay someone managed to make a huge name, so he's ready to pay
1945 * for the processing speed.
1946 */
1947 xmlChar *buffer;
1948 int max = len * 2;
1949
1950 buffer = (xmlChar *) xmlMalloc(max * sizeof(xmlChar));
1951 if (buffer == NULL) {
1952 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1953 ctxt->sax->error(ctxt->userData,
1954 "htmlParseNameComplex: out of memory\n");
1955 return(NULL);
1956 }
1957 memcpy(buffer, buf, len);
1958 while ((IS_LETTER(c)) || (IS_DIGIT(c)) || /* test bigname.xml */
1959 (c == '.') || (c == '-') ||
1960 (c == '_') || (c == ':') ||
1961 (IS_COMBINING(c)) ||
1962 (IS_EXTENDER(c))) {
1963 if (count++ > 100) {
1964 count = 0;
1965 GROW;
1966 }
1967 if (len + 10 > max) {
1968 max *= 2;
1969 buffer = (xmlChar *) xmlRealloc(buffer,
1970 max * sizeof(xmlChar));
1971 if (buffer == NULL) {
1972 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1973 ctxt->sax->error(ctxt->userData,
1974 "htmlParseNameComplex: out of memory\n");
1975 return(NULL);
1976 }
1977 }
1978 COPY_BUF(l,buffer,len,c);
1979 NEXTL(l);
1980 c = CUR_CHAR(l);
1981 }
1982 buffer[len] = 0;
1983 return(buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00001984 }
1985 }
1986 return(xmlStrndup(buf, len));
1987}
1988
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001989
Owen Taylor3473f882001-02-23 17:55:21 +00001990/**
1991 * htmlParseHTMLAttribute:
1992 * @ctxt: an HTML parser context
1993 * @stop: a char stop value
1994 *
1995 * parse an HTML attribute value till the stop (quote), if
1996 * stop is 0 then it stops at the first space
1997 *
1998 * Returns the attribute parsed or NULL
1999 */
2000
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002001static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002002htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2003 xmlChar *buffer = NULL;
2004 int buffer_size = 0;
2005 xmlChar *out = NULL;
2006 xmlChar *name = NULL;
2007
2008 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002009 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002010
2011 /*
2012 * allocate a translation buffer.
2013 */
2014 buffer_size = HTML_PARSER_BUFFER_SIZE;
2015 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2016 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002017 xmlGenericError(xmlGenericErrorContext,
2018 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002019 return(NULL);
2020 }
2021 out = buffer;
2022
2023 /*
2024 * Ok loop until we reach one of the ending chars
2025 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002026 while ((CUR != 0) && (CUR != stop)) {
2027 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002028 if ((stop == 0) && (IS_BLANK(CUR))) break;
2029 if (CUR == '&') {
2030 if (NXT(1) == '#') {
2031 unsigned int c;
2032 int bits;
2033
2034 c = htmlParseCharRef(ctxt);
2035 if (c < 0x80)
2036 { *out++ = c; bits= -6; }
2037 else if (c < 0x800)
2038 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2039 else if (c < 0x10000)
2040 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2041 else
2042 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2043
2044 for ( ; bits >= 0; bits-= 6) {
2045 *out++ = ((c >> bits) & 0x3F) | 0x80;
2046 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002047
2048 if (out - buffer > buffer_size - 100) {
2049 int indx = out - buffer;
2050
2051 growBuffer(buffer);
2052 out = &buffer[indx];
2053 }
Owen Taylor3473f882001-02-23 17:55:21 +00002054 } else {
2055 ent = htmlParseEntityRef(ctxt, &name);
2056 if (name == NULL) {
2057 *out++ = '&';
2058 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002059 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002060
2061 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002062 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002063 }
2064 } else if (ent == NULL) {
2065 *out++ = '&';
2066 cur = name;
2067 while (*cur != 0) {
2068 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002069 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002070
2071 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002072 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002073 }
2074 *out++ = *cur++;
2075 }
2076 xmlFree(name);
2077 } else {
2078 unsigned int c;
2079 int bits;
2080
2081 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002082 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002083
2084 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002085 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002086 }
2087 c = (xmlChar)ent->value;
2088 if (c < 0x80)
2089 { *out++ = c; bits= -6; }
2090 else if (c < 0x800)
2091 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2092 else if (c < 0x10000)
2093 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2094 else
2095 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2096
2097 for ( ; bits >= 0; bits-= 6) {
2098 *out++ = ((c >> bits) & 0x3F) | 0x80;
2099 }
2100 xmlFree(name);
2101 }
2102 }
2103 } else {
2104 unsigned int c;
2105 int bits, l;
2106
2107 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002108 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002109
2110 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002111 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002112 }
2113 c = CUR_CHAR(l);
2114 if (c < 0x80)
2115 { *out++ = c; bits= -6; }
2116 else if (c < 0x800)
2117 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2118 else if (c < 0x10000)
2119 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2120 else
2121 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2122
2123 for ( ; bits >= 0; bits-= 6) {
2124 *out++ = ((c >> bits) & 0x3F) | 0x80;
2125 }
2126 NEXT;
2127 }
2128 }
2129 *out++ = 0;
2130 return(buffer);
2131}
2132
2133/**
Owen Taylor3473f882001-02-23 17:55:21 +00002134 * htmlParseEntityRef:
2135 * @ctxt: an HTML parser context
2136 * @str: location to store the entity name
2137 *
2138 * parse an HTML ENTITY references
2139 *
2140 * [68] EntityRef ::= '&' Name ';'
2141 *
2142 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2143 * if non-NULL *str will have to be freed by the caller.
2144 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002145const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002146htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2147 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002148 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002149 *str = NULL;
2150
2151 if (CUR == '&') {
2152 NEXT;
2153 name = htmlParseName(ctxt);
2154 if (name == NULL) {
2155 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2156 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2157 ctxt->wellFormed = 0;
2158 } else {
2159 GROW;
2160 if (CUR == ';') {
2161 *str = name;
2162
2163 /*
2164 * Lookup the entity in the table.
2165 */
2166 ent = htmlEntityLookup(name);
2167 if (ent != NULL) /* OK that's ugly !!! */
2168 NEXT;
2169 } else {
2170 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2171 ctxt->sax->error(ctxt->userData,
2172 "htmlParseEntityRef: expecting ';'\n");
2173 *str = name;
2174 }
2175 }
2176 }
2177 return(ent);
2178}
2179
2180/**
2181 * htmlParseAttValue:
2182 * @ctxt: an HTML parser context
2183 *
2184 * parse a value for an attribute
2185 * Note: the parser won't do substitution of entities here, this
2186 * will be handled later in xmlStringGetNodeList, unless it was
2187 * asked for ctxt->replaceEntities != 0
2188 *
2189 * Returns the AttValue parsed or NULL.
2190 */
2191
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002192static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002193htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2194 xmlChar *ret = NULL;
2195
2196 if (CUR == '"') {
2197 NEXT;
2198 ret = htmlParseHTMLAttribute(ctxt, '"');
2199 if (CUR != '"') {
2200 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2201 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2202 ctxt->wellFormed = 0;
2203 } else
2204 NEXT;
2205 } else if (CUR == '\'') {
2206 NEXT;
2207 ret = htmlParseHTMLAttribute(ctxt, '\'');
2208 if (CUR != '\'') {
2209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2210 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2211 ctxt->wellFormed = 0;
2212 } else
2213 NEXT;
2214 } else {
2215 /*
2216 * That's an HTMLism, the attribute value may not be quoted
2217 */
2218 ret = htmlParseHTMLAttribute(ctxt, 0);
2219 if (ret == NULL) {
2220 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2221 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2222 ctxt->wellFormed = 0;
2223 }
2224 }
2225 return(ret);
2226}
2227
2228/**
2229 * htmlParseSystemLiteral:
2230 * @ctxt: an HTML parser context
2231 *
2232 * parse an HTML Literal
2233 *
2234 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2235 *
2236 * Returns the SystemLiteral parsed or NULL
2237 */
2238
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002239static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002240htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2241 const xmlChar *q;
2242 xmlChar *ret = NULL;
2243
2244 if (CUR == '"') {
2245 NEXT;
2246 q = CUR_PTR;
2247 while ((IS_CHAR(CUR)) && (CUR != '"'))
2248 NEXT;
2249 if (!IS_CHAR(CUR)) {
2250 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2251 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2252 ctxt->wellFormed = 0;
2253 } else {
2254 ret = xmlStrndup(q, CUR_PTR - q);
2255 NEXT;
2256 }
2257 } else if (CUR == '\'') {
2258 NEXT;
2259 q = CUR_PTR;
2260 while ((IS_CHAR(CUR)) && (CUR != '\''))
2261 NEXT;
2262 if (!IS_CHAR(CUR)) {
2263 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2264 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2265 ctxt->wellFormed = 0;
2266 } else {
2267 ret = xmlStrndup(q, CUR_PTR - q);
2268 NEXT;
2269 }
2270 } else {
2271 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2272 ctxt->sax->error(ctxt->userData,
2273 "SystemLiteral \" or ' expected\n");
2274 ctxt->wellFormed = 0;
2275 }
2276
2277 return(ret);
2278}
2279
2280/**
2281 * htmlParsePubidLiteral:
2282 * @ctxt: an HTML parser context
2283 *
2284 * parse an HTML public literal
2285 *
2286 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2287 *
2288 * Returns the PubidLiteral parsed or NULL.
2289 */
2290
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002291static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002292htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2293 const xmlChar *q;
2294 xmlChar *ret = NULL;
2295 /*
2296 * Name ::= (Letter | '_') (NameChar)*
2297 */
2298 if (CUR == '"') {
2299 NEXT;
2300 q = CUR_PTR;
2301 while (IS_PUBIDCHAR(CUR)) NEXT;
2302 if (CUR != '"') {
2303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2304 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2305 ctxt->wellFormed = 0;
2306 } else {
2307 ret = xmlStrndup(q, CUR_PTR - q);
2308 NEXT;
2309 }
2310 } else if (CUR == '\'') {
2311 NEXT;
2312 q = CUR_PTR;
2313 while ((IS_LETTER(CUR)) && (CUR != '\''))
2314 NEXT;
2315 if (!IS_LETTER(CUR)) {
2316 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2317 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2318 ctxt->wellFormed = 0;
2319 } else {
2320 ret = xmlStrndup(q, CUR_PTR - q);
2321 NEXT;
2322 }
2323 } else {
2324 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2325 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2326 ctxt->wellFormed = 0;
2327 }
2328
2329 return(ret);
2330}
2331
2332/**
2333 * htmlParseScript:
2334 * @ctxt: an HTML parser context
2335 *
2336 * parse the content of an HTML SCRIPT or STYLE element
2337 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2338 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2339 * http://www.w3.org/TR/html4/types.html#type-script
2340 * http://www.w3.org/TR/html4/types.html#h-6.15
2341 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2342 *
2343 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2344 * element and the value of intrinsic event attributes. User agents must
2345 * not evaluate script data as HTML markup but instead must pass it on as
2346 * data to a script engine.
2347 * NOTES:
2348 * - The content is passed like CDATA
2349 * - the attributes for style and scripting "onXXX" are also described
2350 * as CDATA but SGML allows entities references in attributes so their
2351 * processing is identical as other attributes
2352 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353static void
Owen Taylor3473f882001-02-23 17:55:21 +00002354htmlParseScript(htmlParserCtxtPtr ctxt) {
2355 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2356 int nbchar = 0;
2357 xmlChar cur;
2358
2359 SHRINK;
2360 cur = CUR;
2361 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002362 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2363 (NXT(3) == '-')) {
2364 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2365 if (ctxt->sax->cdataBlock!= NULL) {
2366 /*
2367 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2368 */
2369 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2370 }
2371 }
2372 nbchar = 0;
2373 htmlParseComment(ctxt);
2374 cur = CUR;
2375 continue;
2376 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002377 /*
2378 * One should break here, the specification is clear:
2379 * Authors should therefore escape "</" within the content.
2380 * Escape mechanisms are specific to each scripting or
2381 * style sheet language.
2382 */
2383 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2384 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2385 break; /* while */
2386 }
2387 buf[nbchar++] = cur;
2388 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2389 if (ctxt->sax->cdataBlock!= NULL) {
2390 /*
2391 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2392 */
2393 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2394 }
2395 nbchar = 0;
2396 }
2397 NEXT;
2398 cur = CUR;
2399 }
2400 if (!(IS_CHAR(cur))) {
2401 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2402 ctxt->sax->error(ctxt->userData,
2403 "Invalid char in CDATA 0x%X\n", cur);
2404 ctxt->wellFormed = 0;
2405 NEXT;
2406 }
2407
2408 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2409 if (ctxt->sax->cdataBlock!= NULL) {
2410 /*
2411 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2412 */
2413 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2414 }
2415 }
2416}
2417
2418
2419/**
2420 * htmlParseCharData:
2421 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002422 *
2423 * parse a CharData section.
2424 * if we are within a CDATA section ']]>' marks an end of section.
2425 *
2426 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2427 */
2428
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002429static void
2430htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002431 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2432 int nbchar = 0;
2433 int cur, l;
2434
2435 SHRINK;
2436 cur = CUR_CHAR(l);
2437 while (((cur != '<') || (ctxt->token == '<')) &&
2438 ((cur != '&') || (ctxt->token == '&')) &&
2439 (IS_CHAR(cur))) {
2440 COPY_BUF(l,buf,nbchar,cur);
2441 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2442 /*
2443 * Ok the segment is to be consumed as chars.
2444 */
2445 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2446 if (areBlanks(ctxt, buf, nbchar)) {
2447 if (ctxt->sax->ignorableWhitespace != NULL)
2448 ctxt->sax->ignorableWhitespace(ctxt->userData,
2449 buf, nbchar);
2450 } else {
2451 htmlCheckParagraph(ctxt);
2452 if (ctxt->sax->characters != NULL)
2453 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2454 }
2455 }
2456 nbchar = 0;
2457 }
2458 NEXTL(l);
2459 cur = CUR_CHAR(l);
2460 }
2461 if (nbchar != 0) {
2462 /*
2463 * Ok the segment is to be consumed as chars.
2464 */
2465 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2466 if (areBlanks(ctxt, buf, nbchar)) {
2467 if (ctxt->sax->ignorableWhitespace != NULL)
2468 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2469 } else {
2470 htmlCheckParagraph(ctxt);
2471 if (ctxt->sax->characters != NULL)
2472 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2473 }
2474 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002475 } else {
2476 /*
2477 * Loop detection
2478 */
2479 if (cur == 0)
2480 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002481 }
2482}
2483
2484/**
2485 * htmlParseExternalID:
2486 * @ctxt: an HTML parser context
2487 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002488 *
2489 * Parse an External ID or a Public ID
2490 *
Owen Taylor3473f882001-02-23 17:55:21 +00002491 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2492 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2493 *
2494 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2495 *
2496 * Returns the function returns SystemLiteral and in the second
2497 * case publicID receives PubidLiteral, is strict is off
2498 * it is possible to return NULL and have publicID set.
2499 */
2500
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002501static xmlChar *
2502htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002503 xmlChar *URI = NULL;
2504
2505 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2506 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2507 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2508 SKIP(6);
2509 if (!IS_BLANK(CUR)) {
2510 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2511 ctxt->sax->error(ctxt->userData,
2512 "Space required after 'SYSTEM'\n");
2513 ctxt->wellFormed = 0;
2514 }
2515 SKIP_BLANKS;
2516 URI = htmlParseSystemLiteral(ctxt);
2517 if (URI == NULL) {
2518 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2519 ctxt->sax->error(ctxt->userData,
2520 "htmlParseExternalID: SYSTEM, no URI\n");
2521 ctxt->wellFormed = 0;
2522 }
2523 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2524 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2525 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2526 SKIP(6);
2527 if (!IS_BLANK(CUR)) {
2528 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2529 ctxt->sax->error(ctxt->userData,
2530 "Space required after 'PUBLIC'\n");
2531 ctxt->wellFormed = 0;
2532 }
2533 SKIP_BLANKS;
2534 *publicID = htmlParsePubidLiteral(ctxt);
2535 if (*publicID == NULL) {
2536 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2537 ctxt->sax->error(ctxt->userData,
2538 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2539 ctxt->wellFormed = 0;
2540 }
2541 SKIP_BLANKS;
2542 if ((CUR == '"') || (CUR == '\'')) {
2543 URI = htmlParseSystemLiteral(ctxt);
2544 }
2545 }
2546 return(URI);
2547}
2548
2549/**
2550 * htmlParseComment:
2551 * @ctxt: an HTML parser context
2552 *
2553 * Parse an XML (SGML) comment <!-- .... -->
2554 *
2555 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2556 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002557static void
Owen Taylor3473f882001-02-23 17:55:21 +00002558htmlParseComment(htmlParserCtxtPtr ctxt) {
2559 xmlChar *buf = NULL;
2560 int len;
2561 int size = HTML_PARSER_BUFFER_SIZE;
2562 int q, ql;
2563 int r, rl;
2564 int cur, l;
2565 xmlParserInputState state;
2566
2567 /*
2568 * Check that there is a comment right here.
2569 */
2570 if ((RAW != '<') || (NXT(1) != '!') ||
2571 (NXT(2) != '-') || (NXT(3) != '-')) return;
2572
2573 state = ctxt->instate;
2574 ctxt->instate = XML_PARSER_COMMENT;
2575 SHRINK;
2576 SKIP(4);
2577 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2578 if (buf == NULL) {
2579 xmlGenericError(xmlGenericErrorContext,
2580 "malloc of %d byte failed\n", size);
2581 ctxt->instate = state;
2582 return;
2583 }
2584 q = CUR_CHAR(ql);
2585 NEXTL(ql);
2586 r = CUR_CHAR(rl);
2587 NEXTL(rl);
2588 cur = CUR_CHAR(l);
2589 len = 0;
2590 while (IS_CHAR(cur) &&
2591 ((cur != '>') ||
2592 (r != '-') || (q != '-'))) {
2593 if (len + 5 >= size) {
2594 size *= 2;
2595 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2596 if (buf == NULL) {
2597 xmlGenericError(xmlGenericErrorContext,
2598 "realloc of %d byte failed\n", size);
2599 ctxt->instate = state;
2600 return;
2601 }
2602 }
2603 COPY_BUF(ql,buf,len,q);
2604 q = r;
2605 ql = rl;
2606 r = cur;
2607 rl = l;
2608 NEXTL(l);
2609 cur = CUR_CHAR(l);
2610 if (cur == 0) {
2611 SHRINK;
2612 GROW;
2613 cur = CUR_CHAR(l);
2614 }
2615 }
2616 buf[len] = 0;
2617 if (!IS_CHAR(cur)) {
2618 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2619 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2620 ctxt->sax->error(ctxt->userData,
2621 "Comment not terminated \n<!--%.50s\n", buf);
2622 ctxt->wellFormed = 0;
2623 xmlFree(buf);
2624 } else {
2625 NEXT;
2626 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2627 (!ctxt->disableSAX))
2628 ctxt->sax->comment(ctxt->userData, buf);
2629 xmlFree(buf);
2630 }
2631 ctxt->instate = state;
2632}
2633
2634/**
2635 * htmlParseCharRef:
2636 * @ctxt: an HTML parser context
2637 *
2638 * parse Reference declarations
2639 *
2640 * [66] CharRef ::= '&#' [0-9]+ ';' |
2641 * '&#x' [0-9a-fA-F]+ ';'
2642 *
2643 * Returns the value parsed (as an int)
2644 */
2645int
2646htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2647 int val = 0;
2648
2649 if ((CUR == '&') && (NXT(1) == '#') &&
2650 (NXT(2) == 'x')) {
2651 SKIP(3);
2652 while (CUR != ';') {
2653 if ((CUR >= '0') && (CUR <= '9'))
2654 val = val * 16 + (CUR - '0');
2655 else if ((CUR >= 'a') && (CUR <= 'f'))
2656 val = val * 16 + (CUR - 'a') + 10;
2657 else if ((CUR >= 'A') && (CUR <= 'F'))
2658 val = val * 16 + (CUR - 'A') + 10;
2659 else {
2660 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2661 ctxt->sax->error(ctxt->userData,
2662 "htmlParseCharRef: invalid hexadecimal value\n");
2663 ctxt->wellFormed = 0;
2664 return(0);
2665 }
2666 NEXT;
2667 }
2668 if (CUR == ';')
2669 NEXT;
2670 } else if ((CUR == '&') && (NXT(1) == '#')) {
2671 SKIP(2);
2672 while (CUR != ';') {
2673 if ((CUR >= '0') && (CUR <= '9'))
2674 val = val * 10 + (CUR - '0');
2675 else {
2676 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2677 ctxt->sax->error(ctxt->userData,
2678 "htmlParseCharRef: invalid decimal value\n");
2679 ctxt->wellFormed = 0;
2680 return(0);
2681 }
2682 NEXT;
2683 }
2684 if (CUR == ';')
2685 NEXT;
2686 } else {
2687 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2688 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2689 ctxt->wellFormed = 0;
2690 }
2691 /*
2692 * Check the value IS_CHAR ...
2693 */
2694 if (IS_CHAR(val)) {
2695 return(val);
2696 } else {
2697 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2698 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2699 val);
2700 ctxt->wellFormed = 0;
2701 }
2702 return(0);
2703}
2704
2705
2706/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002707 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002708 * @ctxt: an HTML parser context
2709 *
2710 * parse a DOCTYPE declaration
2711 *
2712 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2713 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2714 */
2715
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002716static void
Owen Taylor3473f882001-02-23 17:55:21 +00002717htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2718 xmlChar *name;
2719 xmlChar *ExternalID = NULL;
2720 xmlChar *URI = NULL;
2721
2722 /*
2723 * We know that '<!DOCTYPE' has been detected.
2724 */
2725 SKIP(9);
2726
2727 SKIP_BLANKS;
2728
2729 /*
2730 * Parse the DOCTYPE name.
2731 */
2732 name = htmlParseName(ctxt);
2733 if (name == NULL) {
2734 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2735 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2736 ctxt->wellFormed = 0;
2737 }
2738 /*
2739 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2740 */
2741
2742 SKIP_BLANKS;
2743
2744 /*
2745 * Check for SystemID and ExternalID
2746 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002747 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002748 SKIP_BLANKS;
2749
2750 /*
2751 * We should be at the end of the DOCTYPE declaration.
2752 */
2753 if (CUR != '>') {
2754 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002755 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002756 ctxt->wellFormed = 0;
2757 /* We shouldn't try to resynchronize ... */
2758 }
2759 NEXT;
2760
2761 /*
2762 * Create or update the document accordingly to the DOCTYPE
2763 */
2764 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2765 (!ctxt->disableSAX))
2766 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2767
2768 /*
2769 * Cleanup, since we don't use all those identifiers
2770 */
2771 if (URI != NULL) xmlFree(URI);
2772 if (ExternalID != NULL) xmlFree(ExternalID);
2773 if (name != NULL) xmlFree(name);
2774}
2775
2776/**
2777 * htmlParseAttribute:
2778 * @ctxt: an HTML parser context
2779 * @value: a xmlChar ** used to store the value of the attribute
2780 *
2781 * parse an attribute
2782 *
2783 * [41] Attribute ::= Name Eq AttValue
2784 *
2785 * [25] Eq ::= S? '=' S?
2786 *
2787 * With namespace:
2788 *
2789 * [NS 11] Attribute ::= QName Eq AttValue
2790 *
2791 * Also the case QName == xmlns:??? is handled independently as a namespace
2792 * definition.
2793 *
2794 * Returns the attribute name, and the value in *value.
2795 */
2796
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002797static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002798htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2799 xmlChar *name, *val = NULL;
2800
2801 *value = NULL;
2802 name = htmlParseHTMLName(ctxt);
2803 if (name == NULL) {
2804 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2805 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2806 ctxt->wellFormed = 0;
2807 return(NULL);
2808 }
2809
2810 /*
2811 * read the value
2812 */
2813 SKIP_BLANKS;
2814 if (CUR == '=') {
2815 NEXT;
2816 SKIP_BLANKS;
2817 val = htmlParseAttValue(ctxt);
2818 /******
2819 } else {
2820 * TODO : some attribute must have values, some may not
2821 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2822 ctxt->sax->warning(ctxt->userData,
2823 "No value for attribute %s\n", name); */
2824 }
2825
2826 *value = val;
2827 return(name);
2828}
2829
2830/**
2831 * htmlCheckEncoding:
2832 * @ctxt: an HTML parser context
2833 * @attvalue: the attribute value
2834 *
2835 * Checks an http-equiv attribute from a Meta tag to detect
2836 * the encoding
2837 * If a new encoding is detected the parser is switched to decode
2838 * it and pass UTF8
2839 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002840static void
Owen Taylor3473f882001-02-23 17:55:21 +00002841htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2842 const xmlChar *encoding;
2843
2844 if ((ctxt == NULL) || (attvalue == NULL))
2845 return;
2846
2847 /* do not change encoding */
2848 if (ctxt->input->encoding != NULL)
2849 return;
2850
2851 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2852 if (encoding != NULL) {
2853 encoding += 8;
2854 } else {
2855 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2856 if (encoding != NULL)
2857 encoding += 9;
2858 }
2859 if (encoding != NULL) {
2860 xmlCharEncoding enc;
2861 xmlCharEncodingHandlerPtr handler;
2862
2863 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2864
2865 if (ctxt->input->encoding != NULL)
2866 xmlFree((xmlChar *) ctxt->input->encoding);
2867 ctxt->input->encoding = xmlStrdup(encoding);
2868
2869 enc = xmlParseCharEncoding((const char *) encoding);
2870 /*
2871 * registered set of known encodings
2872 */
2873 if (enc != XML_CHAR_ENCODING_ERROR) {
2874 xmlSwitchEncoding(ctxt, enc);
2875 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2876 } else {
2877 /*
2878 * fallback for unknown encodings
2879 */
2880 handler = xmlFindCharEncodingHandler((const char *) encoding);
2881 if (handler != NULL) {
2882 xmlSwitchToEncoding(ctxt, handler);
2883 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2884 } else {
2885 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2886 }
2887 }
2888
2889 if ((ctxt->input->buf != NULL) &&
2890 (ctxt->input->buf->encoder != NULL) &&
2891 (ctxt->input->buf->raw != NULL) &&
2892 (ctxt->input->buf->buffer != NULL)) {
2893 int nbchars;
2894 int processed;
2895
2896 /*
2897 * convert as much as possible to the parser reading buffer.
2898 */
2899 processed = ctxt->input->cur - ctxt->input->base;
2900 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2901 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2902 ctxt->input->buf->buffer,
2903 ctxt->input->buf->raw);
2904 if (nbchars < 0) {
2905 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2906 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2907 ctxt->sax->error(ctxt->userData,
2908 "htmlCheckEncoding: encoder error\n");
2909 }
2910 ctxt->input->base =
2911 ctxt->input->cur = ctxt->input->buf->buffer->content;
2912 }
2913 }
2914}
2915
2916/**
2917 * htmlCheckMeta:
2918 * @ctxt: an HTML parser context
2919 * @atts: the attributes values
2920 *
2921 * Checks an attributes from a Meta tag
2922 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002923static void
Owen Taylor3473f882001-02-23 17:55:21 +00002924htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2925 int i;
2926 const xmlChar *att, *value;
2927 int http = 0;
2928 const xmlChar *content = NULL;
2929
2930 if ((ctxt == NULL) || (atts == NULL))
2931 return;
2932
2933 i = 0;
2934 att = atts[i++];
2935 while (att != NULL) {
2936 value = atts[i++];
2937 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2938 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2939 http = 1;
2940 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2941 content = value;
2942 att = atts[i++];
2943 }
2944 if ((http) && (content != NULL))
2945 htmlCheckEncoding(ctxt, content);
2946
2947}
2948
2949/**
2950 * htmlParseStartTag:
2951 * @ctxt: an HTML parser context
2952 *
2953 * parse a start of tag either for rule element or
2954 * EmptyElement. In both case we don't parse the tag closing chars.
2955 *
2956 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2957 *
2958 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2959 *
2960 * With namespace:
2961 *
2962 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2963 *
2964 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2965 *
2966 */
2967
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002968static void
Owen Taylor3473f882001-02-23 17:55:21 +00002969htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2970 xmlChar *name;
2971 xmlChar *attname;
2972 xmlChar *attvalue;
2973 const xmlChar **atts = NULL;
2974 int nbatts = 0;
2975 int maxatts = 0;
2976 int meta = 0;
2977 int i;
2978
2979 if (CUR != '<') return;
2980 NEXT;
2981
2982 GROW;
2983 name = htmlParseHTMLName(ctxt);
2984 if (name == NULL) {
2985 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2986 ctxt->sax->error(ctxt->userData,
2987 "htmlParseStartTag: invalid element name\n");
2988 ctxt->wellFormed = 0;
2989 /* Dump the bogus tag like browsers do */
2990 while ((IS_CHAR(CUR)) && (CUR != '>'))
2991 NEXT;
2992 return;
2993 }
2994 if (xmlStrEqual(name, BAD_CAST"meta"))
2995 meta = 1;
2996
2997 /*
2998 * Check for auto-closure of HTML elements.
2999 */
3000 htmlAutoClose(ctxt, name);
3001
3002 /*
3003 * Check for implied HTML elements.
3004 */
3005 htmlCheckImplied(ctxt, name);
3006
3007 /*
3008 * Avoid html at any level > 0, head at any level != 1
3009 * or any attempt to recurse body
3010 */
3011 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3012 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3013 ctxt->sax->error(ctxt->userData,
3014 "htmlParseStartTag: misplaced <html> tag\n");
3015 ctxt->wellFormed = 0;
3016 xmlFree(name);
3017 return;
3018 }
3019 if ((ctxt->nameNr != 1) &&
3020 (xmlStrEqual(name, BAD_CAST"head"))) {
3021 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3022 ctxt->sax->error(ctxt->userData,
3023 "htmlParseStartTag: misplaced <head> tag\n");
3024 ctxt->wellFormed = 0;
3025 xmlFree(name);
3026 return;
3027 }
3028 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003029 int indx;
3030 for (indx = 0;indx < ctxt->nameNr;indx++) {
3031 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003032 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3033 ctxt->sax->error(ctxt->userData,
3034 "htmlParseStartTag: misplaced <body> tag\n");
3035 ctxt->wellFormed = 0;
3036 xmlFree(name);
3037 return;
3038 }
3039 }
3040 }
3041
3042 /*
3043 * Now parse the attributes, it ends up with the ending
3044 *
3045 * (S Attribute)* S?
3046 */
3047 SKIP_BLANKS;
3048 while ((IS_CHAR(CUR)) &&
3049 (CUR != '>') &&
3050 ((CUR != '/') || (NXT(1) != '>'))) {
3051 long cons = ctxt->nbChars;
3052
3053 GROW;
3054 attname = htmlParseAttribute(ctxt, &attvalue);
3055 if (attname != NULL) {
3056
3057 /*
3058 * Well formedness requires at most one declaration of an attribute
3059 */
3060 for (i = 0; i < nbatts;i += 2) {
3061 if (xmlStrEqual(atts[i], attname)) {
3062 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3063 ctxt->sax->error(ctxt->userData,
3064 "Attribute %s redefined\n",
3065 attname);
3066 ctxt->wellFormed = 0;
3067 xmlFree(attname);
3068 if (attvalue != NULL)
3069 xmlFree(attvalue);
3070 goto failed;
3071 }
3072 }
3073
3074 /*
3075 * Add the pair to atts
3076 */
3077 if (atts == NULL) {
3078 maxatts = 10;
3079 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3080 if (atts == NULL) {
3081 xmlGenericError(xmlGenericErrorContext,
3082 "malloc of %ld byte failed\n",
3083 maxatts * (long)sizeof(xmlChar *));
3084 if (name != NULL) xmlFree(name);
3085 return;
3086 }
3087 } else if (nbatts + 4 > maxatts) {
3088 maxatts *= 2;
3089 atts = (const xmlChar **) xmlRealloc((void *) atts,
3090 maxatts * sizeof(xmlChar *));
3091 if (atts == NULL) {
3092 xmlGenericError(xmlGenericErrorContext,
3093 "realloc of %ld byte failed\n",
3094 maxatts * (long)sizeof(xmlChar *));
3095 if (name != NULL) xmlFree(name);
3096 return;
3097 }
3098 }
3099 atts[nbatts++] = attname;
3100 atts[nbatts++] = attvalue;
3101 atts[nbatts] = NULL;
3102 atts[nbatts + 1] = NULL;
3103 }
3104 else {
3105 /* Dump the bogus attribute string up to the next blank or
3106 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003107 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3108 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003109 NEXT;
3110 }
3111
3112failed:
3113 SKIP_BLANKS;
3114 if (cons == ctxt->nbChars) {
3115 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3116 ctxt->sax->error(ctxt->userData,
3117 "htmlParseStartTag: problem parsing attributes\n");
3118 ctxt->wellFormed = 0;
3119 break;
3120 }
3121 }
3122
3123 /*
3124 * Handle specific association to the META tag
3125 */
3126 if (meta)
3127 htmlCheckMeta(ctxt, atts);
3128
3129 /*
3130 * SAX: Start of Element !
3131 */
3132 htmlnamePush(ctxt, xmlStrdup(name));
3133#ifdef DEBUG
3134 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3135#endif
3136 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3137 ctxt->sax->startElement(ctxt->userData, name, atts);
3138
3139 if (atts != NULL) {
3140 for (i = 0;i < nbatts;i++) {
3141 if (atts[i] != NULL)
3142 xmlFree((xmlChar *) atts[i]);
3143 }
3144 xmlFree((void *) atts);
3145 }
3146 if (name != NULL) xmlFree(name);
3147}
3148
3149/**
3150 * htmlParseEndTag:
3151 * @ctxt: an HTML parser context
3152 *
3153 * parse an end of tag
3154 *
3155 * [42] ETag ::= '</' Name S? '>'
3156 *
3157 * With namespace
3158 *
3159 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003160 *
3161 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003162 */
3163
Daniel Veillardf420ac52001-07-04 16:04:09 +00003164static int
Owen Taylor3473f882001-02-23 17:55:21 +00003165htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3166 xmlChar *name;
3167 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003168 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003169
3170 if ((CUR != '<') || (NXT(1) != '/')) {
3171 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3172 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3173 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003174 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003175 }
3176 SKIP(2);
3177
3178 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003179 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003180
3181 /*
3182 * We should definitely be at the ending "S? '>'" part
3183 */
3184 SKIP_BLANKS;
3185 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3186 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3187 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3188 ctxt->wellFormed = 0;
3189 } else
3190 NEXT;
3191
3192 /*
3193 * If the name read is not one of the element in the parsing stack
3194 * then return, it's just an error.
3195 */
3196 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3197 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3198 }
3199 if (i < 0) {
3200 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3201 ctxt->sax->error(ctxt->userData,
3202 "Unexpected end tag : %s\n", name);
3203 xmlFree(name);
3204 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003205 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003206 }
3207
3208
3209 /*
3210 * Check for auto-closure of HTML elements.
3211 */
3212
3213 htmlAutoCloseOnClose(ctxt, name);
3214
3215 /*
3216 * Well formedness constraints, opening and closing must match.
3217 * With the exception that the autoclose may have popped stuff out
3218 * of the stack.
3219 */
3220 if (!xmlStrEqual(name, ctxt->name)) {
3221#ifdef DEBUG
3222 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3223#endif
3224 if ((ctxt->name != NULL) &&
3225 (!xmlStrEqual(ctxt->name, name))) {
3226 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3227 ctxt->sax->error(ctxt->userData,
3228 "Opening and ending tag mismatch: %s and %s\n",
3229 name, ctxt->name);
3230 ctxt->wellFormed = 0;
3231 }
3232 }
3233
3234 /*
3235 * SAX: End of Tag
3236 */
3237 oldname = ctxt->name;
3238 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3239 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3240 ctxt->sax->endElement(ctxt->userData, name);
3241 oldname = htmlnamePop(ctxt);
3242 if (oldname != NULL) {
3243#ifdef DEBUG
3244 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3245#endif
3246 xmlFree(oldname);
3247#ifdef DEBUG
3248 } else {
3249 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3250#endif
3251 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003252 ret = 1;
3253 } else {
3254 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003255 }
3256
3257 if (name != NULL)
3258 xmlFree(name);
3259
Daniel Veillardf420ac52001-07-04 16:04:09 +00003260 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003261}
3262
3263
3264/**
3265 * htmlParseReference:
3266 * @ctxt: an HTML parser context
3267 *
3268 * parse and handle entity references in content,
3269 * this will end-up in a call to character() since this is either a
3270 * CharRef, or a predefined entity.
3271 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003272static void
Owen Taylor3473f882001-02-23 17:55:21 +00003273htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003274 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003275 xmlChar out[6];
3276 xmlChar *name;
3277 if (CUR != '&') return;
3278
3279 if (NXT(1) == '#') {
3280 unsigned int c;
3281 int bits, i = 0;
3282
3283 c = htmlParseCharRef(ctxt);
3284 if (c == 0)
3285 return;
3286
3287 if (c < 0x80) { out[i++]= c; bits= -6; }
3288 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3289 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3290 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3291
3292 for ( ; bits >= 0; bits-= 6) {
3293 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3294 }
3295 out[i] = 0;
3296
3297 htmlCheckParagraph(ctxt);
3298 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3299 ctxt->sax->characters(ctxt->userData, out, i);
3300 } else {
3301 ent = htmlParseEntityRef(ctxt, &name);
3302 if (name == NULL) {
3303 htmlCheckParagraph(ctxt);
3304 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3305 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3306 return;
3307 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003308 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003309 htmlCheckParagraph(ctxt);
3310 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3311 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3312 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3313 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3314 }
3315 } else {
3316 unsigned int c;
3317 int bits, i = 0;
3318
3319 c = ent->value;
3320 if (c < 0x80)
3321 { out[i++]= c; bits= -6; }
3322 else if (c < 0x800)
3323 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3324 else if (c < 0x10000)
3325 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3326 else
3327 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3328
3329 for ( ; bits >= 0; bits-= 6) {
3330 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3331 }
3332 out[i] = 0;
3333
3334 htmlCheckParagraph(ctxt);
3335 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3336 ctxt->sax->characters(ctxt->userData, out, i);
3337 }
3338 xmlFree(name);
3339 }
3340}
3341
3342/**
3343 * htmlParseContent:
3344 * @ctxt: an HTML parser context
3345 * @name: the node name
3346 *
3347 * Parse a content: comment, sub-element, reference or text.
3348 *
3349 */
3350
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003351static void
Owen Taylor3473f882001-02-23 17:55:21 +00003352htmlParseContent(htmlParserCtxtPtr ctxt) {
3353 xmlChar *currentNode;
3354 int depth;
3355
3356 currentNode = xmlStrdup(ctxt->name);
3357 depth = ctxt->nameNr;
3358 while (1) {
3359 long cons = ctxt->nbChars;
3360
3361 GROW;
3362 /*
3363 * Our tag or one of it's parent or children is ending.
3364 */
3365 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003366 if (htmlParseEndTag(ctxt) &&
3367 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3368 if (currentNode != NULL)
3369 xmlFree(currentNode);
3370 return;
3371 }
3372 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003373 }
3374
3375 /*
3376 * Has this node been popped out during parsing of
3377 * the next element
3378 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003379 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3380 (!xmlStrEqual(currentNode, ctxt->name)))
3381 {
Owen Taylor3473f882001-02-23 17:55:21 +00003382 if (currentNode != NULL) xmlFree(currentNode);
3383 return;
3384 }
3385
Daniel Veillardf9533d12001-03-03 10:04:57 +00003386 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3387 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003388 /*
3389 * Handle SCRIPT/STYLE separately
3390 */
3391 htmlParseScript(ctxt);
3392 } else {
3393 /*
3394 * Sometimes DOCTYPE arrives in the middle of the document
3395 */
3396 if ((CUR == '<') && (NXT(1) == '!') &&
3397 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3398 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3399 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3400 (UPP(8) == 'E')) {
3401 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3402 ctxt->sax->error(ctxt->userData,
3403 "Misplaced DOCTYPE declaration\n");
3404 ctxt->wellFormed = 0;
3405 htmlParseDocTypeDecl(ctxt);
3406 }
3407
3408 /*
3409 * First case : a comment
3410 */
3411 if ((CUR == '<') && (NXT(1) == '!') &&
3412 (NXT(2) == '-') && (NXT(3) == '-')) {
3413 htmlParseComment(ctxt);
3414 }
3415
3416 /*
3417 * Second case : a sub-element.
3418 */
3419 else if (CUR == '<') {
3420 htmlParseElement(ctxt);
3421 }
3422
3423 /*
3424 * Third case : a reference. If if has not been resolved,
3425 * parsing returns it's Name, create the node
3426 */
3427 else if (CUR == '&') {
3428 htmlParseReference(ctxt);
3429 }
3430
3431 /*
3432 * Fourth : end of the resource
3433 */
3434 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003435 htmlAutoCloseOnEnd(ctxt);
3436 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003437 }
3438
3439 /*
3440 * Last case, text. Note that References are handled directly.
3441 */
3442 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003443 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003444 }
3445
3446 if (cons == ctxt->nbChars) {
3447 if (ctxt->node != NULL) {
3448 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3449 ctxt->sax->error(ctxt->userData,
3450 "detected an error in element content\n");
3451 ctxt->wellFormed = 0;
3452 }
3453 break;
3454 }
3455 }
3456 GROW;
3457 }
3458 if (currentNode != NULL) xmlFree(currentNode);
3459}
3460
3461/**
3462 * htmlParseElement:
3463 * @ctxt: an HTML parser context
3464 *
3465 * parse an HTML element, this is highly recursive
3466 *
3467 * [39] element ::= EmptyElemTag | STag content ETag
3468 *
3469 * [41] Attribute ::= Name Eq AttValue
3470 */
3471
3472void
3473htmlParseElement(htmlParserCtxtPtr ctxt) {
3474 xmlChar *name;
3475 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003476 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003477 htmlParserNodeInfo node_info;
3478 xmlChar *oldname;
3479 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003480 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003481
3482 /* Capture start position */
3483 if (ctxt->record_info) {
3484 node_info.begin_pos = ctxt->input->consumed +
3485 (CUR_PTR - ctxt->input->base);
3486 node_info.begin_line = ctxt->input->line;
3487 }
3488
3489 oldname = xmlStrdup(ctxt->name);
3490 htmlParseStartTag(ctxt);
3491 name = ctxt->name;
3492#ifdef DEBUG
3493 if (oldname == NULL)
3494 xmlGenericError(xmlGenericErrorContext,
3495 "Start of element %s\n", name);
3496 else if (name == NULL)
3497 xmlGenericError(xmlGenericErrorContext,
3498 "Start of element failed, was %s\n", oldname);
3499 else
3500 xmlGenericError(xmlGenericErrorContext,
3501 "Start of element %s, was %s\n", name, oldname);
3502#endif
3503 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3504 (name == NULL)) {
3505 if (CUR == '>')
3506 NEXT;
3507 if (oldname != NULL)
3508 xmlFree(oldname);
3509 return;
3510 }
3511 if (oldname != NULL)
3512 xmlFree(oldname);
3513
3514 /*
3515 * Lookup the info for that element.
3516 */
3517 info = htmlTagLookup(name);
3518 if (info == NULL) {
3519 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3520 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3521 name);
3522 ctxt->wellFormed = 0;
3523 } else if (info->depr) {
3524/***************************
3525 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3526 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3527 name);
3528 ***************************/
3529 }
3530
3531 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003532 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003533 */
3534 if ((CUR == '/') && (NXT(1) == '>')) {
3535 SKIP(2);
3536 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3537 ctxt->sax->endElement(ctxt->userData, name);
3538 oldname = htmlnamePop(ctxt);
3539#ifdef DEBUG
3540 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3541#endif
3542 if (oldname != NULL)
3543 xmlFree(oldname);
3544 return;
3545 }
3546
3547 if (CUR == '>') {
3548 NEXT;
3549 } else {
3550 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3551 ctxt->sax->error(ctxt->userData,
3552 "Couldn't find end of Start Tag %s\n",
3553 name);
3554 ctxt->wellFormed = 0;
3555
3556 /*
3557 * end of parsing of this node.
3558 */
3559 if (xmlStrEqual(name, ctxt->name)) {
3560 nodePop(ctxt);
3561 oldname = htmlnamePop(ctxt);
3562#ifdef DEBUG
3563 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3564#endif
3565 if (oldname != NULL)
3566 xmlFree(oldname);
3567 }
3568
3569 /*
3570 * Capture end position and add node
3571 */
3572 if ( currentNode != NULL && ctxt->record_info ) {
3573 node_info.end_pos = ctxt->input->consumed +
3574 (CUR_PTR - ctxt->input->base);
3575 node_info.end_line = ctxt->input->line;
3576 node_info.node = ctxt->node;
3577 xmlParserAddNodeInfo(ctxt, &node_info);
3578 }
3579 return;
3580 }
3581
3582 /*
3583 * Check for an Empty Element from DTD definition
3584 */
3585 if ((info != NULL) && (info->empty)) {
3586 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3587 ctxt->sax->endElement(ctxt->userData, name);
3588 oldname = htmlnamePop(ctxt);
3589#ifdef DEBUG
3590 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3591#endif
3592 if (oldname != NULL)
3593 xmlFree(oldname);
3594 return;
3595 }
3596
3597 /*
3598 * Parse the content of the element:
3599 */
3600 currentNode = xmlStrdup(ctxt->name);
3601 depth = ctxt->nameNr;
3602 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003603 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003604 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003605 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003606 if (ctxt->nameNr < depth) break;
3607 }
3608
Owen Taylor3473f882001-02-23 17:55:21 +00003609 /*
3610 * Capture end position and add node
3611 */
3612 if ( currentNode != NULL && ctxt->record_info ) {
3613 node_info.end_pos = ctxt->input->consumed +
3614 (CUR_PTR - ctxt->input->base);
3615 node_info.end_line = ctxt->input->line;
3616 node_info.node = ctxt->node;
3617 xmlParserAddNodeInfo(ctxt, &node_info);
3618 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003619 if (!IS_CHAR(CUR)) {
3620 htmlAutoCloseOnEnd(ctxt);
3621 }
3622
Owen Taylor3473f882001-02-23 17:55:21 +00003623 if (currentNode != NULL)
3624 xmlFree(currentNode);
3625}
3626
3627/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003628 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003629 * @ctxt: an HTML parser context
3630 *
3631 * parse an HTML document (and build a tree if using the standard SAX
3632 * interface).
3633 *
3634 * Returns 0, -1 in case of error. the parser context is augmented
3635 * as a result of the parsing.
3636 */
3637
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003638int
Owen Taylor3473f882001-02-23 17:55:21 +00003639htmlParseDocument(htmlParserCtxtPtr ctxt) {
3640 xmlDtdPtr dtd;
3641
Daniel Veillardd0463562001-10-13 09:15:48 +00003642 xmlInitParser();
3643
Owen Taylor3473f882001-02-23 17:55:21 +00003644 htmlDefaultSAXHandlerInit();
3645 ctxt->html = 1;
3646
3647 GROW;
3648 /*
3649 * SAX: beginning of the document processing.
3650 */
3651 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3652 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3653
3654 /*
3655 * Wipe out everything which is before the first '<'
3656 */
3657 SKIP_BLANKS;
3658 if (CUR == 0) {
3659 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3660 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3661 ctxt->wellFormed = 0;
3662 }
3663
3664 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3665 ctxt->sax->startDocument(ctxt->userData);
3666
3667
3668 /*
3669 * Parse possible comments before any content
3670 */
3671 while ((CUR == '<') && (NXT(1) == '!') &&
3672 (NXT(2) == '-') && (NXT(3) == '-')) {
3673 htmlParseComment(ctxt);
3674 SKIP_BLANKS;
3675 }
3676
3677
3678 /*
3679 * Then possibly doc type declaration(s) and more Misc
3680 * (doctypedecl Misc*)?
3681 */
3682 if ((CUR == '<') && (NXT(1) == '!') &&
3683 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3684 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3685 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3686 (UPP(8) == 'E')) {
3687 htmlParseDocTypeDecl(ctxt);
3688 }
3689 SKIP_BLANKS;
3690
3691 /*
3692 * Parse possible comments before any content
3693 */
3694 while ((CUR == '<') && (NXT(1) == '!') &&
3695 (NXT(2) == '-') && (NXT(3) == '-')) {
3696 htmlParseComment(ctxt);
3697 SKIP_BLANKS;
3698 }
3699
3700 /*
3701 * Time to start parsing the tree itself
3702 */
3703 htmlParseContent(ctxt);
3704
3705 /*
3706 * autoclose
3707 */
3708 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003709 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003710
3711
3712 /*
3713 * SAX: end of the document processing.
3714 */
3715 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3716 ctxt->sax->endDocument(ctxt->userData);
3717
3718 if (ctxt->myDoc != NULL) {
3719 dtd = xmlGetIntSubset(ctxt->myDoc);
3720 if (dtd == NULL)
3721 ctxt->myDoc->intSubset =
3722 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3723 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3724 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3725 }
3726 if (! ctxt->wellFormed) return(-1);
3727 return(0);
3728}
3729
3730
3731/************************************************************************
3732 * *
3733 * Parser contexts handling *
3734 * *
3735 ************************************************************************/
3736
3737/**
3738 * xmlInitParserCtxt:
3739 * @ctxt: an HTML parser context
3740 *
3741 * Initialize a parser context
3742 */
3743
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003744static void
Owen Taylor3473f882001-02-23 17:55:21 +00003745htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3746{
3747 htmlSAXHandler *sax;
3748
3749 if (ctxt == NULL) return;
3750 memset(ctxt, 0, sizeof(htmlParserCtxt));
3751
3752 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3753 if (sax == NULL) {
3754 xmlGenericError(xmlGenericErrorContext,
3755 "htmlInitParserCtxt: out of memory\n");
3756 }
3757 else
3758 memset(sax, 0, sizeof(htmlSAXHandler));
3759
3760 /* Allocate the Input stack */
3761 ctxt->inputTab = (htmlParserInputPtr *)
3762 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3763 if (ctxt->inputTab == NULL) {
3764 xmlGenericError(xmlGenericErrorContext,
3765 "htmlInitParserCtxt: out of memory\n");
3766 ctxt->inputNr = 0;
3767 ctxt->inputMax = 0;
3768 ctxt->input = NULL;
3769 return;
3770 }
3771 ctxt->inputNr = 0;
3772 ctxt->inputMax = 5;
3773 ctxt->input = NULL;
3774 ctxt->version = NULL;
3775 ctxt->encoding = NULL;
3776 ctxt->standalone = -1;
3777 ctxt->instate = XML_PARSER_START;
3778
3779 /* Allocate the Node stack */
3780 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3781 if (ctxt->nodeTab == NULL) {
3782 xmlGenericError(xmlGenericErrorContext,
3783 "htmlInitParserCtxt: out of memory\n");
3784 ctxt->nodeNr = 0;
3785 ctxt->nodeMax = 0;
3786 ctxt->node = NULL;
3787 ctxt->inputNr = 0;
3788 ctxt->inputMax = 0;
3789 ctxt->input = NULL;
3790 return;
3791 }
3792 ctxt->nodeNr = 0;
3793 ctxt->nodeMax = 10;
3794 ctxt->node = NULL;
3795
3796 /* Allocate the Name stack */
3797 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3798 if (ctxt->nameTab == NULL) {
3799 xmlGenericError(xmlGenericErrorContext,
3800 "htmlInitParserCtxt: out of memory\n");
3801 ctxt->nameNr = 0;
3802 ctxt->nameMax = 10;
3803 ctxt->name = NULL;
3804 ctxt->nodeNr = 0;
3805 ctxt->nodeMax = 0;
3806 ctxt->node = NULL;
3807 ctxt->inputNr = 0;
3808 ctxt->inputMax = 0;
3809 ctxt->input = NULL;
3810 return;
3811 }
3812 ctxt->nameNr = 0;
3813 ctxt->nameMax = 10;
3814 ctxt->name = NULL;
3815
3816 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3817 else {
3818 ctxt->sax = sax;
3819 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3820 }
3821 ctxt->userData = ctxt;
3822 ctxt->myDoc = NULL;
3823 ctxt->wellFormed = 1;
3824 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003825 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003826 ctxt->html = 1;
3827 ctxt->record_info = 0;
3828 ctxt->validate = 0;
3829 ctxt->nbChars = 0;
3830 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003831 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003832 xmlInitNodeInfoSeq(&ctxt->node_seq);
3833}
3834
3835/**
3836 * htmlFreeParserCtxt:
3837 * @ctxt: an HTML parser context
3838 *
3839 * Free all the memory used by a parser context. However the parsed
3840 * document in ctxt->myDoc is not freed.
3841 */
3842
3843void
3844htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3845{
3846 xmlFreeParserCtxt(ctxt);
3847}
3848
3849/**
Daniel Veillard1d995272002-07-22 16:43:32 +00003850 * htmlNewParserCtxt:
3851 *
3852 * Allocate and initialize a new parser context.
3853 *
3854 * Returns the xmlParserCtxtPtr or NULL
3855 */
3856
3857static htmlParserCtxtPtr
3858htmlNewParserCtxt(void)
3859{
3860 xmlParserCtxtPtr ctxt;
3861
3862 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3863 if (ctxt == NULL) {
3864 xmlGenericError(xmlGenericErrorContext,
3865 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00003866 return(NULL);
3867 }
3868 memset(ctxt, 0, sizeof(xmlParserCtxt));
3869 htmlInitParserCtxt(ctxt);
3870 return(ctxt);
3871}
3872
3873/**
3874 * htmlCreateMemoryParserCtxt:
3875 * @buffer: a pointer to a char array
3876 * @size: the size of the array
3877 *
3878 * Create a parser context for an HTML in-memory document.
3879 *
3880 * Returns the new parser context or NULL
3881 */
3882static htmlParserCtxtPtr
3883htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3884 xmlParserCtxtPtr ctxt;
3885 xmlParserInputPtr input;
3886 xmlParserInputBufferPtr buf;
3887
3888 if (buffer == NULL)
3889 return(NULL);
3890 if (size <= 0)
3891 return(NULL);
3892
3893 ctxt = htmlNewParserCtxt();
3894 if (ctxt == NULL)
3895 return(NULL);
3896
3897 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3898 if (buf == NULL) return(NULL);
3899
3900 input = xmlNewInputStream(ctxt);
3901 if (input == NULL) {
3902 xmlFreeParserCtxt(ctxt);
3903 return(NULL);
3904 }
3905
3906 input->filename = NULL;
3907 input->buf = buf;
3908 input->base = input->buf->buffer->content;
3909 input->cur = input->buf->buffer->content;
3910 input->end = &input->buf->buffer->content[input->buf->buffer->use];
3911
3912 inputPush(ctxt, input);
3913 return(ctxt);
3914}
3915
3916/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003917 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00003918 * @cur: a pointer to an array of xmlChar
3919 * @encoding: a free form C string describing the HTML document encoding, or NULL
3920 *
3921 * Create a parser context for an HTML document.
3922 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003923 * TODO: check the need to add encoding handling there
3924 *
Owen Taylor3473f882001-02-23 17:55:21 +00003925 * Returns the new parser context or NULL
3926 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003927static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003928htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00003929 int len;
Owen Taylor3473f882001-02-23 17:55:21 +00003930
Daniel Veillard1d995272002-07-22 16:43:32 +00003931 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00003932 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00003933 len = xmlStrlen(cur);
3934 return(htmlCreateMemoryParserCtxt((char *)cur, len));
Owen Taylor3473f882001-02-23 17:55:21 +00003935}
3936
3937/************************************************************************
3938 * *
3939 * Progressive parsing interfaces *
3940 * *
3941 ************************************************************************/
3942
3943/**
3944 * htmlParseLookupSequence:
3945 * @ctxt: an HTML parser context
3946 * @first: the first char to lookup
3947 * @next: the next char to lookup or zero
3948 * @third: the next char to lookup or zero
3949 *
3950 * Try to find if a sequence (first, next, third) or just (first next) or
3951 * (first) is available in the input stream.
3952 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3953 * to avoid rescanning sequences of bytes, it DOES change the state of the
3954 * parser, do not use liberally.
3955 * This is basically similar to xmlParseLookupSequence()
3956 *
3957 * Returns the index to the current parsing point if the full sequence
3958 * is available, -1 otherwise.
3959 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003960static int
Owen Taylor3473f882001-02-23 17:55:21 +00003961htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3962 xmlChar next, xmlChar third) {
3963 int base, len;
3964 htmlParserInputPtr in;
3965 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003966 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003967
3968 in = ctxt->input;
3969 if (in == NULL) return(-1);
3970 base = in->cur - in->base;
3971 if (base < 0) return(-1);
3972 if (ctxt->checkIndex > base)
3973 base = ctxt->checkIndex;
3974 if (in->buf == NULL) {
3975 buf = in->base;
3976 len = in->length;
3977 } else {
3978 buf = in->buf->buffer->content;
3979 len = in->buf->buffer->use;
3980 }
3981 /* take into account the sequence length */
3982 if (third) len -= 2;
3983 else if (next) len --;
3984 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003985 if (!incomment && (base + 4 < len)) {
3986 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3987 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3988 incomment = 1;
3989 }
3990 /* do not increment base, some people use <!--> */
3991 }
3992 if (incomment) {
3993 if (base + 3 < len)
3994 return(-1);
3995 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3996 (buf[base + 2] == '>')) {
3997 incomment = 0;
3998 base += 2;
3999 }
4000 continue;
4001 }
Owen Taylor3473f882001-02-23 17:55:21 +00004002 if (buf[base] == first) {
4003 if (third != 0) {
4004 if ((buf[base + 1] != next) ||
4005 (buf[base + 2] != third)) continue;
4006 } else if (next != 0) {
4007 if (buf[base + 1] != next) continue;
4008 }
4009 ctxt->checkIndex = 0;
4010#ifdef DEBUG_PUSH
4011 if (next == 0)
4012 xmlGenericError(xmlGenericErrorContext,
4013 "HPP: lookup '%c' found at %d\n",
4014 first, base);
4015 else if (third == 0)
4016 xmlGenericError(xmlGenericErrorContext,
4017 "HPP: lookup '%c%c' found at %d\n",
4018 first, next, base);
4019 else
4020 xmlGenericError(xmlGenericErrorContext,
4021 "HPP: lookup '%c%c%c' found at %d\n",
4022 first, next, third, base);
4023#endif
4024 return(base - (in->cur - in->base));
4025 }
4026 }
4027 ctxt->checkIndex = base;
4028#ifdef DEBUG_PUSH
4029 if (next == 0)
4030 xmlGenericError(xmlGenericErrorContext,
4031 "HPP: lookup '%c' failed\n", first);
4032 else if (third == 0)
4033 xmlGenericError(xmlGenericErrorContext,
4034 "HPP: lookup '%c%c' failed\n", first, next);
4035 else
4036 xmlGenericError(xmlGenericErrorContext,
4037 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4038#endif
4039 return(-1);
4040}
4041
4042/**
4043 * htmlParseTryOrFinish:
4044 * @ctxt: an HTML parser context
4045 * @terminate: last chunk indicator
4046 *
4047 * Try to progress on parsing
4048 *
4049 * Returns zero if no parsing was possible
4050 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004051static int
Owen Taylor3473f882001-02-23 17:55:21 +00004052htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4053 int ret = 0;
4054 htmlParserInputPtr in;
4055 int avail = 0;
4056 xmlChar cur, next;
4057
4058#ifdef DEBUG_PUSH
4059 switch (ctxt->instate) {
4060 case XML_PARSER_EOF:
4061 xmlGenericError(xmlGenericErrorContext,
4062 "HPP: try EOF\n"); break;
4063 case XML_PARSER_START:
4064 xmlGenericError(xmlGenericErrorContext,
4065 "HPP: try START\n"); break;
4066 case XML_PARSER_MISC:
4067 xmlGenericError(xmlGenericErrorContext,
4068 "HPP: try MISC\n");break;
4069 case XML_PARSER_COMMENT:
4070 xmlGenericError(xmlGenericErrorContext,
4071 "HPP: try COMMENT\n");break;
4072 case XML_PARSER_PROLOG:
4073 xmlGenericError(xmlGenericErrorContext,
4074 "HPP: try PROLOG\n");break;
4075 case XML_PARSER_START_TAG:
4076 xmlGenericError(xmlGenericErrorContext,
4077 "HPP: try START_TAG\n");break;
4078 case XML_PARSER_CONTENT:
4079 xmlGenericError(xmlGenericErrorContext,
4080 "HPP: try CONTENT\n");break;
4081 case XML_PARSER_CDATA_SECTION:
4082 xmlGenericError(xmlGenericErrorContext,
4083 "HPP: try CDATA_SECTION\n");break;
4084 case XML_PARSER_END_TAG:
4085 xmlGenericError(xmlGenericErrorContext,
4086 "HPP: try END_TAG\n");break;
4087 case XML_PARSER_ENTITY_DECL:
4088 xmlGenericError(xmlGenericErrorContext,
4089 "HPP: try ENTITY_DECL\n");break;
4090 case XML_PARSER_ENTITY_VALUE:
4091 xmlGenericError(xmlGenericErrorContext,
4092 "HPP: try ENTITY_VALUE\n");break;
4093 case XML_PARSER_ATTRIBUTE_VALUE:
4094 xmlGenericError(xmlGenericErrorContext,
4095 "HPP: try ATTRIBUTE_VALUE\n");break;
4096 case XML_PARSER_DTD:
4097 xmlGenericError(xmlGenericErrorContext,
4098 "HPP: try DTD\n");break;
4099 case XML_PARSER_EPILOG:
4100 xmlGenericError(xmlGenericErrorContext,
4101 "HPP: try EPILOG\n");break;
4102 case XML_PARSER_PI:
4103 xmlGenericError(xmlGenericErrorContext,
4104 "HPP: try PI\n");break;
4105 case XML_PARSER_SYSTEM_LITERAL:
4106 xmlGenericError(xmlGenericErrorContext,
4107 "HPP: try SYSTEM_LITERAL\n");break;
4108 }
4109#endif
4110
4111 while (1) {
4112
4113 in = ctxt->input;
4114 if (in == NULL) break;
4115 if (in->buf == NULL)
4116 avail = in->length - (in->cur - in->base);
4117 else
4118 avail = in->buf->buffer->use - (in->cur - in->base);
4119 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004120 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004121 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4122 /*
4123 * SAX: end of the document processing.
4124 */
4125 ctxt->instate = XML_PARSER_EOF;
4126 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4127 ctxt->sax->endDocument(ctxt->userData);
4128 }
4129 }
4130 if (avail < 1)
4131 goto done;
4132 switch (ctxt->instate) {
4133 case XML_PARSER_EOF:
4134 /*
4135 * Document parsing is done !
4136 */
4137 goto done;
4138 case XML_PARSER_START:
4139 /*
4140 * Very first chars read from the document flow.
4141 */
4142 cur = in->cur[0];
4143 if (IS_BLANK(cur)) {
4144 SKIP_BLANKS;
4145 if (in->buf == NULL)
4146 avail = in->length - (in->cur - in->base);
4147 else
4148 avail = in->buf->buffer->use - (in->cur - in->base);
4149 }
4150 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4151 ctxt->sax->setDocumentLocator(ctxt->userData,
4152 &xmlDefaultSAXLocator);
4153 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4154 (!ctxt->disableSAX))
4155 ctxt->sax->startDocument(ctxt->userData);
4156
4157 cur = in->cur[0];
4158 next = in->cur[1];
4159 if ((cur == '<') && (next == '!') &&
4160 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4161 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4162 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4163 (UPP(8) == 'E')) {
4164 if ((!terminate) &&
4165 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4166 goto done;
4167#ifdef DEBUG_PUSH
4168 xmlGenericError(xmlGenericErrorContext,
4169 "HPP: Parsing internal subset\n");
4170#endif
4171 htmlParseDocTypeDecl(ctxt);
4172 ctxt->instate = XML_PARSER_PROLOG;
4173#ifdef DEBUG_PUSH
4174 xmlGenericError(xmlGenericErrorContext,
4175 "HPP: entering PROLOG\n");
4176#endif
4177 } else {
4178 ctxt->instate = XML_PARSER_MISC;
4179 }
4180#ifdef DEBUG_PUSH
4181 xmlGenericError(xmlGenericErrorContext,
4182 "HPP: entering MISC\n");
4183#endif
4184 break;
4185 case XML_PARSER_MISC:
4186 SKIP_BLANKS;
4187 if (in->buf == NULL)
4188 avail = in->length - (in->cur - in->base);
4189 else
4190 avail = in->buf->buffer->use - (in->cur - in->base);
4191 if (avail < 2)
4192 goto done;
4193 cur = in->cur[0];
4194 next = in->cur[1];
4195 if ((cur == '<') && (next == '!') &&
4196 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4197 if ((!terminate) &&
4198 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4199 goto done;
4200#ifdef DEBUG_PUSH
4201 xmlGenericError(xmlGenericErrorContext,
4202 "HPP: Parsing Comment\n");
4203#endif
4204 htmlParseComment(ctxt);
4205 ctxt->instate = XML_PARSER_MISC;
4206 } else if ((cur == '<') && (next == '!') &&
4207 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4208 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4209 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4210 (UPP(8) == 'E')) {
4211 if ((!terminate) &&
4212 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4213 goto done;
4214#ifdef DEBUG_PUSH
4215 xmlGenericError(xmlGenericErrorContext,
4216 "HPP: Parsing internal subset\n");
4217#endif
4218 htmlParseDocTypeDecl(ctxt);
4219 ctxt->instate = XML_PARSER_PROLOG;
4220#ifdef DEBUG_PUSH
4221 xmlGenericError(xmlGenericErrorContext,
4222 "HPP: entering PROLOG\n");
4223#endif
4224 } else if ((cur == '<') && (next == '!') &&
4225 (avail < 9)) {
4226 goto done;
4227 } else {
4228 ctxt->instate = XML_PARSER_START_TAG;
4229#ifdef DEBUG_PUSH
4230 xmlGenericError(xmlGenericErrorContext,
4231 "HPP: entering START_TAG\n");
4232#endif
4233 }
4234 break;
4235 case XML_PARSER_PROLOG:
4236 SKIP_BLANKS;
4237 if (in->buf == NULL)
4238 avail = in->length - (in->cur - in->base);
4239 else
4240 avail = in->buf->buffer->use - (in->cur - in->base);
4241 if (avail < 2)
4242 goto done;
4243 cur = in->cur[0];
4244 next = in->cur[1];
4245 if ((cur == '<') && (next == '!') &&
4246 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4247 if ((!terminate) &&
4248 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4249 goto done;
4250#ifdef DEBUG_PUSH
4251 xmlGenericError(xmlGenericErrorContext,
4252 "HPP: Parsing Comment\n");
4253#endif
4254 htmlParseComment(ctxt);
4255 ctxt->instate = XML_PARSER_PROLOG;
4256 } else if ((cur == '<') && (next == '!') &&
4257 (avail < 4)) {
4258 goto done;
4259 } else {
4260 ctxt->instate = XML_PARSER_START_TAG;
4261#ifdef DEBUG_PUSH
4262 xmlGenericError(xmlGenericErrorContext,
4263 "HPP: entering START_TAG\n");
4264#endif
4265 }
4266 break;
4267 case XML_PARSER_EPILOG:
4268 if (in->buf == NULL)
4269 avail = in->length - (in->cur - in->base);
4270 else
4271 avail = in->buf->buffer->use - (in->cur - in->base);
4272 if (avail < 1)
4273 goto done;
4274 cur = in->cur[0];
4275 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004276 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004277 goto done;
4278 }
4279 if (avail < 2)
4280 goto done;
4281 next = in->cur[1];
4282 if ((cur == '<') && (next == '!') &&
4283 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4284 if ((!terminate) &&
4285 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4286 goto done;
4287#ifdef DEBUG_PUSH
4288 xmlGenericError(xmlGenericErrorContext,
4289 "HPP: Parsing Comment\n");
4290#endif
4291 htmlParseComment(ctxt);
4292 ctxt->instate = XML_PARSER_EPILOG;
4293 } else if ((cur == '<') && (next == '!') &&
4294 (avail < 4)) {
4295 goto done;
4296 } else {
4297 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004298 ctxt->wellFormed = 0;
4299 ctxt->instate = XML_PARSER_EOF;
4300#ifdef DEBUG_PUSH
4301 xmlGenericError(xmlGenericErrorContext,
4302 "HPP: entering EOF\n");
4303#endif
4304 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4305 ctxt->sax->endDocument(ctxt->userData);
4306 goto done;
4307 }
4308 break;
4309 case XML_PARSER_START_TAG: {
4310 xmlChar *name, *oldname;
4311 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004312 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004313
4314 if (avail < 2)
4315 goto done;
4316 cur = in->cur[0];
4317 if (cur != '<') {
4318 ctxt->instate = XML_PARSER_CONTENT;
4319#ifdef DEBUG_PUSH
4320 xmlGenericError(xmlGenericErrorContext,
4321 "HPP: entering CONTENT\n");
4322#endif
4323 break;
4324 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004325 if (in->cur[1] == '/') {
4326 ctxt->instate = XML_PARSER_END_TAG;
4327 ctxt->checkIndex = 0;
4328#ifdef DEBUG_PUSH
4329 xmlGenericError(xmlGenericErrorContext,
4330 "HPP: entering END_TAG\n");
4331#endif
4332 break;
4333 }
Owen Taylor3473f882001-02-23 17:55:21 +00004334 if ((!terminate) &&
4335 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4336 goto done;
4337
4338 oldname = xmlStrdup(ctxt->name);
4339 htmlParseStartTag(ctxt);
4340 name = ctxt->name;
4341#ifdef DEBUG
4342 if (oldname == NULL)
4343 xmlGenericError(xmlGenericErrorContext,
4344 "Start of element %s\n", name);
4345 else if (name == NULL)
4346 xmlGenericError(xmlGenericErrorContext,
4347 "Start of element failed, was %s\n",
4348 oldname);
4349 else
4350 xmlGenericError(xmlGenericErrorContext,
4351 "Start of element %s, was %s\n",
4352 name, oldname);
4353#endif
4354 if (((depth == ctxt->nameNr) &&
4355 (xmlStrEqual(oldname, ctxt->name))) ||
4356 (name == NULL)) {
4357 if (CUR == '>')
4358 NEXT;
4359 if (oldname != NULL)
4360 xmlFree(oldname);
4361 break;
4362 }
4363 if (oldname != NULL)
4364 xmlFree(oldname);
4365
4366 /*
4367 * Lookup the info for that element.
4368 */
4369 info = htmlTagLookup(name);
4370 if (info == NULL) {
4371 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4372 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4373 name);
4374 ctxt->wellFormed = 0;
4375 } else if (info->depr) {
4376 /***************************
4377 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4378 ctxt->sax->warning(ctxt->userData,
4379 "Tag %s is deprecated\n",
4380 name);
4381 ***************************/
4382 }
4383
4384 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004385 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004386 */
4387 if ((CUR == '/') && (NXT(1) == '>')) {
4388 SKIP(2);
4389 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4390 ctxt->sax->endElement(ctxt->userData, name);
4391 oldname = htmlnamePop(ctxt);
4392#ifdef DEBUG
4393 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4394 oldname);
4395#endif
4396 if (oldname != NULL)
4397 xmlFree(oldname);
4398 ctxt->instate = XML_PARSER_CONTENT;
4399#ifdef DEBUG_PUSH
4400 xmlGenericError(xmlGenericErrorContext,
4401 "HPP: entering CONTENT\n");
4402#endif
4403 break;
4404 }
4405
4406 if (CUR == '>') {
4407 NEXT;
4408 } else {
4409 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4410 ctxt->sax->error(ctxt->userData,
4411 "Couldn't find end of Start Tag %s\n",
4412 name);
4413 ctxt->wellFormed = 0;
4414
4415 /*
4416 * end of parsing of this node.
4417 */
4418 if (xmlStrEqual(name, ctxt->name)) {
4419 nodePop(ctxt);
4420 oldname = htmlnamePop(ctxt);
4421#ifdef DEBUG
4422 xmlGenericError(xmlGenericErrorContext,
4423 "End of start tag problem: popping out %s\n", oldname);
4424#endif
4425 if (oldname != NULL)
4426 xmlFree(oldname);
4427 }
4428
4429 ctxt->instate = XML_PARSER_CONTENT;
4430#ifdef DEBUG_PUSH
4431 xmlGenericError(xmlGenericErrorContext,
4432 "HPP: entering CONTENT\n");
4433#endif
4434 break;
4435 }
4436
4437 /*
4438 * Check for an Empty Element from DTD definition
4439 */
4440 if ((info != NULL) && (info->empty)) {
4441 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4442 ctxt->sax->endElement(ctxt->userData, name);
4443 oldname = htmlnamePop(ctxt);
4444#ifdef DEBUG
4445 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4446#endif
4447 if (oldname != NULL)
4448 xmlFree(oldname);
4449 }
4450 ctxt->instate = XML_PARSER_CONTENT;
4451#ifdef DEBUG_PUSH
4452 xmlGenericError(xmlGenericErrorContext,
4453 "HPP: entering CONTENT\n");
4454#endif
4455 break;
4456 }
4457 case XML_PARSER_CONTENT: {
4458 long cons;
4459 /*
4460 * Handle preparsed entities and charRef
4461 */
4462 if (ctxt->token != 0) {
4463 xmlChar chr[2] = { 0 , 0 } ;
4464
4465 chr[0] = (xmlChar) ctxt->token;
4466 htmlCheckParagraph(ctxt);
4467 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4468 ctxt->sax->characters(ctxt->userData, chr, 1);
4469 ctxt->token = 0;
4470 ctxt->checkIndex = 0;
4471 }
4472 if ((avail == 1) && (terminate)) {
4473 cur = in->cur[0];
4474 if ((cur != '<') && (cur != '&')) {
4475 if (ctxt->sax != NULL) {
4476 if (IS_BLANK(cur)) {
4477 if (ctxt->sax->ignorableWhitespace != NULL)
4478 ctxt->sax->ignorableWhitespace(
4479 ctxt->userData, &cur, 1);
4480 } else {
4481 htmlCheckParagraph(ctxt);
4482 if (ctxt->sax->characters != NULL)
4483 ctxt->sax->characters(
4484 ctxt->userData, &cur, 1);
4485 }
4486 }
4487 ctxt->token = 0;
4488 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004489 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004490 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004491 }
Owen Taylor3473f882001-02-23 17:55:21 +00004492 }
4493 if (avail < 2)
4494 goto done;
4495 cur = in->cur[0];
4496 next = in->cur[1];
4497 cons = ctxt->nbChars;
4498 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4499 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4500 /*
4501 * Handle SCRIPT/STYLE separately
4502 */
4503 if ((!terminate) &&
4504 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4505 goto done;
4506 htmlParseScript(ctxt);
4507 if ((cur == '<') && (next == '/')) {
4508 ctxt->instate = XML_PARSER_END_TAG;
4509 ctxt->checkIndex = 0;
4510#ifdef DEBUG_PUSH
4511 xmlGenericError(xmlGenericErrorContext,
4512 "HPP: entering END_TAG\n");
4513#endif
4514 break;
4515 }
4516 } else {
4517 /*
4518 * Sometimes DOCTYPE arrives in the middle of the document
4519 */
4520 if ((cur == '<') && (next == '!') &&
4521 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4522 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4523 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4524 (UPP(8) == 'E')) {
4525 if ((!terminate) &&
4526 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4527 goto done;
4528 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4529 ctxt->sax->error(ctxt->userData,
4530 "Misplaced DOCTYPE declaration\n");
4531 ctxt->wellFormed = 0;
4532 htmlParseDocTypeDecl(ctxt);
4533 } else if ((cur == '<') && (next == '!') &&
4534 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4535 if ((!terminate) &&
4536 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4537 goto done;
4538#ifdef DEBUG_PUSH
4539 xmlGenericError(xmlGenericErrorContext,
4540 "HPP: Parsing Comment\n");
4541#endif
4542 htmlParseComment(ctxt);
4543 ctxt->instate = XML_PARSER_CONTENT;
4544 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4545 goto done;
4546 } else if ((cur == '<') && (next == '/')) {
4547 ctxt->instate = XML_PARSER_END_TAG;
4548 ctxt->checkIndex = 0;
4549#ifdef DEBUG_PUSH
4550 xmlGenericError(xmlGenericErrorContext,
4551 "HPP: entering END_TAG\n");
4552#endif
4553 break;
4554 } else if (cur == '<') {
4555 ctxt->instate = XML_PARSER_START_TAG;
4556 ctxt->checkIndex = 0;
4557#ifdef DEBUG_PUSH
4558 xmlGenericError(xmlGenericErrorContext,
4559 "HPP: entering START_TAG\n");
4560#endif
4561 break;
4562 } else if (cur == '&') {
4563 if ((!terminate) &&
4564 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4565 goto done;
4566#ifdef DEBUG_PUSH
4567 xmlGenericError(xmlGenericErrorContext,
4568 "HPP: Parsing Reference\n");
4569#endif
4570 /* TODO: check generation of subtrees if noent !!! */
4571 htmlParseReference(ctxt);
4572 } else {
4573 /* TODO Avoid the extra copy, handle directly !!!!!! */
4574 /*
Daniel Veillard01c13b52002-12-10 15:19:08 +00004575 * Goal of the following test is:
Owen Taylor3473f882001-02-23 17:55:21 +00004576 * - minimize calls to the SAX 'character' callback
4577 * when they are mergeable
4578 */
4579 if ((ctxt->inputNr == 1) &&
4580 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4581 if ((!terminate) &&
4582 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4583 goto done;
4584 }
4585 ctxt->checkIndex = 0;
4586#ifdef DEBUG_PUSH
4587 xmlGenericError(xmlGenericErrorContext,
4588 "HPP: Parsing char data\n");
4589#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004590 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004591 }
4592 }
4593 if (cons == ctxt->nbChars) {
4594 if (ctxt->node != NULL) {
4595 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4596 ctxt->sax->error(ctxt->userData,
4597 "detected an error in element content\n");
4598 ctxt->wellFormed = 0;
4599 }
4600 NEXT;
4601 break;
4602 }
4603
4604 break;
4605 }
4606 case XML_PARSER_END_TAG:
4607 if (avail < 2)
4608 goto done;
4609 if ((!terminate) &&
4610 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4611 goto done;
4612 htmlParseEndTag(ctxt);
4613 if (ctxt->nameNr == 0) {
4614 ctxt->instate = XML_PARSER_EPILOG;
4615 } else {
4616 ctxt->instate = XML_PARSER_CONTENT;
4617 }
4618 ctxt->checkIndex = 0;
4619#ifdef DEBUG_PUSH
4620 xmlGenericError(xmlGenericErrorContext,
4621 "HPP: entering CONTENT\n");
4622#endif
4623 break;
4624 case XML_PARSER_CDATA_SECTION:
4625 xmlGenericError(xmlGenericErrorContext,
4626 "HPP: internal error, state == CDATA\n");
4627 ctxt->instate = XML_PARSER_CONTENT;
4628 ctxt->checkIndex = 0;
4629#ifdef DEBUG_PUSH
4630 xmlGenericError(xmlGenericErrorContext,
4631 "HPP: entering CONTENT\n");
4632#endif
4633 break;
4634 case XML_PARSER_DTD:
4635 xmlGenericError(xmlGenericErrorContext,
4636 "HPP: internal error, state == DTD\n");
4637 ctxt->instate = XML_PARSER_CONTENT;
4638 ctxt->checkIndex = 0;
4639#ifdef DEBUG_PUSH
4640 xmlGenericError(xmlGenericErrorContext,
4641 "HPP: entering CONTENT\n");
4642#endif
4643 break;
4644 case XML_PARSER_COMMENT:
4645 xmlGenericError(xmlGenericErrorContext,
4646 "HPP: internal error, state == COMMENT\n");
4647 ctxt->instate = XML_PARSER_CONTENT;
4648 ctxt->checkIndex = 0;
4649#ifdef DEBUG_PUSH
4650 xmlGenericError(xmlGenericErrorContext,
4651 "HPP: entering CONTENT\n");
4652#endif
4653 break;
4654 case XML_PARSER_PI:
4655 xmlGenericError(xmlGenericErrorContext,
4656 "HPP: internal error, state == PI\n");
4657 ctxt->instate = XML_PARSER_CONTENT;
4658 ctxt->checkIndex = 0;
4659#ifdef DEBUG_PUSH
4660 xmlGenericError(xmlGenericErrorContext,
4661 "HPP: entering CONTENT\n");
4662#endif
4663 break;
4664 case XML_PARSER_ENTITY_DECL:
4665 xmlGenericError(xmlGenericErrorContext,
4666 "HPP: internal error, state == ENTITY_DECL\n");
4667 ctxt->instate = XML_PARSER_CONTENT;
4668 ctxt->checkIndex = 0;
4669#ifdef DEBUG_PUSH
4670 xmlGenericError(xmlGenericErrorContext,
4671 "HPP: entering CONTENT\n");
4672#endif
4673 break;
4674 case XML_PARSER_ENTITY_VALUE:
4675 xmlGenericError(xmlGenericErrorContext,
4676 "HPP: internal error, state == ENTITY_VALUE\n");
4677 ctxt->instate = XML_PARSER_CONTENT;
4678 ctxt->checkIndex = 0;
4679#ifdef DEBUG_PUSH
4680 xmlGenericError(xmlGenericErrorContext,
4681 "HPP: entering DTD\n");
4682#endif
4683 break;
4684 case XML_PARSER_ATTRIBUTE_VALUE:
4685 xmlGenericError(xmlGenericErrorContext,
4686 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4687 ctxt->instate = XML_PARSER_START_TAG;
4688 ctxt->checkIndex = 0;
4689#ifdef DEBUG_PUSH
4690 xmlGenericError(xmlGenericErrorContext,
4691 "HPP: entering START_TAG\n");
4692#endif
4693 break;
4694 case XML_PARSER_SYSTEM_LITERAL:
4695 xmlGenericError(xmlGenericErrorContext,
4696 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4697 ctxt->instate = XML_PARSER_CONTENT;
4698 ctxt->checkIndex = 0;
4699#ifdef DEBUG_PUSH
4700 xmlGenericError(xmlGenericErrorContext,
4701 "HPP: entering CONTENT\n");
4702#endif
4703 break;
4704 case XML_PARSER_IGNORE:
4705 xmlGenericError(xmlGenericErrorContext,
4706 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4707 ctxt->instate = XML_PARSER_CONTENT;
4708 ctxt->checkIndex = 0;
4709#ifdef DEBUG_PUSH
4710 xmlGenericError(xmlGenericErrorContext,
4711 "HPP: entering CONTENT\n");
4712#endif
4713 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004714 case XML_PARSER_PUBLIC_LITERAL:
4715 xmlGenericError(xmlGenericErrorContext,
4716 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4717 ctxt->instate = XML_PARSER_CONTENT;
4718 ctxt->checkIndex = 0;
4719#ifdef DEBUG_PUSH
4720 xmlGenericError(xmlGenericErrorContext,
4721 "HPP: entering CONTENT\n");
4722#endif
4723 break;
4724
Owen Taylor3473f882001-02-23 17:55:21 +00004725 }
4726 }
4727done:
4728 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004729 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004730 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4731 /*
4732 * SAX: end of the document processing.
4733 */
4734 ctxt->instate = XML_PARSER_EOF;
4735 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4736 ctxt->sax->endDocument(ctxt->userData);
4737 }
4738 }
4739 if ((ctxt->myDoc != NULL) &&
4740 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4741 (ctxt->instate == XML_PARSER_EPILOG))) {
4742 xmlDtdPtr dtd;
4743 dtd = xmlGetIntSubset(ctxt->myDoc);
4744 if (dtd == NULL)
4745 ctxt->myDoc->intSubset =
4746 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4747 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4748 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4749 }
4750#ifdef DEBUG_PUSH
4751 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4752#endif
4753 return(ret);
4754}
4755
4756/**
Owen Taylor3473f882001-02-23 17:55:21 +00004757 * htmlParseChunk:
4758 * @ctxt: an XML parser context
4759 * @chunk: an char array
4760 * @size: the size in byte of the chunk
4761 * @terminate: last chunk indicator
4762 *
4763 * Parse a Chunk of memory
4764 *
4765 * Returns zero if no error, the xmlParserErrors otherwise.
4766 */
4767int
4768htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4769 int terminate) {
4770 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4771 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4772 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4773 int cur = ctxt->input->cur - ctxt->input->base;
4774
4775 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4776 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4777 ctxt->input->cur = ctxt->input->base + cur;
4778#ifdef DEBUG_PUSH
4779 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4780#endif
4781
4782 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4783 htmlParseTryOrFinish(ctxt, terminate);
4784 } else if (ctxt->instate != XML_PARSER_EOF) {
4785 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4786 htmlParseTryOrFinish(ctxt, terminate);
4787 }
4788 if (terminate) {
4789 if ((ctxt->instate != XML_PARSER_EOF) &&
4790 (ctxt->instate != XML_PARSER_EPILOG) &&
4791 (ctxt->instate != XML_PARSER_MISC)) {
4792 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004793 ctxt->wellFormed = 0;
4794 }
4795 if (ctxt->instate != XML_PARSER_EOF) {
4796 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4797 ctxt->sax->endDocument(ctxt->userData);
4798 }
4799 ctxt->instate = XML_PARSER_EOF;
4800 }
4801 return((xmlParserErrors) ctxt->errNo);
4802}
4803
4804/************************************************************************
4805 * *
4806 * User entry points *
4807 * *
4808 ************************************************************************/
4809
4810/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004811 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004812 * @sax: a SAX handler
4813 * @user_data: The user data returned on SAX callbacks
4814 * @chunk: a pointer to an array of chars
4815 * @size: number of chars in the array
4816 * @filename: an optional file name or URI
4817 * @enc: an optional encoding
4818 *
4819 * Create a parser context for using the HTML parser in push mode
4820 * To allow content encoding detection, @size should be >= 4
4821 * The value of @filename is used for fetching external entities
4822 * and error/warning reports.
4823 *
4824 * Returns the new parser context or NULL
4825 */
4826htmlParserCtxtPtr
4827htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4828 const char *chunk, int size, const char *filename,
4829 xmlCharEncoding enc) {
4830 htmlParserCtxtPtr ctxt;
4831 htmlParserInputPtr inputStream;
4832 xmlParserInputBufferPtr buf;
4833
Daniel Veillardd0463562001-10-13 09:15:48 +00004834 xmlInitParser();
4835
Owen Taylor3473f882001-02-23 17:55:21 +00004836 buf = xmlAllocParserInputBuffer(enc);
4837 if (buf == NULL) return(NULL);
4838
4839 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4840 if (ctxt == NULL) {
4841 xmlFree(buf);
4842 return(NULL);
4843 }
4844 memset(ctxt, 0, sizeof(htmlParserCtxt));
4845 htmlInitParserCtxt(ctxt);
4846 if (sax != NULL) {
4847 if (ctxt->sax != &htmlDefaultSAXHandler)
4848 xmlFree(ctxt->sax);
4849 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4850 if (ctxt->sax == NULL) {
4851 xmlFree(buf);
4852 xmlFree(ctxt);
4853 return(NULL);
4854 }
4855 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4856 if (user_data != NULL)
4857 ctxt->userData = user_data;
4858 }
4859 if (filename == NULL) {
4860 ctxt->directory = NULL;
4861 } else {
4862 ctxt->directory = xmlParserGetDirectory(filename);
4863 }
4864
4865 inputStream = htmlNewInputStream(ctxt);
4866 if (inputStream == NULL) {
4867 xmlFreeParserCtxt(ctxt);
4868 return(NULL);
4869 }
4870
4871 if (filename == NULL)
4872 inputStream->filename = NULL;
4873 else
4874 inputStream->filename = xmlMemStrdup(filename);
4875 inputStream->buf = buf;
4876 inputStream->base = inputStream->buf->buffer->content;
4877 inputStream->cur = inputStream->buf->buffer->content;
4878
4879 inputPush(ctxt, inputStream);
4880
4881 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4882 (ctxt->input->buf != NULL)) {
4883 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4884#ifdef DEBUG_PUSH
4885 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4886#endif
4887 }
4888
4889 return(ctxt);
4890}
4891
4892/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004893 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00004894 * @cur: a pointer to an array of xmlChar
4895 * @encoding: a free form C string describing the HTML document encoding, or NULL
4896 * @sax: the SAX handler block
4897 * @userData: if using SAX, this pointer will be provided on callbacks.
4898 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004899 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4900 * to handle parse events. If sax is NULL, fallback to the default DOM
4901 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004902 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004903 * Returns the resulting document tree unless SAX is NULL or the document is
4904 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004905 */
4906
4907htmlDocPtr
4908htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4909 htmlDocPtr ret;
4910 htmlParserCtxtPtr ctxt;
4911
Daniel Veillardd0463562001-10-13 09:15:48 +00004912 xmlInitParser();
4913
Owen Taylor3473f882001-02-23 17:55:21 +00004914 if (cur == NULL) return(NULL);
4915
4916
4917 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4918 if (ctxt == NULL) return(NULL);
4919 if (sax != NULL) {
4920 ctxt->sax = sax;
4921 ctxt->userData = userData;
4922 }
4923
4924 htmlParseDocument(ctxt);
4925 ret = ctxt->myDoc;
4926 if (sax != NULL) {
4927 ctxt->sax = NULL;
4928 ctxt->userData = NULL;
4929 }
4930 htmlFreeParserCtxt(ctxt);
4931
4932 return(ret);
4933}
4934
4935/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004936 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00004937 * @cur: a pointer to an array of xmlChar
4938 * @encoding: a free form C string describing the HTML document encoding, or NULL
4939 *
4940 * parse an HTML in-memory document and build a tree.
4941 *
4942 * Returns the resulting document tree
4943 */
4944
4945htmlDocPtr
4946htmlParseDoc(xmlChar *cur, const char *encoding) {
4947 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4948}
4949
4950
4951/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004952 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004953 * @filename: the filename
4954 * @encoding: a free form C string describing the HTML document encoding, or NULL
4955 *
4956 * Create a parser context for a file content.
4957 * Automatic support for ZLIB/Compress compressed document is provided
4958 * by default if found at compile-time.
4959 *
4960 * Returns the new parser context or NULL
4961 */
4962htmlParserCtxtPtr
4963htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4964{
4965 htmlParserCtxtPtr ctxt;
4966 htmlParserInputPtr inputStream;
4967 xmlParserInputBufferPtr buf;
4968 /* htmlCharEncoding enc; */
4969 xmlChar *content, *content_line = (xmlChar *) "charset=";
4970
4971 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4972 if (buf == NULL) return(NULL);
4973
4974 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4975 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00004976 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004977 return(NULL);
4978 }
4979 memset(ctxt, 0, sizeof(htmlParserCtxt));
4980 htmlInitParserCtxt(ctxt);
4981 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4982 if (inputStream == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00004983 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004984 xmlFree(ctxt);
4985 return(NULL);
4986 }
4987 memset(inputStream, 0, sizeof(htmlParserInput));
4988
Daniel Veillarda646cfd2002-09-17 21:50:03 +00004989 inputStream->filename = (char *)
4990 xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor3473f882001-02-23 17:55:21 +00004991 inputStream->line = 1;
4992 inputStream->col = 1;
4993 inputStream->buf = buf;
4994 inputStream->directory = NULL;
4995
4996 inputStream->base = inputStream->buf->buffer->content;
4997 inputStream->cur = inputStream->buf->buffer->content;
4998 inputStream->free = NULL;
4999
5000 inputPush(ctxt, inputStream);
5001
5002 /* set encoding */
5003 if (encoding) {
5004 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
5005 if (content) {
5006 strcpy ((char *)content, (char *)content_line);
5007 strcat ((char *)content, (char *)encoding);
5008 htmlCheckEncoding (ctxt, content);
5009 xmlFree (content);
5010 }
5011 }
5012
5013 return(ctxt);
5014}
5015
5016/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005017 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005018 * @filename: the filename
5019 * @encoding: a free form C string describing the HTML document encoding, or NULL
5020 * @sax: the SAX handler block
5021 * @userData: if using SAX, this pointer will be provided on callbacks.
5022 *
5023 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5024 * compressed document is provided by default if found at compile-time.
5025 * It use the given SAX function block to handle the parsing callback.
5026 * If sax is NULL, fallback to the default DOM tree building routines.
5027 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005028 * Returns the resulting document tree unless SAX is NULL or the document is
5029 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005030 */
5031
5032htmlDocPtr
5033htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5034 void *userData) {
5035 htmlDocPtr ret;
5036 htmlParserCtxtPtr ctxt;
5037 htmlSAXHandlerPtr oldsax = NULL;
5038
Daniel Veillardd0463562001-10-13 09:15:48 +00005039 xmlInitParser();
5040
Owen Taylor3473f882001-02-23 17:55:21 +00005041 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5042 if (ctxt == NULL) return(NULL);
5043 if (sax != NULL) {
5044 oldsax = ctxt->sax;
5045 ctxt->sax = sax;
5046 ctxt->userData = userData;
5047 }
5048
5049 htmlParseDocument(ctxt);
5050
5051 ret = ctxt->myDoc;
5052 if (sax != NULL) {
5053 ctxt->sax = oldsax;
5054 ctxt->userData = NULL;
5055 }
5056 htmlFreeParserCtxt(ctxt);
5057
5058 return(ret);
5059}
5060
5061/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005062 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005063 * @filename: the filename
5064 * @encoding: a free form C string describing the HTML document encoding, or NULL
5065 *
5066 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5067 * compressed document is provided by default if found at compile-time.
5068 *
5069 * Returns the resulting document tree
5070 */
5071
5072htmlDocPtr
5073htmlParseFile(const char *filename, const char *encoding) {
5074 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5075}
5076
5077/**
5078 * htmlHandleOmittedElem:
5079 * @val: int 0 or 1
5080 *
5081 * Set and return the previous value for handling HTML omitted tags.
5082 *
5083 * Returns the last value for 0 for no handling, 1 for auto insertion.
5084 */
5085
5086int
5087htmlHandleOmittedElem(int val) {
5088 int old = htmlOmittedDefaultValue;
5089
5090 htmlOmittedDefaultValue = val;
5091 return(old);
5092}
5093
5094#endif /* LIBXML_HTML_ENABLED */