blob: 186ab09bb183180549dc4be1359f13200cce17a1 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
46#define HTML_MAX_NAMELEN 1000
47#define HTML_PARSER_BIG_BUFFER_SIZE 1000
48#define HTML_PARSER_BUFFER_SIZE 100
49
50/* #define DEBUG */
51/* #define DEBUG_PUSH */
52
Daniel Veillard22090732001-07-16 00:06:07 +000053static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000054
Daniel Veillard56a4cb82001-03-24 17:00:36 +000055xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000057static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
59/************************************************************************
60 * *
Owen Taylor3473f882001-02-23 17:55:21 +000061 * Parser stacks related functions and macros *
62 * *
63 ************************************************************************/
64
Daniel Veillard1c732d22002-11-30 11:22:59 +000065/**
66 * htmlnamePush:
67 * @ctxt: an HTML parser context
68 * @value: the element name
69 *
70 * Pushes a new element name on top of the name stack
71 *
72 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000073 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000074static int
75htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
76{
77 if (ctxt->nameNr >= ctxt->nameMax) {
78 ctxt->nameMax *= 2;
79 ctxt->nameTab =
80 (xmlChar * *)xmlRealloc(ctxt->nameTab,
81 ctxt->nameMax *
82 sizeof(ctxt->nameTab[0]));
83 if (ctxt->nameTab == NULL) {
84 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
85 return (0);
86 }
87 }
88 ctxt->nameTab[ctxt->nameNr] = value;
89 ctxt->name = value;
90 return (ctxt->nameNr++);
91}
92/**
93 * htmlnamePop:
94 * @ctxt: an HTML parser context
95 *
96 * Pops the top element name from the name stack
97 *
98 * Returns the name just removed
99 */
100static xmlChar *
101htmlnamePop(htmlParserCtxtPtr ctxt)
102{
103 xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000104
Daniel Veillard1c732d22002-11-30 11:22:59 +0000105 if (ctxt->nameNr <= 0)
106 return (0);
107 ctxt->nameNr--;
108 if (ctxt->nameNr < 0)
109 return (0);
110 if (ctxt->nameNr > 0)
111 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
112 else
113 ctxt->name = NULL;
114 ret = ctxt->nameTab[ctxt->nameNr];
115 ctxt->nameTab[ctxt->nameNr] = 0;
116 return (ret);
117}
Owen Taylor3473f882001-02-23 17:55:21 +0000118
/*
 * Macros for accessing the content. These should be used only by the
 * parser, and must not be exported. They all expect a variable named
 * `ctxt` (the parser context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context
 * to use them:
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if
 *           compiled in ISO-Latin or UTF-8, and the current 16 bit value
 *           if compiled in UNICODE mode. This should be used internally
 *           by the parser only to compare to ASCII values, otherwise it
 *           would break when running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be
 *           used only to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same
 *           as CUR, it should be used only to compare on ASCII based
 *           substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII
 *           defined strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding:
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * Macros that expand to more than one operation (SKIP, NEXT) are fully
 * parenthesized so that they behave as a single expression wherever
 * they are used.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
174
175
/*
 * NEXTL(l): step over the current character, which occupies l bytes in
 * the input buffer. Keeps the line/column counters of the input up to
 * date, clears any pending token and counts one more character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/*
 * CUR_CHAR(l): current character of the parser input, its byte length
 * is stored in l.
 * CUR_SCHAR(s, l): same, reading from the string s.
 */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * COPY_BUF(l,b,i,v): append the character v, whose encoded length is l
 * bytes, to buffer b at index i and advance i accordingly.
 * Wrapped in do { } while (0) so that the if/else inside the macro can
 * never capture (or be captured by) an `else` at the call site.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
195
196/**
197 * htmlCurrentChar:
198 * @ctxt: the HTML parser context
199 * @len: pointer to the length of the char read
200 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000201 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000202 * bytes in the input buffer. Implement the end of line normalization:
203 * 2.11 End-of-Line Handling
204 * If the encoding is unspecified, in the case we find an ISO-Latin-1
205 * char, then the encoding converter is plugged in automatically.
206 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000207 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000208 */
209
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000210static int
Owen Taylor3473f882001-02-23 17:55:21 +0000211htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
212 if (ctxt->instate == XML_PARSER_EOF)
213 return(0);
214
215 if (ctxt->token != 0) {
216 *len = 0;
217 return(ctxt->token);
218 }
219 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
220 /*
221 * We are supposed to handle UTF8, check it's valid
222 * From rfc2044: encoding of the Unicode values on UTF-8:
223 *
224 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
225 * 0000 0000-0000 007F 0xxxxxxx
226 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
227 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
228 *
229 * Check for the 0x110000 limit too
230 */
231 const unsigned char *cur = ctxt->input->cur;
232 unsigned char c;
233 unsigned int val;
234
235 c = *cur;
236 if (c & 0x80) {
237 if (cur[1] == 0)
238 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
239 if ((cur[1] & 0xc0) != 0x80)
240 goto encoding_error;
241 if ((c & 0xe0) == 0xe0) {
242
243 if (cur[2] == 0)
244 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
245 if ((cur[2] & 0xc0) != 0x80)
246 goto encoding_error;
247 if ((c & 0xf0) == 0xf0) {
248 if (cur[3] == 0)
249 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
250 if (((c & 0xf8) != 0xf0) ||
251 ((cur[3] & 0xc0) != 0x80))
252 goto encoding_error;
253 /* 4-byte code */
254 *len = 4;
255 val = (cur[0] & 0x7) << 18;
256 val |= (cur[1] & 0x3f) << 12;
257 val |= (cur[2] & 0x3f) << 6;
258 val |= cur[3] & 0x3f;
259 } else {
260 /* 3-byte code */
261 *len = 3;
262 val = (cur[0] & 0xf) << 12;
263 val |= (cur[1] & 0x3f) << 6;
264 val |= cur[2] & 0x3f;
265 }
266 } else {
267 /* 2-byte code */
268 *len = 2;
269 val = (cur[0] & 0x1f) << 6;
270 val |= cur[1] & 0x3f;
271 }
272 if (!IS_CHAR(val)) {
273 ctxt->errNo = XML_ERR_INVALID_ENCODING;
274 if ((ctxt->sax != NULL) &&
275 (ctxt->sax->error != NULL))
276 ctxt->sax->error(ctxt->userData,
277 "Char 0x%X out of allowed range\n", val);
278 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000279 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000280 }
281 return(val);
282 } else {
283 /* 1-byte code */
284 *len = 1;
285 return((int) *ctxt->input->cur);
286 }
287 }
288 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000289 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000290 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000291 * XML constructs only use < 128 chars
292 */
293 *len = 1;
294 if ((int) *ctxt->input->cur < 0x80)
295 return((int) *ctxt->input->cur);
296
297 /*
298 * Humm this is bad, do an automatic flow conversion
299 */
300 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
301 ctxt->charset = XML_CHAR_ENCODING_UTF8;
302 return(xmlCurrentChar(ctxt, len));
303
304encoding_error:
305 /*
306 * If we detect an UTF8 error that probably mean that the
307 * input encoding didn't get properly advertized in the
308 * declaration header. Report the error and switch the encoding
309 * to ISO-Latin-1 (if you don't like this policy, just declare the
310 * encoding !)
311 */
312 ctxt->errNo = XML_ERR_INVALID_ENCODING;
313 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
314 ctxt->sax->error(ctxt->userData,
315 "Input is not proper UTF-8, indicate encoding !\n");
316 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
317 ctxt->input->cur[0], ctxt->input->cur[1],
318 ctxt->input->cur[2], ctxt->input->cur[3]);
319 }
320
321 ctxt->charset = XML_CHAR_ENCODING_8859_1;
322 *len = 1;
323 return((int) *ctxt->input->cur);
324}
325
326/**
Owen Taylor3473f882001-02-23 17:55:21 +0000327 * htmlSkipBlankChars:
328 * @ctxt: the HTML parser context
329 *
330 * skip all blanks character found at that point in the input streams.
331 *
332 * Returns the number of space chars skipped
333 */
334
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000335static int
Owen Taylor3473f882001-02-23 17:55:21 +0000336htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
337 int res = 0;
338
339 while (IS_BLANK(*(ctxt->input->cur))) {
340 if ((*ctxt->input->cur == 0) &&
341 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
342 xmlPopInput(ctxt);
343 } else {
344 if (*(ctxt->input->cur) == '\n') {
345 ctxt->input->line++; ctxt->input->col = 1;
346 } else ctxt->input->col++;
347 ctxt->input->cur++;
348 ctxt->nbChars++;
349 if (*ctxt->input->cur == 0)
350 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
351 }
352 res++;
353 }
354 return(res);
355}
356
357
358
359/************************************************************************
360 * *
361 * The list of HTML elements and their properties *
362 * *
363 ************************************************************************/
364
365/*
366 * Start Tag: 1 means the start tag can be ommited
367 * End Tag: 1 means the end tag can be ommited
368 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000369 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000370 * Depr: this element is deprecated
371 * DTD: 1 means that this element is valid only in the Loose DTD
372 * 2 means that this element is valid only in the Frameset DTD
373 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000374 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000375 */
Daniel Veillard22090732001-07-16 00:06:07 +0000376static const htmlElemDesc
377html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000378{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
379{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
380{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
381{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
382{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
383{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
384{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
385{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
386{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
387{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
388{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
389{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
390{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
391{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
392{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
393{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
394{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
395{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
396{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
397{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
398{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
399{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
400{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
401{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
402{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
403{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
404{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
405{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
406{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
407{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
408{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
409{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
410{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
411{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
412{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
413{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
414{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
415{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
416{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
417{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
418{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
419{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
420{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
421{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
422{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
423{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
424{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
425{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
426{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
427{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
428{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
429{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
430{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
431{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
432{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
433{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
434{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
435{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
436{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
437{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
438{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
439{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
440{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
Daniel Veillardfee408f2002-11-22 13:18:30 +0000441{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph " },
Daniel Veillard02bb1702001-06-13 21:11:59 +0000442{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
443{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
444{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
445{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
446{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
447{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
448{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
449{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
450{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
451{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
452{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
453{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
454{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
455{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
456{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
457{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
458{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
459{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
460{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
461{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
462{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
463{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
464{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
465{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
466{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
467{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
468{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000469};
470
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups, one group per row
 * below. The first string of a group is the start tag being opened, the
 * following strings are the elements which that start tag closes
 * implicitly. An extra NULL marks the end of the whole list.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul", "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp", "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp", "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing", "xmp", "a", NULL,
    NULL
};
530
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
544
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * The array has no terminating NULL: its length must be derived
 * with sizeof.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", which matches no attribute */
    "onchange",
    "onselect"
};
570
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end-tag priority) association. */
typedef struct {
    const char *name;		/* element name, NULL in the last entry */
    int priority;		/* priority of the end tag of that element */
} elementPriority;

/*
 * The list ends with a NULL-named entry whose priority is the one
 * applied to every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }		/* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000598
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group of that array; unused slots are NULL.
 */
static const char **htmlStartCloseIndex[100];
/* non-zero once htmlStartCloseIndex has been filled in */
static int htmlStartCloseIndexinitialized = 0;
601
602/************************************************************************
603 * *
604 * functions to handle HTML specific data *
605 * *
606 ************************************************************************/
607
608/**
609 * htmlInitAutoClose:
610 *
611 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
612 * This is not reentrant. Call xmlInitParser() once before processing in
613 * case of use in multithreaded programs.
614 */
615void
616htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000617 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000618
619 if (htmlStartCloseIndexinitialized) return;
620
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000621 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
622 indx = 0;
623 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
624 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000625 while (htmlStartClose[i] != NULL) i++;
626 i++;
627 }
628 htmlStartCloseIndexinitialized = 1;
629}
630
631/**
632 * htmlTagLookup:
633 * @tag: The tag name in lowercase
634 *
635 * Lookup the HTML tag in the ElementTable
636 *
637 * Returns the related htmlElemDescPtr or NULL if not found.
638 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000639const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000640htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000641 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000642
643 for (i = 0; i < (sizeof(html40ElementTable) /
644 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000645 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000646 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000647 }
648 return(NULL);
649}
650
651/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000652 * htmlGetEndPriority:
653 * @name: The name of the element to look up the priority for.
654 *
655 * Return value: The "endtag" priority.
656 **/
657static int
658htmlGetEndPriority (const xmlChar *name) {
659 int i = 0;
660
661 while ((htmlEndPriority[i].name != NULL) &&
662 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
663 i++;
664
665 return(htmlEndPriority[i].priority);
666}
667
668/**
Owen Taylor3473f882001-02-23 17:55:21 +0000669 * htmlCheckAutoClose:
670 * @newtag: The new tag name
671 * @oldtag: The old tag name
672 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000673 * Checks whether the new tag is one of the registered valid tags for
674 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000675 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
676 *
677 * Returns 0 if no, 1 if yes.
678 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000679static int
Owen Taylor3473f882001-02-23 17:55:21 +0000680htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000681 int i, indx;
682 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000683
684 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
685
686 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000687 for (indx = 0; indx < 100;indx++) {
688 closed = htmlStartCloseIndex[indx];
689 if (closed == NULL) return(0);
690 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000691 }
692
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000693 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000694 i++;
695 while (htmlStartClose[i] != NULL) {
696 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
697 return(1);
698 }
699 i++;
700 }
701 return(0);
702}
703
704/**
705 * htmlAutoCloseOnClose:
706 * @ctxt: an HTML parser context
707 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000708 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000709 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000710 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000711 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000712static void
Owen Taylor3473f882001-02-23 17:55:21 +0000713htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000714 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000715 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000716 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000717
718#ifdef DEBUG
719 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
720 for (i = 0;i < ctxt->nameNr;i++)
721 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
722#endif
723
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000724 priority = htmlGetEndPriority (newtag);
725
Owen Taylor3473f882001-02-23 17:55:21 +0000726 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000727
Owen Taylor3473f882001-02-23 17:55:21 +0000728 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000729 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000730 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000731 * or equal priority, so if we find an element with higher
732 * priority before we find an element with
733 * matching name, we just ignore this endtag
734 */
735 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000736 }
737 if (i < 0) return;
738
739 while (!xmlStrEqual(newtag, ctxt->name)) {
740 info = htmlTagLookup(ctxt->name);
741 if ((info == NULL) || (info->endTag == 1)) {
742#ifdef DEBUG
743 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
744#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000745 } else if (info->endTag == 3) {
746#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000747 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000748
Daniel Veillard56098d42001-04-24 12:51:09 +0000749#endif
750 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
751 ctxt->sax->error(ctxt->userData,
752 "Opening and ending tag mismatch: %s and %s\n",
753 newtag, ctxt->name);
754 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000755 }
756 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
757 ctxt->sax->endElement(ctxt->userData, ctxt->name);
758 oldname = htmlnamePop(ctxt);
759 if (oldname != NULL) {
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
762#endif
763 xmlFree(oldname);
764 }
765 }
766}
767
768/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000769 * htmlAutoCloseOnEnd:
770 * @ctxt: an HTML parser context
771 *
772 * Close all remaining tags at the end of the stream
773 */
774static void
775htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
776 xmlChar *oldname;
777 int i;
778
779 if (ctxt->nameNr == 0)
780 return;
781#ifdef DEBUG
782 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
783#endif
784
785 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
786#ifdef DEBUG
787 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
788#endif
789 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
790 ctxt->sax->endElement(ctxt->userData, ctxt->name);
791 oldname = htmlnamePop(ctxt);
792 if (oldname != NULL) {
793#ifdef DEBUG
794 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
795#endif
796 xmlFree(oldname);
797 }
798 }
799}
800
801/**
Owen Taylor3473f882001-02-23 17:55:21 +0000802 * htmlAutoClose:
803 * @ctxt: an HTML parser context
804 * @newtag: The new tag name or NULL
805 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000806 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000807 * The list is kept in htmlStartClose array. This function is
808 * called when a new tag has been detected and generates the
809 * appropriates closes if possible/needed.
810 * If newtag is NULL this mean we are at the end of the resource
811 * and we should check
812 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000813static void
Owen Taylor3473f882001-02-23 17:55:21 +0000814htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
815 xmlChar *oldname;
816 while ((newtag != NULL) && (ctxt->name != NULL) &&
817 (htmlCheckAutoClose(newtag, ctxt->name))) {
818#ifdef DEBUG
819 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
820#endif
821 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
822 ctxt->sax->endElement(ctxt->userData, ctxt->name);
823 oldname = htmlnamePop(ctxt);
824 if (oldname != NULL) {
825#ifdef DEBUG
826 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
827#endif
828 xmlFree(oldname);
829 }
830 }
831 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000832 htmlAutoCloseOnEnd(ctxt);
833 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000834 }
835 while ((newtag == NULL) && (ctxt->name != NULL) &&
836 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
837 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
838 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
839#ifdef DEBUG
840 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
841#endif
842 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
843 ctxt->sax->endElement(ctxt->userData, ctxt->name);
844 oldname = htmlnamePop(ctxt);
845 if (oldname != NULL) {
846#ifdef DEBUG
847 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
848#endif
849 xmlFree(oldname);
850 }
851 }
852
853}
854
855/**
856 * htmlAutoCloseTag:
857 * @doc: the HTML document
858 * @name: The tag name
859 * @elem: the HTML element
860 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000861 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000862 * The list is kept in htmlStartClose array. This function checks
863 * if the element or one of it's children would autoclose the
864 * given tag.
865 *
866 * Returns 1 if autoclose, 0 otherwise
867 */
868int
869htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
870 htmlNodePtr child;
871
872 if (elem == NULL) return(1);
873 if (xmlStrEqual(name, elem->name)) return(0);
874 if (htmlCheckAutoClose(elem->name, name)) return(1);
875 child = elem->children;
876 while (child != NULL) {
877 if (htmlAutoCloseTag(doc, name, child)) return(1);
878 child = child->next;
879 }
880 return(0);
881}
882
883/**
884 * htmlIsAutoClosed:
885 * @doc: the HTML document
886 * @elem: the HTML element
887 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000888 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000889 * The list is kept in htmlStartClose array. This function checks
890 * if a tag is autoclosed by one of it's child
891 *
892 * Returns 1 if autoclosed, 0 otherwise
893 */
894int
895htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
896 htmlNodePtr child;
897
898 if (elem == NULL) return(1);
899 child = elem->children;
900 while (child != NULL) {
901 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
902 child = child->next;
903 }
904 return(0);
905}
906
907/**
908 * htmlCheckImplied:
909 * @ctxt: an HTML parser context
910 * @newtag: The new tag name
911 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000912 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000913 * called when a new tag has been detected and generates the
914 * appropriates implicit tags if missing
915 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000916static void
Owen Taylor3473f882001-02-23 17:55:21 +0000917htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
918 if (!htmlOmittedDefaultValue)
919 return;
920 if (xmlStrEqual(newtag, BAD_CAST"html"))
921 return;
922 if (ctxt->nameNr <= 0) {
923#ifdef DEBUG
924 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
925#endif
926 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
927 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
928 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
929 }
930 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
931 return;
932 if ((ctxt->nameNr <= 1) &&
933 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
934 (xmlStrEqual(newtag, BAD_CAST"style")) ||
935 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
936 (xmlStrEqual(newtag, BAD_CAST"link")) ||
937 (xmlStrEqual(newtag, BAD_CAST"title")) ||
938 (xmlStrEqual(newtag, BAD_CAST"base")))) {
939 /*
940 * dropped OBJECT ... i you put it first BODY will be
941 * assumed !
942 */
943#ifdef DEBUG
944 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
945#endif
946 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
947 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
948 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
949 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
950 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
951 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
952 int i;
953 for (i = 0;i < ctxt->nameNr;i++) {
954 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
955 return;
956 }
957 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
958 return;
959 }
960 }
961
962#ifdef DEBUG
963 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
964#endif
965 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
966 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
967 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
968 }
969}
970
971/**
972 * htmlCheckParagraph
973 * @ctxt: an HTML parser context
974 *
975 * Check whether a p element need to be implied before inserting
976 * characters in the current element.
977 *
978 * Returns 1 if a paragraph has been inserted, 0 if not and -1
979 * in case of error.
980 */
981
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000982static int
Owen Taylor3473f882001-02-23 17:55:21 +0000983htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
984 const xmlChar *tag;
985 int i;
986
987 if (ctxt == NULL)
988 return(-1);
989 tag = ctxt->name;
990 if (tag == NULL) {
991 htmlAutoClose(ctxt, BAD_CAST"p");
992 htmlCheckImplied(ctxt, BAD_CAST"p");
993 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
994 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
995 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
996 return(1);
997 }
998 if (!htmlOmittedDefaultValue)
999 return(0);
1000 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1001 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1002#ifdef DEBUG
1003 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1004#endif
1005 htmlAutoClose(ctxt, BAD_CAST"p");
1006 htmlCheckImplied(ctxt, BAD_CAST"p");
1007 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1008 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1009 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1010 return(1);
1011 }
1012 }
1013 return(0);
1014}
1015
1016/**
1017 * htmlIsScriptAttribute:
1018 * @name: an attribute name
1019 *
1020 * Check if an attribute is of content type Script
1021 *
1022 * Returns 1 is the attribute is a script 0 otherwise
1023 */
1024int
1025htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001026 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001027
1028 if (name == NULL)
1029 return(0);
1030 /*
1031 * all script attributes start with 'on'
1032 */
1033 if ((name[0] != 'o') || (name[1] != 'n'))
1034 return(0);
1035 for (i = 0;
1036 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1037 i++) {
1038 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1039 return(1);
1040 }
1041 return(0);
1042}
1043
1044/************************************************************************
1045 * *
1046 * The list of HTML predefined entities *
1047 * *
1048 ************************************************************************/
1049
1050
/*
 * Table of the predefined HTML entities.  Each entry gives the numeric
 * value of the entity, its name, and a textual description.
 *
 * The entries MUST be kept sorted by increasing value:
 * htmlEntityValueLookup() stops scanning as soon as it meets an entry
 * whose value is greater than the one searched for.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1333
1334/************************************************************************
1335 * *
1336 * Commodity functions to handle entities *
1337 * *
1338 ************************************************************************/
1339
/*
 * Macro used to grow the current buffer.
 *
 * It expects two variables in the calling scope: the xmlChar pointer
 * passed as @buffer and an integer named <buffer>_size, which is doubled
 * before reallocating.
 *
 * On allocation failure the old block is released (the result of
 * xmlRealloc is first stored in a temporary so the original pointer is
 * not lost), an error is reported, and the ENCLOSING FUNCTION returns
 * NULL; the macro can therefore only be used in functions returning a
 * pointer.
 *
 * The expansion is deliberately kept as a plain brace block, as before,
 * so that existing call sites keep compiling unchanged.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
			    buffer##_size * sizeof(xmlChar));		\
    if (growBuffer_tmp == NULL) {					\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1351
1352/**
1353 * htmlEntityLookup:
1354 * @name: the entity name
1355 *
1356 * Lookup the given entity in EntitiesTable
1357 *
1358 * TODO: the linear scan is really ugly, an hash table is really needed.
1359 *
1360 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1361 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001362const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001363htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001364 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001365
1366 for (i = 0;i < (sizeof(html40EntitiesTable)/
1367 sizeof(html40EntitiesTable[0]));i++) {
1368 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1369#ifdef DEBUG
1370 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1371#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001372 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001373 }
1374 }
1375 return(NULL);
1376}
1377
1378/**
1379 * htmlEntityValueLookup:
1380 * @value: the entity's unicode value
1381 *
1382 * Lookup the given entity in EntitiesTable
1383 *
1384 * TODO: the linear scan is really ugly, an hash table is really needed.
1385 *
1386 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1387 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001388const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001389htmlEntityValueLookup(unsigned int value) {
1390 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001391#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001392 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001393#endif
1394
1395 for (i = 0;i < (sizeof(html40EntitiesTable)/
1396 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001397 if (html40EntitiesTable[i].value >= value) {
1398 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001399 break;
1400#ifdef DEBUG
1401 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1402#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001403 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001404 }
1405#ifdef DEBUG
1406 if (lv > html40EntitiesTable[i].value) {
1407 xmlGenericError(xmlGenericErrorContext,
1408 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1409 lv, html40EntitiesTable[i].value);
1410 }
1411 lv = html40EntitiesTable[i].value;
1412#endif
1413 }
1414 return(NULL);
1415}
1416
1417/**
1418 * UTF8ToHtml:
1419 * @out: a pointer to an array of bytes to store the result
1420 * @outlen: the length of @out
1421 * @in: a pointer to an array of UTF-8 chars
1422 * @inlen: the length of @in
1423 *
1424 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1425 * plus HTML entities block of chars out.
1426 *
1427 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1428 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001429 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001430 * The value of @outlen after return is the number of octets consumed.
1431 */
1432int
1433UTF8ToHtml(unsigned char* out, int *outlen,
1434 const unsigned char* in, int *inlen) {
1435 const unsigned char* processed = in;
1436 const unsigned char* outend;
1437 const unsigned char* outstart = out;
1438 const unsigned char* instart = in;
1439 const unsigned char* inend;
1440 unsigned int c, d;
1441 int trailing;
1442
1443 if (in == NULL) {
1444 /*
1445 * initialization nothing to do
1446 */
1447 *outlen = 0;
1448 *inlen = 0;
1449 return(0);
1450 }
1451 inend = in + (*inlen);
1452 outend = out + (*outlen);
1453 while (in < inend) {
1454 d = *in++;
1455 if (d < 0x80) { c= d; trailing= 0; }
1456 else if (d < 0xC0) {
1457 /* trailing byte in leading position */
1458 *outlen = out - outstart;
1459 *inlen = processed - instart;
1460 return(-2);
1461 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1462 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1463 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1464 else {
1465 /* no chance for this in Ascii */
1466 *outlen = out - outstart;
1467 *inlen = processed - instart;
1468 return(-2);
1469 }
1470
1471 if (inend - in < trailing) {
1472 break;
1473 }
1474
1475 for ( ; trailing; trailing--) {
1476 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1477 break;
1478 c <<= 6;
1479 c |= d & 0x3F;
1480 }
1481
1482 /* assertion: c is a single UTF-4 value */
1483 if (c < 0x80) {
1484 if (out + 1 >= outend)
1485 break;
1486 *out++ = c;
1487 } else {
1488 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001489 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001490
1491 /*
1492 * Try to lookup a predefined HTML entity for it
1493 */
1494
1495 ent = htmlEntityValueLookup(c);
1496 if (ent == NULL) {
1497 /* no chance for this in Ascii */
1498 *outlen = out - outstart;
1499 *inlen = processed - instart;
1500 return(-2);
1501 }
1502 len = strlen(ent->name);
1503 if (out + 2 + len >= outend)
1504 break;
1505 *out++ = '&';
1506 memcpy(out, ent->name, len);
1507 out += len;
1508 *out++ = ';';
1509 }
1510 processed = in;
1511 }
1512 *outlen = out - outstart;
1513 *inlen = processed - instart;
1514 return(0);
1515}
1516
1517/**
1518 * htmlEncodeEntities:
1519 * @out: a pointer to an array of bytes to store the result
1520 * @outlen: the length of @out
1521 * @in: a pointer to an array of UTF-8 chars
1522 * @inlen: the length of @in
1523 * @quoteChar: the quote character to escape (' or ") or zero.
1524 *
1525 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1526 * plus HTML entities block of chars out.
1527 *
1528 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1529 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001530 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001531 * The value of @outlen after return is the number of octets consumed.
1532 */
1533int
1534htmlEncodeEntities(unsigned char* out, int *outlen,
1535 const unsigned char* in, int *inlen, int quoteChar) {
1536 const unsigned char* processed = in;
1537 const unsigned char* outend = out + (*outlen);
1538 const unsigned char* outstart = out;
1539 const unsigned char* instart = in;
1540 const unsigned char* inend = in + (*inlen);
1541 unsigned int c, d;
1542 int trailing;
1543
1544 while (in < inend) {
1545 d = *in++;
1546 if (d < 0x80) { c= d; trailing= 0; }
1547 else if (d < 0xC0) {
1548 /* trailing byte in leading position */
1549 *outlen = out - outstart;
1550 *inlen = processed - instart;
1551 return(-2);
1552 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1553 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1554 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1555 else {
1556 /* no chance for this in Ascii */
1557 *outlen = out - outstart;
1558 *inlen = processed - instart;
1559 return(-2);
1560 }
1561
1562 if (inend - in < trailing)
1563 break;
1564
1565 while (trailing--) {
1566 if (((d= *in++) & 0xC0) != 0x80) {
1567 *outlen = out - outstart;
1568 *inlen = processed - instart;
1569 return(-2);
1570 }
1571 c <<= 6;
1572 c |= d & 0x3F;
1573 }
1574
1575 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001576 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1577 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001578 if (out >= outend)
1579 break;
1580 *out++ = c;
1581 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001582 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001583 const char *cp;
1584 char nbuf[16];
1585 int len;
1586
1587 /*
1588 * Try to lookup a predefined HTML entity for it
1589 */
1590 ent = htmlEntityValueLookup(c);
1591 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001592 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001593 cp = nbuf;
1594 }
1595 else
1596 cp = ent->name;
1597 len = strlen(cp);
1598 if (out + 2 + len > outend)
1599 break;
1600 *out++ = '&';
1601 memcpy(out, cp, len);
1602 out += len;
1603 *out++ = ';';
1604 }
1605 processed = in;
1606 }
1607 *outlen = out - outstart;
1608 *inlen = processed - instart;
1609 return(0);
1610}
1611
1612/**
1613 * htmlDecodeEntities:
1614 * @ctxt: the parser context
1615 * @len: the len to decode (in bytes !), -1 for no size limit
1616 * @end: an end marker xmlChar, 0 if none
1617 * @end2: an end marker xmlChar, 0 if none
1618 * @end3: an end marker xmlChar, 0 if none
1619 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001620 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001621 *
1622 * DEPRECATED !!!!
1623 *
1624 * Returns A newly allocated string with the substitution done. The caller
1625 * must deallocate it !
1626 */
1627xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001628htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1629 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001630 static int deprecated = 0;
1631 if (!deprecated) {
1632 xmlGenericError(xmlGenericErrorContext,
1633 "htmlDecodeEntities() deprecated function reached\n");
1634 deprecated = 1;
1635 }
1636 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001637}
1638
1639/************************************************************************
1640 * *
1641 * Commodity functions to handle streams *
1642 * *
1643 ************************************************************************/
1644
1645/**
Owen Taylor3473f882001-02-23 17:55:21 +00001646 * htmlNewInputStream:
1647 * @ctxt: an HTML parser context
1648 *
1649 * Create a new input stream structure
1650 * Returns the new input stream or NULL
1651 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001652static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001653htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1654 htmlParserInputPtr input;
1655
1656 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1657 if (input == NULL) {
1658 ctxt->errNo = XML_ERR_NO_MEMORY;
1659 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1660 ctxt->sax->error(ctxt->userData,
1661 "malloc: couldn't allocate a new input stream\n");
1662 return(NULL);
1663 }
1664 memset(input, 0, sizeof(htmlParserInput));
1665 input->filename = NULL;
1666 input->directory = NULL;
1667 input->base = NULL;
1668 input->cur = NULL;
1669 input->buf = NULL;
1670 input->line = 1;
1671 input->col = 1;
1672 input->buf = NULL;
1673 input->free = NULL;
1674 input->version = NULL;
1675 input->consumed = 0;
1676 input->length = 0;
1677 return(input);
1678}
1679
1680
1681/************************************************************************
1682 * *
1683 * Commodity functions, cleanup needed ? *
1684 * *
1685 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001700
1701/**
1702 * areBlanks:
1703 * @ctxt: an HTML parser context
1704 * @str: a xmlChar *
1705 * @len: the size of @str
1706 *
1707 * Is this a sequence of blank chars that one can ignore ?
1708 *
1709 * Returns 1 if ignorable 0 otherwise.
1710 */
1711
1712static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001713 unsigned int i;
1714 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001715 xmlNodePtr lastChild;
1716
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001717 for (j = 0;j < len;j++)
1718 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001719
1720 if (CUR == 0) return(1);
1721 if (CUR != '<') return(0);
1722 if (ctxt->name == NULL)
1723 return(1);
1724 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1725 return(1);
1726 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1727 return(1);
1728 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1729 return(1);
1730 if (ctxt->node == NULL) return(0);
1731 lastChild = xmlGetLastChild(ctxt->node);
1732 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001733 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1734 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001735 /* keep ws in constructs like ...<b> </b>...
1736 for all tags "b" allowing PCDATA */
1737 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1738 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1739 return(0);
1740 }
1741 }
Owen Taylor3473f882001-02-23 17:55:21 +00001742 } else if (xmlNodeIsText(lastChild)) {
1743 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001744 } else {
1745 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1746 for all tags "p" allowing PCDATA */
1747 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1748 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1749 return(0);
1750 }
1751 }
Owen Taylor3473f882001-02-23 17:55:21 +00001752 }
1753 return(1);
1754}
1755
1756/**
Owen Taylor3473f882001-02-23 17:55:21 +00001757 * htmlNewDocNoDtD:
1758 * @URI: URI for the dtd, or NULL
1759 * @ExternalID: the external ID of the DTD, or NULL
1760 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001761 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1762 * are NULL
1763 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001764 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001765 */
1766htmlDocPtr
1767htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1768 xmlDocPtr cur;
1769
1770 /*
1771 * Allocate a new document and fill the fields.
1772 */
1773 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1774 if (cur == NULL) {
1775 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001776 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001777 return(NULL);
1778 }
1779 memset(cur, 0, sizeof(xmlDoc));
1780
1781 cur->type = XML_HTML_DOCUMENT_NODE;
1782 cur->version = NULL;
1783 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001784 cur->doc = cur;
1785 cur->name = NULL;
1786 cur->children = NULL;
1787 cur->extSubset = NULL;
1788 cur->oldNs = NULL;
1789 cur->encoding = NULL;
1790 cur->standalone = 1;
1791 cur->compression = 0;
1792 cur->ids = NULL;
1793 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001794 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001795 if ((ExternalID != NULL) ||
1796 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001797 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 return(cur);
1799}
1800
1801/**
1802 * htmlNewDoc:
1803 * @URI: URI for the dtd, or NULL
1804 * @ExternalID: the external ID of the DTD, or NULL
1805 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001806 * Creates a new HTML document
1807 *
Owen Taylor3473f882001-02-23 17:55:21 +00001808 * Returns a new document
1809 */
1810htmlDocPtr
1811htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1812 if ((URI == NULL) && (ExternalID == NULL))
1813 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001814 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1815 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001816
1817 return(htmlNewDocNoDtD(URI, ExternalID));
1818}
1819
1820
1821/************************************************************************
1822 * *
1823 * The parser itself *
1824 * Relates to http://www.w3.org/TR/html40 *
1825 * *
1826 ************************************************************************/
1827
1828/************************************************************************
1829 * *
1830 * The parser itself *
1831 * *
1832 ************************************************************************/
1833
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001834static xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
1835
Owen Taylor3473f882001-02-23 17:55:21 +00001836/**
1837 * htmlParseHTMLName:
1838 * @ctxt: an HTML parser context
1839 *
1840 * parse an HTML tag or attribute name, note that we convert it to lowercase
1841 * since HTML names are not case-sensitive.
1842 *
1843 * Returns the Tag Name parsed or NULL
1844 */
1845
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001846static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001847htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1848 xmlChar *ret = NULL;
1849 int i = 0;
1850 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1851
1852 if (!IS_LETTER(CUR) && (CUR != '_') &&
1853 (CUR != ':')) return(NULL);
1854
1855 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1856 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1857 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1858 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1859 else loc[i] = CUR;
1860 i++;
1861
1862 NEXT;
1863 }
1864
1865 ret = xmlStrndup(loc, i);
1866
1867 return(ret);
1868}
1869
1870/**
1871 * htmlParseName:
1872 * @ctxt: an HTML parser context
1873 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001874 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001875 *
1876 * Returns the Name parsed or NULL
1877 */
1878
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001879static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001880htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001881 const xmlChar *in;
1882 xmlChar *ret;
1883 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001884
1885 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001886
1887 /*
1888 * Accelerator for simple ASCII names
1889 */
1890 in = ctxt->input->cur;
1891 if (((*in >= 0x61) && (*in <= 0x7A)) ||
1892 ((*in >= 0x41) && (*in <= 0x5A)) ||
1893 (*in == '_') || (*in == ':')) {
1894 in++;
1895 while (((*in >= 0x61) && (*in <= 0x7A)) ||
1896 ((*in >= 0x41) && (*in <= 0x5A)) ||
1897 ((*in >= 0x30) && (*in <= 0x39)) ||
1898 (*in == '_') || (*in == '-') ||
1899 (*in == ':') || (*in == '.'))
1900 in++;
1901 if ((*in > 0) && (*in < 0x80)) {
1902 count = in - ctxt->input->cur;
1903 ret = xmlStrndup(ctxt->input->cur, count);
1904 ctxt->input->cur = in;
1905 return(ret);
1906 }
1907 }
1908 return(htmlParseNameComplex(ctxt));
1909}
1910
1911static xmlChar *
1912htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
1913 xmlChar buf[XML_MAX_NAMELEN + 5];
1914 int len = 0, l;
1915 int c;
1916 int count = 0;
1917
1918 /*
1919 * Handler for more complex cases
1920 */
1921 GROW;
1922 c = CUR_CHAR(l);
1923 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
1924 (!IS_LETTER(c) && (c != '_') &&
1925 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00001926 return(NULL);
1927 }
1928
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001929 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
1930 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
1931 (c == '.') || (c == '-') ||
1932 (c == '_') || (c == ':') ||
1933 (IS_COMBINING(c)) ||
1934 (IS_EXTENDER(c)))) {
1935 if (count++ > 100) {
1936 count = 0;
1937 GROW;
1938 }
1939 COPY_BUF(l,buf,len,c);
1940 NEXTL(l);
1941 c = CUR_CHAR(l);
1942 if (len >= XML_MAX_NAMELEN) {
1943 /*
1944 * Okay someone managed to make a huge name, so he's ready to pay
1945 * for the processing speed.
1946 */
1947 xmlChar *buffer;
1948 int max = len * 2;
1949
1950 buffer = (xmlChar *) xmlMalloc(max * sizeof(xmlChar));
1951 if (buffer == NULL) {
1952 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1953 ctxt->sax->error(ctxt->userData,
1954 "htmlParseNameComplex: out of memory\n");
1955 return(NULL);
1956 }
1957 memcpy(buffer, buf, len);
1958 while ((IS_LETTER(c)) || (IS_DIGIT(c)) || /* test bigname.xml */
1959 (c == '.') || (c == '-') ||
1960 (c == '_') || (c == ':') ||
1961 (IS_COMBINING(c)) ||
1962 (IS_EXTENDER(c))) {
1963 if (count++ > 100) {
1964 count = 0;
1965 GROW;
1966 }
1967 if (len + 10 > max) {
1968 max *= 2;
1969 buffer = (xmlChar *) xmlRealloc(buffer,
1970 max * sizeof(xmlChar));
1971 if (buffer == NULL) {
1972 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1973 ctxt->sax->error(ctxt->userData,
1974 "htmlParseNameComplex: out of memory\n");
1975 return(NULL);
1976 }
1977 }
1978 COPY_BUF(l,buffer,len,c);
1979 NEXTL(l);
1980 c = CUR_CHAR(l);
1981 }
1982 buffer[len] = 0;
1983 return(buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00001984 }
1985 }
1986 return(xmlStrndup(buf, len));
1987}
1988
Daniel Veillarde55e8e42003-01-10 12:50:02 +00001989
Owen Taylor3473f882001-02-23 17:55:21 +00001990/**
1991 * htmlParseHTMLAttribute:
1992 * @ctxt: an HTML parser context
1993 * @stop: a char stop value
1994 *
1995 * parse an HTML attribute value till the stop (quote), if
1996 * stop is 0 then it stops at the first space
1997 *
1998 * Returns the attribute parsed or NULL
1999 */
2000
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002001static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002002htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2003 xmlChar *buffer = NULL;
2004 int buffer_size = 0;
2005 xmlChar *out = NULL;
2006 xmlChar *name = NULL;
2007
2008 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002009 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002010
2011 /*
2012 * allocate a translation buffer.
2013 */
2014 buffer_size = HTML_PARSER_BUFFER_SIZE;
2015 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2016 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00002017 xmlGenericError(xmlGenericErrorContext,
2018 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002019 return(NULL);
2020 }
2021 out = buffer;
2022
2023 /*
2024 * Ok loop until we reach one of the ending chars
2025 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002026 while ((CUR != 0) && (CUR != stop)) {
2027 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002028 if ((stop == 0) && (IS_BLANK(CUR))) break;
2029 if (CUR == '&') {
2030 if (NXT(1) == '#') {
2031 unsigned int c;
2032 int bits;
2033
2034 c = htmlParseCharRef(ctxt);
2035 if (c < 0x80)
2036 { *out++ = c; bits= -6; }
2037 else if (c < 0x800)
2038 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2039 else if (c < 0x10000)
2040 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2041 else
2042 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2043
2044 for ( ; bits >= 0; bits-= 6) {
2045 *out++ = ((c >> bits) & 0x3F) | 0x80;
2046 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002047
2048 if (out - buffer > buffer_size - 100) {
2049 int indx = out - buffer;
2050
2051 growBuffer(buffer);
2052 out = &buffer[indx];
2053 }
Owen Taylor3473f882001-02-23 17:55:21 +00002054 } else {
2055 ent = htmlParseEntityRef(ctxt, &name);
2056 if (name == NULL) {
2057 *out++ = '&';
2058 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002059 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002060
2061 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002062 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002063 }
2064 } else if (ent == NULL) {
2065 *out++ = '&';
2066 cur = name;
2067 while (*cur != 0) {
2068 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002069 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002070
2071 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002072 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002073 }
2074 *out++ = *cur++;
2075 }
2076 xmlFree(name);
2077 } else {
2078 unsigned int c;
2079 int bits;
2080
2081 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002082 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002083
2084 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002085 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002086 }
2087 c = (xmlChar)ent->value;
2088 if (c < 0x80)
2089 { *out++ = c; bits= -6; }
2090 else if (c < 0x800)
2091 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2092 else if (c < 0x10000)
2093 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2094 else
2095 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2096
2097 for ( ; bits >= 0; bits-= 6) {
2098 *out++ = ((c >> bits) & 0x3F) | 0x80;
2099 }
2100 xmlFree(name);
2101 }
2102 }
2103 } else {
2104 unsigned int c;
2105 int bits, l;
2106
2107 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002108 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002109
2110 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002111 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002112 }
2113 c = CUR_CHAR(l);
2114 if (c < 0x80)
2115 { *out++ = c; bits= -6; }
2116 else if (c < 0x800)
2117 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2118 else if (c < 0x10000)
2119 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2120 else
2121 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2122
2123 for ( ; bits >= 0; bits-= 6) {
2124 *out++ = ((c >> bits) & 0x3F) | 0x80;
2125 }
2126 NEXT;
2127 }
2128 }
2129 *out++ = 0;
2130 return(buffer);
2131}
2132
2133/**
Owen Taylor3473f882001-02-23 17:55:21 +00002134 * htmlParseEntityRef:
2135 * @ctxt: an HTML parser context
2136 * @str: location to store the entity name
2137 *
2138 * parse an HTML ENTITY references
2139 *
2140 * [68] EntityRef ::= '&' Name ';'
2141 *
2142 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2143 * if non-NULL *str will have to be freed by the caller.
2144 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002145const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002146htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2147 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002148 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002149 *str = NULL;
2150
2151 if (CUR == '&') {
2152 NEXT;
2153 name = htmlParseName(ctxt);
2154 if (name == NULL) {
2155 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2156 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2157 ctxt->wellFormed = 0;
2158 } else {
2159 GROW;
2160 if (CUR == ';') {
2161 *str = name;
2162
2163 /*
2164 * Lookup the entity in the table.
2165 */
2166 ent = htmlEntityLookup(name);
2167 if (ent != NULL) /* OK that's ugly !!! */
2168 NEXT;
2169 } else {
2170 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2171 ctxt->sax->error(ctxt->userData,
2172 "htmlParseEntityRef: expecting ';'\n");
2173 *str = name;
2174 }
2175 }
2176 }
2177 return(ent);
2178}
2179
2180/**
2181 * htmlParseAttValue:
2182 * @ctxt: an HTML parser context
2183 *
2184 * parse a value for an attribute
2185 * Note: the parser won't do substitution of entities here, this
2186 * will be handled later in xmlStringGetNodeList, unless it was
2187 * asked for ctxt->replaceEntities != 0
2188 *
2189 * Returns the AttValue parsed or NULL.
2190 */
2191
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002192static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002193htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2194 xmlChar *ret = NULL;
2195
2196 if (CUR == '"') {
2197 NEXT;
2198 ret = htmlParseHTMLAttribute(ctxt, '"');
2199 if (CUR != '"') {
2200 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2201 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2202 ctxt->wellFormed = 0;
2203 } else
2204 NEXT;
2205 } else if (CUR == '\'') {
2206 NEXT;
2207 ret = htmlParseHTMLAttribute(ctxt, '\'');
2208 if (CUR != '\'') {
2209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2210 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2211 ctxt->wellFormed = 0;
2212 } else
2213 NEXT;
2214 } else {
2215 /*
2216 * That's an HTMLism, the attribute value may not be quoted
2217 */
2218 ret = htmlParseHTMLAttribute(ctxt, 0);
2219 if (ret == NULL) {
2220 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2221 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2222 ctxt->wellFormed = 0;
2223 }
2224 }
2225 return(ret);
2226}
2227
2228/**
2229 * htmlParseSystemLiteral:
2230 * @ctxt: an HTML parser context
2231 *
2232 * parse an HTML Literal
2233 *
2234 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2235 *
2236 * Returns the SystemLiteral parsed or NULL
2237 */
2238
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002239static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002240htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2241 const xmlChar *q;
2242 xmlChar *ret = NULL;
2243
2244 if (CUR == '"') {
2245 NEXT;
2246 q = CUR_PTR;
2247 while ((IS_CHAR(CUR)) && (CUR != '"'))
2248 NEXT;
2249 if (!IS_CHAR(CUR)) {
2250 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2251 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2252 ctxt->wellFormed = 0;
2253 } else {
2254 ret = xmlStrndup(q, CUR_PTR - q);
2255 NEXT;
2256 }
2257 } else if (CUR == '\'') {
2258 NEXT;
2259 q = CUR_PTR;
2260 while ((IS_CHAR(CUR)) && (CUR != '\''))
2261 NEXT;
2262 if (!IS_CHAR(CUR)) {
2263 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2264 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2265 ctxt->wellFormed = 0;
2266 } else {
2267 ret = xmlStrndup(q, CUR_PTR - q);
2268 NEXT;
2269 }
2270 } else {
2271 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2272 ctxt->sax->error(ctxt->userData,
2273 "SystemLiteral \" or ' expected\n");
2274 ctxt->wellFormed = 0;
2275 }
2276
2277 return(ret);
2278}
2279
2280/**
2281 * htmlParsePubidLiteral:
2282 * @ctxt: an HTML parser context
2283 *
2284 * parse an HTML public literal
2285 *
2286 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2287 *
2288 * Returns the PubidLiteral parsed or NULL.
2289 */
2290
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002291static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002292htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2293 const xmlChar *q;
2294 xmlChar *ret = NULL;
2295 /*
2296 * Name ::= (Letter | '_') (NameChar)*
2297 */
2298 if (CUR == '"') {
2299 NEXT;
2300 q = CUR_PTR;
2301 while (IS_PUBIDCHAR(CUR)) NEXT;
2302 if (CUR != '"') {
2303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2304 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2305 ctxt->wellFormed = 0;
2306 } else {
2307 ret = xmlStrndup(q, CUR_PTR - q);
2308 NEXT;
2309 }
2310 } else if (CUR == '\'') {
2311 NEXT;
2312 q = CUR_PTR;
2313 while ((IS_LETTER(CUR)) && (CUR != '\''))
2314 NEXT;
2315 if (!IS_LETTER(CUR)) {
2316 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2317 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2318 ctxt->wellFormed = 0;
2319 } else {
2320 ret = xmlStrndup(q, CUR_PTR - q);
2321 NEXT;
2322 }
2323 } else {
2324 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2325 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2326 ctxt->wellFormed = 0;
2327 }
2328
2329 return(ret);
2330}
2331
2332/**
2333 * htmlParseScript:
2334 * @ctxt: an HTML parser context
2335 *
2336 * parse the content of an HTML SCRIPT or STYLE element
2337 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2338 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2339 * http://www.w3.org/TR/html4/types.html#type-script
2340 * http://www.w3.org/TR/html4/types.html#h-6.15
2341 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2342 *
2343 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2344 * element and the value of intrinsic event attributes. User agents must
2345 * not evaluate script data as HTML markup but instead must pass it on as
2346 * data to a script engine.
2347 * NOTES:
2348 * - The content is passed like CDATA
2349 * - the attributes for style and scripting "onXXX" are also described
2350 * as CDATA but SGML allows entities references in attributes so their
2351 * processing is identical as other attributes
2352 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353static void
Owen Taylor3473f882001-02-23 17:55:21 +00002354htmlParseScript(htmlParserCtxtPtr ctxt) {
2355 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2356 int nbchar = 0;
2357 xmlChar cur;
2358
2359 SHRINK;
2360 cur = CUR;
2361 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002362 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2363 (NXT(3) == '-')) {
2364 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2365 if (ctxt->sax->cdataBlock!= NULL) {
2366 /*
2367 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2368 */
2369 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2370 }
2371 }
2372 nbchar = 0;
2373 htmlParseComment(ctxt);
2374 cur = CUR;
2375 continue;
2376 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002377 /*
2378 * One should break here, the specification is clear:
2379 * Authors should therefore escape "</" within the content.
2380 * Escape mechanisms are specific to each scripting or
2381 * style sheet language.
2382 */
2383 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2384 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2385 break; /* while */
2386 }
2387 buf[nbchar++] = cur;
2388 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2389 if (ctxt->sax->cdataBlock!= NULL) {
2390 /*
2391 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2392 */
2393 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2394 }
2395 nbchar = 0;
2396 }
2397 NEXT;
2398 cur = CUR;
2399 }
2400 if (!(IS_CHAR(cur))) {
2401 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2402 ctxt->sax->error(ctxt->userData,
2403 "Invalid char in CDATA 0x%X\n", cur);
2404 ctxt->wellFormed = 0;
2405 NEXT;
2406 }
2407
2408 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2409 if (ctxt->sax->cdataBlock!= NULL) {
2410 /*
2411 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2412 */
2413 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2414 }
2415 }
2416}
2417
2418
2419/**
2420 * htmlParseCharData:
2421 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002422 *
2423 * parse a CharData section.
2424 * if we are within a CDATA section ']]>' marks an end of section.
2425 *
2426 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2427 */
2428
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002429static void
2430htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002431 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2432 int nbchar = 0;
2433 int cur, l;
2434
2435 SHRINK;
2436 cur = CUR_CHAR(l);
2437 while (((cur != '<') || (ctxt->token == '<')) &&
2438 ((cur != '&') || (ctxt->token == '&')) &&
2439 (IS_CHAR(cur))) {
2440 COPY_BUF(l,buf,nbchar,cur);
2441 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2442 /*
2443 * Ok the segment is to be consumed as chars.
2444 */
2445 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2446 if (areBlanks(ctxt, buf, nbchar)) {
2447 if (ctxt->sax->ignorableWhitespace != NULL)
2448 ctxt->sax->ignorableWhitespace(ctxt->userData,
2449 buf, nbchar);
2450 } else {
2451 htmlCheckParagraph(ctxt);
2452 if (ctxt->sax->characters != NULL)
2453 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2454 }
2455 }
2456 nbchar = 0;
2457 }
2458 NEXTL(l);
2459 cur = CUR_CHAR(l);
2460 }
2461 if (nbchar != 0) {
2462 /*
2463 * Ok the segment is to be consumed as chars.
2464 */
2465 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2466 if (areBlanks(ctxt, buf, nbchar)) {
2467 if (ctxt->sax->ignorableWhitespace != NULL)
2468 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2469 } else {
2470 htmlCheckParagraph(ctxt);
2471 if (ctxt->sax->characters != NULL)
2472 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2473 }
2474 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002475 } else {
2476 /*
2477 * Loop detection
2478 */
2479 if (cur == 0)
2480 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002481 }
2482}
2483
2484/**
2485 * htmlParseExternalID:
2486 * @ctxt: an HTML parser context
2487 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002488 *
2489 * Parse an External ID or a Public ID
2490 *
Owen Taylor3473f882001-02-23 17:55:21 +00002491 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2492 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2493 *
2494 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2495 *
2496 * Returns the function returns SystemLiteral and in the second
2497 * case publicID receives PubidLiteral, is strict is off
2498 * it is possible to return NULL and have publicID set.
2499 */
2500
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002501static xmlChar *
2502htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002503 xmlChar *URI = NULL;
2504
2505 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2506 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2507 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2508 SKIP(6);
2509 if (!IS_BLANK(CUR)) {
2510 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2511 ctxt->sax->error(ctxt->userData,
2512 "Space required after 'SYSTEM'\n");
2513 ctxt->wellFormed = 0;
2514 }
2515 SKIP_BLANKS;
2516 URI = htmlParseSystemLiteral(ctxt);
2517 if (URI == NULL) {
2518 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2519 ctxt->sax->error(ctxt->userData,
2520 "htmlParseExternalID: SYSTEM, no URI\n");
2521 ctxt->wellFormed = 0;
2522 }
2523 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2524 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2525 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2526 SKIP(6);
2527 if (!IS_BLANK(CUR)) {
2528 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2529 ctxt->sax->error(ctxt->userData,
2530 "Space required after 'PUBLIC'\n");
2531 ctxt->wellFormed = 0;
2532 }
2533 SKIP_BLANKS;
2534 *publicID = htmlParsePubidLiteral(ctxt);
2535 if (*publicID == NULL) {
2536 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2537 ctxt->sax->error(ctxt->userData,
2538 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2539 ctxt->wellFormed = 0;
2540 }
2541 SKIP_BLANKS;
2542 if ((CUR == '"') || (CUR == '\'')) {
2543 URI = htmlParseSystemLiteral(ctxt);
2544 }
2545 }
2546 return(URI);
2547}
2548
2549/**
2550 * htmlParseComment:
2551 * @ctxt: an HTML parser context
2552 *
2553 * Parse an XML (SGML) comment <!-- .... -->
2554 *
2555 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2556 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002557static void
Owen Taylor3473f882001-02-23 17:55:21 +00002558htmlParseComment(htmlParserCtxtPtr ctxt) {
2559 xmlChar *buf = NULL;
2560 int len;
2561 int size = HTML_PARSER_BUFFER_SIZE;
2562 int q, ql;
2563 int r, rl;
2564 int cur, l;
2565 xmlParserInputState state;
2566
2567 /*
2568 * Check that there is a comment right here.
2569 */
2570 if ((RAW != '<') || (NXT(1) != '!') ||
2571 (NXT(2) != '-') || (NXT(3) != '-')) return;
2572
2573 state = ctxt->instate;
2574 ctxt->instate = XML_PARSER_COMMENT;
2575 SHRINK;
2576 SKIP(4);
2577 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2578 if (buf == NULL) {
2579 xmlGenericError(xmlGenericErrorContext,
2580 "malloc of %d byte failed\n", size);
2581 ctxt->instate = state;
2582 return;
2583 }
2584 q = CUR_CHAR(ql);
2585 NEXTL(ql);
2586 r = CUR_CHAR(rl);
2587 NEXTL(rl);
2588 cur = CUR_CHAR(l);
2589 len = 0;
2590 while (IS_CHAR(cur) &&
2591 ((cur != '>') ||
2592 (r != '-') || (q != '-'))) {
2593 if (len + 5 >= size) {
2594 size *= 2;
2595 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2596 if (buf == NULL) {
2597 xmlGenericError(xmlGenericErrorContext,
2598 "realloc of %d byte failed\n", size);
2599 ctxt->instate = state;
2600 return;
2601 }
2602 }
2603 COPY_BUF(ql,buf,len,q);
2604 q = r;
2605 ql = rl;
2606 r = cur;
2607 rl = l;
2608 NEXTL(l);
2609 cur = CUR_CHAR(l);
2610 if (cur == 0) {
2611 SHRINK;
2612 GROW;
2613 cur = CUR_CHAR(l);
2614 }
2615 }
2616 buf[len] = 0;
2617 if (!IS_CHAR(cur)) {
2618 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2619 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2620 ctxt->sax->error(ctxt->userData,
2621 "Comment not terminated \n<!--%.50s\n", buf);
2622 ctxt->wellFormed = 0;
2623 xmlFree(buf);
2624 } else {
2625 NEXT;
2626 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2627 (!ctxt->disableSAX))
2628 ctxt->sax->comment(ctxt->userData, buf);
2629 xmlFree(buf);
2630 }
2631 ctxt->instate = state;
2632}
2633
2634/**
2635 * htmlParseCharRef:
2636 * @ctxt: an HTML parser context
2637 *
2638 * parse Reference declarations
2639 *
2640 * [66] CharRef ::= '&#' [0-9]+ ';' |
2641 * '&#x' [0-9a-fA-F]+ ';'
2642 *
2643 * Returns the value parsed (as an int)
2644 */
2645int
2646htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2647 int val = 0;
2648
2649 if ((CUR == '&') && (NXT(1) == '#') &&
2650 (NXT(2) == 'x')) {
2651 SKIP(3);
2652 while (CUR != ';') {
2653 if ((CUR >= '0') && (CUR <= '9'))
2654 val = val * 16 + (CUR - '0');
2655 else if ((CUR >= 'a') && (CUR <= 'f'))
2656 val = val * 16 + (CUR - 'a') + 10;
2657 else if ((CUR >= 'A') && (CUR <= 'F'))
2658 val = val * 16 + (CUR - 'A') + 10;
2659 else {
2660 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2661 ctxt->sax->error(ctxt->userData,
2662 "htmlParseCharRef: invalid hexadecimal value\n");
2663 ctxt->wellFormed = 0;
2664 return(0);
2665 }
2666 NEXT;
2667 }
2668 if (CUR == ';')
2669 NEXT;
2670 } else if ((CUR == '&') && (NXT(1) == '#')) {
2671 SKIP(2);
2672 while (CUR != ';') {
2673 if ((CUR >= '0') && (CUR <= '9'))
2674 val = val * 10 + (CUR - '0');
2675 else {
2676 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2677 ctxt->sax->error(ctxt->userData,
2678 "htmlParseCharRef: invalid decimal value\n");
2679 ctxt->wellFormed = 0;
2680 return(0);
2681 }
2682 NEXT;
2683 }
2684 if (CUR == ';')
2685 NEXT;
2686 } else {
2687 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2688 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2689 ctxt->wellFormed = 0;
2690 }
2691 /*
2692 * Check the value IS_CHAR ...
2693 */
2694 if (IS_CHAR(val)) {
2695 return(val);
2696 } else {
2697 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2698 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2699 val);
2700 ctxt->wellFormed = 0;
2701 }
2702 return(0);
2703}
2704
2705
2706/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002707 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002708 * @ctxt: an HTML parser context
2709 *
2710 * parse a DOCTYPE declaration
2711 *
2712 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2713 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2714 */
2715
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002716static void
Owen Taylor3473f882001-02-23 17:55:21 +00002717htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2718 xmlChar *name;
2719 xmlChar *ExternalID = NULL;
2720 xmlChar *URI = NULL;
2721
2722 /*
2723 * We know that '<!DOCTYPE' has been detected.
2724 */
2725 SKIP(9);
2726
2727 SKIP_BLANKS;
2728
2729 /*
2730 * Parse the DOCTYPE name.
2731 */
2732 name = htmlParseName(ctxt);
2733 if (name == NULL) {
2734 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2735 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2736 ctxt->wellFormed = 0;
2737 }
2738 /*
2739 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2740 */
2741
2742 SKIP_BLANKS;
2743
2744 /*
2745 * Check for SystemID and ExternalID
2746 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002747 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002748 SKIP_BLANKS;
2749
2750 /*
2751 * We should be at the end of the DOCTYPE declaration.
2752 */
2753 if (CUR != '>') {
2754 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002755 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002756 ctxt->wellFormed = 0;
2757 /* We shouldn't try to resynchronize ... */
2758 }
2759 NEXT;
2760
2761 /*
2762 * Create or update the document accordingly to the DOCTYPE
2763 */
2764 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2765 (!ctxt->disableSAX))
2766 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2767
2768 /*
2769 * Cleanup, since we don't use all those identifiers
2770 */
2771 if (URI != NULL) xmlFree(URI);
2772 if (ExternalID != NULL) xmlFree(ExternalID);
2773 if (name != NULL) xmlFree(name);
2774}
2775
2776/**
2777 * htmlParseAttribute:
2778 * @ctxt: an HTML parser context
2779 * @value: a xmlChar ** used to store the value of the attribute
2780 *
2781 * parse an attribute
2782 *
2783 * [41] Attribute ::= Name Eq AttValue
2784 *
2785 * [25] Eq ::= S? '=' S?
2786 *
2787 * With namespace:
2788 *
2789 * [NS 11] Attribute ::= QName Eq AttValue
2790 *
2791 * Also the case QName == xmlns:??? is handled independently as a namespace
2792 * definition.
2793 *
2794 * Returns the attribute name, and the value in *value.
2795 */
2796
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002797static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002798htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2799 xmlChar *name, *val = NULL;
2800
2801 *value = NULL;
2802 name = htmlParseHTMLName(ctxt);
2803 if (name == NULL) {
2804 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2805 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2806 ctxt->wellFormed = 0;
2807 return(NULL);
2808 }
2809
2810 /*
2811 * read the value
2812 */
2813 SKIP_BLANKS;
2814 if (CUR == '=') {
2815 NEXT;
2816 SKIP_BLANKS;
2817 val = htmlParseAttValue(ctxt);
2818 /******
2819 } else {
2820 * TODO : some attribute must have values, some may not
2821 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2822 ctxt->sax->warning(ctxt->userData,
2823 "No value for attribute %s\n", name); */
2824 }
2825
2826 *value = val;
2827 return(name);
2828}
2829
2830/**
2831 * htmlCheckEncoding:
2832 * @ctxt: an HTML parser context
2833 * @attvalue: the attribute value
2834 *
2835 * Checks an http-equiv attribute from a Meta tag to detect
2836 * the encoding
2837 * If a new encoding is detected the parser is switched to decode
2838 * it and pass UTF8
2839 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002840static void
Owen Taylor3473f882001-02-23 17:55:21 +00002841htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2842 const xmlChar *encoding;
2843
2844 if ((ctxt == NULL) || (attvalue == NULL))
2845 return;
2846
2847 /* do not change encoding */
2848 if (ctxt->input->encoding != NULL)
2849 return;
2850
2851 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2852 if (encoding != NULL) {
2853 encoding += 8;
2854 } else {
2855 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2856 if (encoding != NULL)
2857 encoding += 9;
2858 }
2859 if (encoding != NULL) {
2860 xmlCharEncoding enc;
2861 xmlCharEncodingHandlerPtr handler;
2862
2863 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2864
2865 if (ctxt->input->encoding != NULL)
2866 xmlFree((xmlChar *) ctxt->input->encoding);
2867 ctxt->input->encoding = xmlStrdup(encoding);
2868
2869 enc = xmlParseCharEncoding((const char *) encoding);
2870 /*
2871 * registered set of known encodings
2872 */
2873 if (enc != XML_CHAR_ENCODING_ERROR) {
2874 xmlSwitchEncoding(ctxt, enc);
2875 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2876 } else {
2877 /*
2878 * fallback for unknown encodings
2879 */
2880 handler = xmlFindCharEncodingHandler((const char *) encoding);
2881 if (handler != NULL) {
2882 xmlSwitchToEncoding(ctxt, handler);
2883 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2884 } else {
2885 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2886 }
2887 }
2888
2889 if ((ctxt->input->buf != NULL) &&
2890 (ctxt->input->buf->encoder != NULL) &&
2891 (ctxt->input->buf->raw != NULL) &&
2892 (ctxt->input->buf->buffer != NULL)) {
2893 int nbchars;
2894 int processed;
2895
2896 /*
2897 * convert as much as possible to the parser reading buffer.
2898 */
2899 processed = ctxt->input->cur - ctxt->input->base;
2900 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2901 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2902 ctxt->input->buf->buffer,
2903 ctxt->input->buf->raw);
2904 if (nbchars < 0) {
2905 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2906 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2907 ctxt->sax->error(ctxt->userData,
2908 "htmlCheckEncoding: encoder error\n");
2909 }
2910 ctxt->input->base =
2911 ctxt->input->cur = ctxt->input->buf->buffer->content;
2912 }
2913 }
2914}
2915
2916/**
2917 * htmlCheckMeta:
2918 * @ctxt: an HTML parser context
2919 * @atts: the attributes values
2920 *
2921 * Checks an attributes from a Meta tag
2922 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002923static void
Owen Taylor3473f882001-02-23 17:55:21 +00002924htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2925 int i;
2926 const xmlChar *att, *value;
2927 int http = 0;
2928 const xmlChar *content = NULL;
2929
2930 if ((ctxt == NULL) || (atts == NULL))
2931 return;
2932
2933 i = 0;
2934 att = atts[i++];
2935 while (att != NULL) {
2936 value = atts[i++];
2937 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2938 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2939 http = 1;
2940 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2941 content = value;
2942 att = atts[i++];
2943 }
2944 if ((http) && (content != NULL))
2945 htmlCheckEncoding(ctxt, content);
2946
2947}
2948
2949/**
2950 * htmlParseStartTag:
2951 * @ctxt: an HTML parser context
2952 *
2953 * parse a start of tag either for rule element or
2954 * EmptyElement. In both case we don't parse the tag closing chars.
2955 *
2956 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2957 *
2958 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2959 *
2960 * With namespace:
2961 *
2962 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2963 *
2964 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2965 *
2966 */
2967
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002968static void
Owen Taylor3473f882001-02-23 17:55:21 +00002969htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2970 xmlChar *name;
2971 xmlChar *attname;
2972 xmlChar *attvalue;
2973 const xmlChar **atts = NULL;
2974 int nbatts = 0;
2975 int maxatts = 0;
2976 int meta = 0;
2977 int i;
2978
2979 if (CUR != '<') return;
2980 NEXT;
2981
2982 GROW;
2983 name = htmlParseHTMLName(ctxt);
2984 if (name == NULL) {
2985 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2986 ctxt->sax->error(ctxt->userData,
2987 "htmlParseStartTag: invalid element name\n");
2988 ctxt->wellFormed = 0;
2989 /* Dump the bogus tag like browsers do */
2990 while ((IS_CHAR(CUR)) && (CUR != '>'))
2991 NEXT;
2992 return;
2993 }
2994 if (xmlStrEqual(name, BAD_CAST"meta"))
2995 meta = 1;
2996
2997 /*
2998 * Check for auto-closure of HTML elements.
2999 */
3000 htmlAutoClose(ctxt, name);
3001
3002 /*
3003 * Check for implied HTML elements.
3004 */
3005 htmlCheckImplied(ctxt, name);
3006
3007 /*
3008 * Avoid html at any level > 0, head at any level != 1
3009 * or any attempt to recurse body
3010 */
3011 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3012 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3013 ctxt->sax->error(ctxt->userData,
3014 "htmlParseStartTag: misplaced <html> tag\n");
3015 ctxt->wellFormed = 0;
3016 xmlFree(name);
3017 return;
3018 }
3019 if ((ctxt->nameNr != 1) &&
3020 (xmlStrEqual(name, BAD_CAST"head"))) {
3021 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3022 ctxt->sax->error(ctxt->userData,
3023 "htmlParseStartTag: misplaced <head> tag\n");
3024 ctxt->wellFormed = 0;
3025 xmlFree(name);
3026 return;
3027 }
3028 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003029 int indx;
3030 for (indx = 0;indx < ctxt->nameNr;indx++) {
3031 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00003032 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3033 ctxt->sax->error(ctxt->userData,
3034 "htmlParseStartTag: misplaced <body> tag\n");
3035 ctxt->wellFormed = 0;
3036 xmlFree(name);
3037 return;
3038 }
3039 }
3040 }
3041
3042 /*
3043 * Now parse the attributes, it ends up with the ending
3044 *
3045 * (S Attribute)* S?
3046 */
3047 SKIP_BLANKS;
3048 while ((IS_CHAR(CUR)) &&
3049 (CUR != '>') &&
3050 ((CUR != '/') || (NXT(1) != '>'))) {
3051 long cons = ctxt->nbChars;
3052
3053 GROW;
3054 attname = htmlParseAttribute(ctxt, &attvalue);
3055 if (attname != NULL) {
3056
3057 /*
3058 * Well formedness requires at most one declaration of an attribute
3059 */
3060 for (i = 0; i < nbatts;i += 2) {
3061 if (xmlStrEqual(atts[i], attname)) {
3062 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3063 ctxt->sax->error(ctxt->userData,
3064 "Attribute %s redefined\n",
3065 attname);
3066 ctxt->wellFormed = 0;
3067 xmlFree(attname);
3068 if (attvalue != NULL)
3069 xmlFree(attvalue);
3070 goto failed;
3071 }
3072 }
3073
3074 /*
3075 * Add the pair to atts
3076 */
3077 if (atts == NULL) {
3078 maxatts = 10;
3079 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3080 if (atts == NULL) {
3081 xmlGenericError(xmlGenericErrorContext,
3082 "malloc of %ld byte failed\n",
3083 maxatts * (long)sizeof(xmlChar *));
3084 if (name != NULL) xmlFree(name);
3085 return;
3086 }
3087 } else if (nbatts + 4 > maxatts) {
3088 maxatts *= 2;
3089 atts = (const xmlChar **) xmlRealloc((void *) atts,
3090 maxatts * sizeof(xmlChar *));
3091 if (atts == NULL) {
3092 xmlGenericError(xmlGenericErrorContext,
3093 "realloc of %ld byte failed\n",
3094 maxatts * (long)sizeof(xmlChar *));
3095 if (name != NULL) xmlFree(name);
3096 return;
3097 }
3098 }
3099 atts[nbatts++] = attname;
3100 atts[nbatts++] = attvalue;
3101 atts[nbatts] = NULL;
3102 atts[nbatts + 1] = NULL;
3103 }
3104 else {
3105 /* Dump the bogus attribute string up to the next blank or
3106 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003107 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3108 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003109 NEXT;
3110 }
3111
3112failed:
3113 SKIP_BLANKS;
3114 if (cons == ctxt->nbChars) {
3115 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3116 ctxt->sax->error(ctxt->userData,
3117 "htmlParseStartTag: problem parsing attributes\n");
3118 ctxt->wellFormed = 0;
3119 break;
3120 }
3121 }
3122
3123 /*
3124 * Handle specific association to the META tag
3125 */
3126 if (meta)
3127 htmlCheckMeta(ctxt, atts);
3128
3129 /*
3130 * SAX: Start of Element !
3131 */
3132 htmlnamePush(ctxt, xmlStrdup(name));
3133#ifdef DEBUG
3134 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3135#endif
3136 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3137 ctxt->sax->startElement(ctxt->userData, name, atts);
3138
3139 if (atts != NULL) {
3140 for (i = 0;i < nbatts;i++) {
3141 if (atts[i] != NULL)
3142 xmlFree((xmlChar *) atts[i]);
3143 }
3144 xmlFree((void *) atts);
3145 }
3146 if (name != NULL) xmlFree(name);
3147}
3148
3149/**
3150 * htmlParseEndTag:
3151 * @ctxt: an HTML parser context
3152 *
3153 * parse an end of tag
3154 *
3155 * [42] ETag ::= '</' Name S? '>'
3156 *
3157 * With namespace
3158 *
3159 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003160 *
3161 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003162 */
3163
Daniel Veillardf420ac52001-07-04 16:04:09 +00003164static int
Owen Taylor3473f882001-02-23 17:55:21 +00003165htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3166 xmlChar *name;
3167 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003168 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003169
3170 if ((CUR != '<') || (NXT(1) != '/')) {
3171 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3172 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3173 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003174 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003175 }
3176 SKIP(2);
3177
3178 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003179 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003180
3181 /*
3182 * We should definitely be at the ending "S? '>'" part
3183 */
3184 SKIP_BLANKS;
3185 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3186 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3187 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3188 ctxt->wellFormed = 0;
3189 } else
3190 NEXT;
3191
3192 /*
3193 * If the name read is not one of the element in the parsing stack
3194 * then return, it's just an error.
3195 */
3196 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3197 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3198 }
3199 if (i < 0) {
3200 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3201 ctxt->sax->error(ctxt->userData,
3202 "Unexpected end tag : %s\n", name);
3203 xmlFree(name);
3204 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003205 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003206 }
3207
3208
3209 /*
3210 * Check for auto-closure of HTML elements.
3211 */
3212
3213 htmlAutoCloseOnClose(ctxt, name);
3214
3215 /*
3216 * Well formedness constraints, opening and closing must match.
3217 * With the exception that the autoclose may have popped stuff out
3218 * of the stack.
3219 */
3220 if (!xmlStrEqual(name, ctxt->name)) {
3221#ifdef DEBUG
3222 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3223#endif
3224 if ((ctxt->name != NULL) &&
3225 (!xmlStrEqual(ctxt->name, name))) {
3226 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3227 ctxt->sax->error(ctxt->userData,
3228 "Opening and ending tag mismatch: %s and %s\n",
3229 name, ctxt->name);
3230 ctxt->wellFormed = 0;
3231 }
3232 }
3233
3234 /*
3235 * SAX: End of Tag
3236 */
3237 oldname = ctxt->name;
3238 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3239 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3240 ctxt->sax->endElement(ctxt->userData, name);
3241 oldname = htmlnamePop(ctxt);
3242 if (oldname != NULL) {
3243#ifdef DEBUG
3244 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3245#endif
3246 xmlFree(oldname);
3247#ifdef DEBUG
3248 } else {
3249 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3250#endif
3251 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003252 ret = 1;
3253 } else {
3254 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003255 }
3256
3257 if (name != NULL)
3258 xmlFree(name);
3259
Daniel Veillardf420ac52001-07-04 16:04:09 +00003260 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003261}
3262
3263
3264/**
3265 * htmlParseReference:
3266 * @ctxt: an HTML parser context
3267 *
3268 * parse and handle entity references in content,
3269 * this will end-up in a call to character() since this is either a
3270 * CharRef, or a predefined entity.
3271 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003272static void
Owen Taylor3473f882001-02-23 17:55:21 +00003273htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003274 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003275 xmlChar out[6];
3276 xmlChar *name;
3277 if (CUR != '&') return;
3278
3279 if (NXT(1) == '#') {
3280 unsigned int c;
3281 int bits, i = 0;
3282
3283 c = htmlParseCharRef(ctxt);
3284 if (c == 0)
3285 return;
3286
3287 if (c < 0x80) { out[i++]= c; bits= -6; }
3288 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3289 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3290 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3291
3292 for ( ; bits >= 0; bits-= 6) {
3293 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3294 }
3295 out[i] = 0;
3296
3297 htmlCheckParagraph(ctxt);
3298 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3299 ctxt->sax->characters(ctxt->userData, out, i);
3300 } else {
3301 ent = htmlParseEntityRef(ctxt, &name);
3302 if (name == NULL) {
3303 htmlCheckParagraph(ctxt);
3304 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3305 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3306 return;
3307 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003308 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003309 htmlCheckParagraph(ctxt);
3310 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3311 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3312 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3313 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3314 }
3315 } else {
3316 unsigned int c;
3317 int bits, i = 0;
3318
3319 c = ent->value;
3320 if (c < 0x80)
3321 { out[i++]= c; bits= -6; }
3322 else if (c < 0x800)
3323 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3324 else if (c < 0x10000)
3325 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3326 else
3327 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3328
3329 for ( ; bits >= 0; bits-= 6) {
3330 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3331 }
3332 out[i] = 0;
3333
3334 htmlCheckParagraph(ctxt);
3335 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3336 ctxt->sax->characters(ctxt->userData, out, i);
3337 }
3338 xmlFree(name);
3339 }
3340}
3341
3342/**
3343 * htmlParseContent:
3344 * @ctxt: an HTML parser context
3345 * @name: the node name
3346 *
3347 * Parse a content: comment, sub-element, reference or text.
3348 *
3349 */
3350
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003351static void
Owen Taylor3473f882001-02-23 17:55:21 +00003352htmlParseContent(htmlParserCtxtPtr ctxt) {
3353 xmlChar *currentNode;
3354 int depth;
3355
3356 currentNode = xmlStrdup(ctxt->name);
3357 depth = ctxt->nameNr;
3358 while (1) {
3359 long cons = ctxt->nbChars;
3360
3361 GROW;
3362 /*
3363 * Our tag or one of it's parent or children is ending.
3364 */
3365 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003366 if (htmlParseEndTag(ctxt) &&
3367 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3368 if (currentNode != NULL)
3369 xmlFree(currentNode);
3370 return;
3371 }
3372 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003373 }
3374
3375 /*
3376 * Has this node been popped out during parsing of
3377 * the next element
3378 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003379 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3380 (!xmlStrEqual(currentNode, ctxt->name)))
3381 {
Owen Taylor3473f882001-02-23 17:55:21 +00003382 if (currentNode != NULL) xmlFree(currentNode);
3383 return;
3384 }
3385
Daniel Veillardf9533d12001-03-03 10:04:57 +00003386 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3387 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003388 /*
3389 * Handle SCRIPT/STYLE separately
3390 */
3391 htmlParseScript(ctxt);
3392 } else {
3393 /*
3394 * Sometimes DOCTYPE arrives in the middle of the document
3395 */
3396 if ((CUR == '<') && (NXT(1) == '!') &&
3397 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3398 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3399 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3400 (UPP(8) == 'E')) {
3401 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3402 ctxt->sax->error(ctxt->userData,
3403 "Misplaced DOCTYPE declaration\n");
3404 ctxt->wellFormed = 0;
3405 htmlParseDocTypeDecl(ctxt);
3406 }
3407
3408 /*
3409 * First case : a comment
3410 */
3411 if ((CUR == '<') && (NXT(1) == '!') &&
3412 (NXT(2) == '-') && (NXT(3) == '-')) {
3413 htmlParseComment(ctxt);
3414 }
3415
3416 /*
3417 * Second case : a sub-element.
3418 */
3419 else if (CUR == '<') {
3420 htmlParseElement(ctxt);
3421 }
3422
3423 /*
3424 * Third case : a reference. If if has not been resolved,
3425 * parsing returns it's Name, create the node
3426 */
3427 else if (CUR == '&') {
3428 htmlParseReference(ctxt);
3429 }
3430
3431 /*
3432 * Fourth : end of the resource
3433 */
3434 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003435 htmlAutoCloseOnEnd(ctxt);
3436 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003437 }
3438
3439 /*
3440 * Last case, text. Note that References are handled directly.
3441 */
3442 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003443 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003444 }
3445
3446 if (cons == ctxt->nbChars) {
3447 if (ctxt->node != NULL) {
3448 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3449 ctxt->sax->error(ctxt->userData,
3450 "detected an error in element content\n");
3451 ctxt->wellFormed = 0;
3452 }
3453 break;
3454 }
3455 }
3456 GROW;
3457 }
3458 if (currentNode != NULL) xmlFree(currentNode);
3459}
3460
3461/**
3462 * htmlParseElement:
3463 * @ctxt: an HTML parser context
3464 *
3465 * parse an HTML element, this is highly recursive
3466 *
3467 * [39] element ::= EmptyElemTag | STag content ETag
3468 *
3469 * [41] Attribute ::= Name Eq AttValue
3470 */
3471
3472void
3473htmlParseElement(htmlParserCtxtPtr ctxt) {
3474 xmlChar *name;
3475 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003476 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003477 htmlParserNodeInfo node_info;
3478 xmlChar *oldname;
3479 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003480 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003481
3482 /* Capture start position */
3483 if (ctxt->record_info) {
3484 node_info.begin_pos = ctxt->input->consumed +
3485 (CUR_PTR - ctxt->input->base);
3486 node_info.begin_line = ctxt->input->line;
3487 }
3488
3489 oldname = xmlStrdup(ctxt->name);
3490 htmlParseStartTag(ctxt);
3491 name = ctxt->name;
3492#ifdef DEBUG
3493 if (oldname == NULL)
3494 xmlGenericError(xmlGenericErrorContext,
3495 "Start of element %s\n", name);
3496 else if (name == NULL)
3497 xmlGenericError(xmlGenericErrorContext,
3498 "Start of element failed, was %s\n", oldname);
3499 else
3500 xmlGenericError(xmlGenericErrorContext,
3501 "Start of element %s, was %s\n", name, oldname);
3502#endif
3503 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3504 (name == NULL)) {
3505 if (CUR == '>')
3506 NEXT;
3507 if (oldname != NULL)
3508 xmlFree(oldname);
3509 return;
3510 }
3511 if (oldname != NULL)
3512 xmlFree(oldname);
3513
3514 /*
3515 * Lookup the info for that element.
3516 */
3517 info = htmlTagLookup(name);
3518 if (info == NULL) {
3519 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3520 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3521 name);
3522 ctxt->wellFormed = 0;
3523 } else if (info->depr) {
3524/***************************
3525 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3526 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3527 name);
3528 ***************************/
3529 }
3530
3531 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003532 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003533 */
3534 if ((CUR == '/') && (NXT(1) == '>')) {
3535 SKIP(2);
3536 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3537 ctxt->sax->endElement(ctxt->userData, name);
3538 oldname = htmlnamePop(ctxt);
3539#ifdef DEBUG
3540 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3541#endif
3542 if (oldname != NULL)
3543 xmlFree(oldname);
3544 return;
3545 }
3546
3547 if (CUR == '>') {
3548 NEXT;
3549 } else {
3550 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3551 ctxt->sax->error(ctxt->userData,
3552 "Couldn't find end of Start Tag %s\n",
3553 name);
3554 ctxt->wellFormed = 0;
3555
3556 /*
3557 * end of parsing of this node.
3558 */
3559 if (xmlStrEqual(name, ctxt->name)) {
3560 nodePop(ctxt);
3561 oldname = htmlnamePop(ctxt);
3562#ifdef DEBUG
3563 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3564#endif
3565 if (oldname != NULL)
3566 xmlFree(oldname);
3567 }
3568
3569 /*
3570 * Capture end position and add node
3571 */
3572 if ( currentNode != NULL && ctxt->record_info ) {
3573 node_info.end_pos = ctxt->input->consumed +
3574 (CUR_PTR - ctxt->input->base);
3575 node_info.end_line = ctxt->input->line;
3576 node_info.node = ctxt->node;
3577 xmlParserAddNodeInfo(ctxt, &node_info);
3578 }
3579 return;
3580 }
3581
3582 /*
3583 * Check for an Empty Element from DTD definition
3584 */
3585 if ((info != NULL) && (info->empty)) {
3586 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3587 ctxt->sax->endElement(ctxt->userData, name);
3588 oldname = htmlnamePop(ctxt);
3589#ifdef DEBUG
3590 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3591#endif
3592 if (oldname != NULL)
3593 xmlFree(oldname);
3594 return;
3595 }
3596
3597 /*
3598 * Parse the content of the element:
3599 */
3600 currentNode = xmlStrdup(ctxt->name);
3601 depth = ctxt->nameNr;
3602 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003603 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003604 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003605 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003606 if (ctxt->nameNr < depth) break;
3607 }
3608
Owen Taylor3473f882001-02-23 17:55:21 +00003609 /*
3610 * Capture end position and add node
3611 */
3612 if ( currentNode != NULL && ctxt->record_info ) {
3613 node_info.end_pos = ctxt->input->consumed +
3614 (CUR_PTR - ctxt->input->base);
3615 node_info.end_line = ctxt->input->line;
3616 node_info.node = ctxt->node;
3617 xmlParserAddNodeInfo(ctxt, &node_info);
3618 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003619 if (!IS_CHAR(CUR)) {
3620 htmlAutoCloseOnEnd(ctxt);
3621 }
3622
Owen Taylor3473f882001-02-23 17:55:21 +00003623 if (currentNode != NULL)
3624 xmlFree(currentNode);
3625}
3626
3627/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003628 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003629 * @ctxt: an HTML parser context
3630 *
3631 * parse an HTML document (and build a tree if using the standard SAX
3632 * interface).
3633 *
3634 * Returns 0, -1 in case of error. the parser context is augmented
3635 * as a result of the parsing.
3636 */
3637
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003638int
Owen Taylor3473f882001-02-23 17:55:21 +00003639htmlParseDocument(htmlParserCtxtPtr ctxt) {
3640 xmlDtdPtr dtd;
3641
Daniel Veillardd0463562001-10-13 09:15:48 +00003642 xmlInitParser();
3643
Owen Taylor3473f882001-02-23 17:55:21 +00003644 htmlDefaultSAXHandlerInit();
3645 ctxt->html = 1;
3646
3647 GROW;
3648 /*
3649 * SAX: beginning of the document processing.
3650 */
3651 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3652 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3653
3654 /*
3655 * Wipe out everything which is before the first '<'
3656 */
3657 SKIP_BLANKS;
3658 if (CUR == 0) {
3659 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3660 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3661 ctxt->wellFormed = 0;
3662 }
3663
3664 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3665 ctxt->sax->startDocument(ctxt->userData);
3666
3667
3668 /*
3669 * Parse possible comments before any content
3670 */
3671 while ((CUR == '<') && (NXT(1) == '!') &&
3672 (NXT(2) == '-') && (NXT(3) == '-')) {
3673 htmlParseComment(ctxt);
3674 SKIP_BLANKS;
3675 }
3676
3677
3678 /*
3679 * Then possibly doc type declaration(s) and more Misc
3680 * (doctypedecl Misc*)?
3681 */
3682 if ((CUR == '<') && (NXT(1) == '!') &&
3683 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3684 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3685 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3686 (UPP(8) == 'E')) {
3687 htmlParseDocTypeDecl(ctxt);
3688 }
3689 SKIP_BLANKS;
3690
3691 /*
3692 * Parse possible comments before any content
3693 */
3694 while ((CUR == '<') && (NXT(1) == '!') &&
3695 (NXT(2) == '-') && (NXT(3) == '-')) {
3696 htmlParseComment(ctxt);
3697 SKIP_BLANKS;
3698 }
3699
3700 /*
3701 * Time to start parsing the tree itself
3702 */
3703 htmlParseContent(ctxt);
3704
3705 /*
3706 * autoclose
3707 */
3708 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003709 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003710
3711
3712 /*
3713 * SAX: end of the document processing.
3714 */
3715 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3716 ctxt->sax->endDocument(ctxt->userData);
3717
3718 if (ctxt->myDoc != NULL) {
3719 dtd = xmlGetIntSubset(ctxt->myDoc);
3720 if (dtd == NULL)
3721 ctxt->myDoc->intSubset =
3722 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3723 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3724 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3725 }
3726 if (! ctxt->wellFormed) return(-1);
3727 return(0);
3728}
3729
3730
3731/************************************************************************
3732 * *
3733 * Parser contexts handling *
3734 * *
3735 ************************************************************************/
3736
3737/**
3738 * xmlInitParserCtxt:
3739 * @ctxt: an HTML parser context
3740 *
3741 * Initialize a parser context
3742 */
3743
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003744static void
Owen Taylor3473f882001-02-23 17:55:21 +00003745htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3746{
3747 htmlSAXHandler *sax;
3748
3749 if (ctxt == NULL) return;
3750 memset(ctxt, 0, sizeof(htmlParserCtxt));
3751
3752 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3753 if (sax == NULL) {
3754 xmlGenericError(xmlGenericErrorContext,
3755 "htmlInitParserCtxt: out of memory\n");
3756 }
3757 else
3758 memset(sax, 0, sizeof(htmlSAXHandler));
3759
3760 /* Allocate the Input stack */
3761 ctxt->inputTab = (htmlParserInputPtr *)
3762 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3763 if (ctxt->inputTab == NULL) {
3764 xmlGenericError(xmlGenericErrorContext,
3765 "htmlInitParserCtxt: out of memory\n");
3766 ctxt->inputNr = 0;
3767 ctxt->inputMax = 0;
3768 ctxt->input = NULL;
3769 return;
3770 }
3771 ctxt->inputNr = 0;
3772 ctxt->inputMax = 5;
3773 ctxt->input = NULL;
3774 ctxt->version = NULL;
3775 ctxt->encoding = NULL;
3776 ctxt->standalone = -1;
3777 ctxt->instate = XML_PARSER_START;
3778
3779 /* Allocate the Node stack */
3780 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3781 if (ctxt->nodeTab == NULL) {
3782 xmlGenericError(xmlGenericErrorContext,
3783 "htmlInitParserCtxt: out of memory\n");
3784 ctxt->nodeNr = 0;
3785 ctxt->nodeMax = 0;
3786 ctxt->node = NULL;
3787 ctxt->inputNr = 0;
3788 ctxt->inputMax = 0;
3789 ctxt->input = NULL;
3790 return;
3791 }
3792 ctxt->nodeNr = 0;
3793 ctxt->nodeMax = 10;
3794 ctxt->node = NULL;
3795
3796 /* Allocate the Name stack */
3797 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3798 if (ctxt->nameTab == NULL) {
3799 xmlGenericError(xmlGenericErrorContext,
3800 "htmlInitParserCtxt: out of memory\n");
3801 ctxt->nameNr = 0;
3802 ctxt->nameMax = 10;
3803 ctxt->name = NULL;
3804 ctxt->nodeNr = 0;
3805 ctxt->nodeMax = 0;
3806 ctxt->node = NULL;
3807 ctxt->inputNr = 0;
3808 ctxt->inputMax = 0;
3809 ctxt->input = NULL;
3810 return;
3811 }
3812 ctxt->nameNr = 0;
3813 ctxt->nameMax = 10;
3814 ctxt->name = NULL;
3815
3816 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3817 else {
3818 ctxt->sax = sax;
3819 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3820 }
3821 ctxt->userData = ctxt;
3822 ctxt->myDoc = NULL;
3823 ctxt->wellFormed = 1;
3824 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003825 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003826 ctxt->html = 1;
3827 ctxt->record_info = 0;
3828 ctxt->validate = 0;
3829 ctxt->nbChars = 0;
3830 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003831 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003832 xmlInitNodeInfoSeq(&ctxt->node_seq);
3833}
3834
3835/**
3836 * htmlFreeParserCtxt:
3837 * @ctxt: an HTML parser context
3838 *
3839 * Free all the memory used by a parser context. However the parsed
3840 * document in ctxt->myDoc is not freed.
3841 */
3842
3843void
3844htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3845{
3846 xmlFreeParserCtxt(ctxt);
3847}
3848
3849/**
Daniel Veillard1d995272002-07-22 16:43:32 +00003850 * htmlNewParserCtxt:
3851 *
3852 * Allocate and initialize a new parser context.
3853 *
3854 * Returns the xmlParserCtxtPtr or NULL
3855 */
3856
3857static htmlParserCtxtPtr
3858htmlNewParserCtxt(void)
3859{
3860 xmlParserCtxtPtr ctxt;
3861
3862 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3863 if (ctxt == NULL) {
3864 xmlGenericError(xmlGenericErrorContext,
3865 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00003866 return(NULL);
3867 }
3868 memset(ctxt, 0, sizeof(xmlParserCtxt));
3869 htmlInitParserCtxt(ctxt);
3870 return(ctxt);
3871}
3872
3873/**
3874 * htmlCreateMemoryParserCtxt:
3875 * @buffer: a pointer to a char array
3876 * @size: the size of the array
3877 *
3878 * Create a parser context for an HTML in-memory document.
3879 *
3880 * Returns the new parser context or NULL
3881 */
3882static htmlParserCtxtPtr
3883htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3884 xmlParserCtxtPtr ctxt;
3885 xmlParserInputPtr input;
3886 xmlParserInputBufferPtr buf;
3887
3888 if (buffer == NULL)
3889 return(NULL);
3890 if (size <= 0)
3891 return(NULL);
3892
3893 ctxt = htmlNewParserCtxt();
3894 if (ctxt == NULL)
3895 return(NULL);
3896
3897 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3898 if (buf == NULL) return(NULL);
3899
3900 input = xmlNewInputStream(ctxt);
3901 if (input == NULL) {
3902 xmlFreeParserCtxt(ctxt);
3903 return(NULL);
3904 }
3905
3906 input->filename = NULL;
3907 input->buf = buf;
3908 input->base = input->buf->buffer->content;
3909 input->cur = input->buf->buffer->content;
3910 input->end = &input->buf->buffer->content[input->buf->buffer->use];
3911
3912 inputPush(ctxt, input);
3913 return(ctxt);
3914}
3915
3916/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003917 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00003918 * @cur: a pointer to an array of xmlChar
3919 * @encoding: a free form C string describing the HTML document encoding, or NULL
3920 *
3921 * Create a parser context for an HTML document.
3922 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003923 * TODO: check the need to add encoding handling there
3924 *
Owen Taylor3473f882001-02-23 17:55:21 +00003925 * Returns the new parser context or NULL
3926 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003927static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003928htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00003929 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00003930 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00003931
Daniel Veillard1d995272002-07-22 16:43:32 +00003932 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00003933 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00003934 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00003935 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
3936
3937 if (encoding != NULL) {
3938 xmlCharEncoding enc;
3939 xmlCharEncodingHandlerPtr handler;
3940
3941 if (ctxt->input->encoding != NULL)
3942 xmlFree((xmlChar *) ctxt->input->encoding);
3943 ctxt->input->encoding = (const xmlChar *) encoding;
3944
3945 enc = xmlParseCharEncoding(encoding);
3946 /*
3947 * registered set of known encodings
3948 */
3949 if (enc != XML_CHAR_ENCODING_ERROR) {
3950 xmlSwitchEncoding(ctxt, enc);
3951 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
3952 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3953 ctxt->sax->error(ctxt->userData,
3954 "Unsupported encoding %s\n", encoding);
3955 ctxt->input->encoding = NULL;
3956 }
3957 } else {
3958 /*
3959 * fallback for unknown encodings
3960 */
3961 handler = xmlFindCharEncodingHandler((const char *) encoding);
3962 if (handler != NULL) {
3963 xmlSwitchToEncoding(ctxt, handler);
3964 } else {
3965 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3966 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3967 ctxt->sax->error(ctxt->userData,
3968 "Unsupported encoding %s\n", encoding);
3969 }
3970 }
3971 }
3972 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003973}
3974
3975/************************************************************************
3976 * *
3977 * Progressive parsing interfaces *
3978 * *
3979 ************************************************************************/
3980
3981/**
3982 * htmlParseLookupSequence:
3983 * @ctxt: an HTML parser context
3984 * @first: the first char to lookup
3985 * @next: the next char to lookup or zero
3986 * @third: the next char to lookup or zero
3987 *
3988 * Try to find if a sequence (first, next, third) or just (first next) or
3989 * (first) is available in the input stream.
3990 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3991 * to avoid rescanning sequences of bytes, it DOES change the state of the
3992 * parser, do not use liberally.
3993 * This is basically similar to xmlParseLookupSequence()
3994 *
3995 * Returns the index to the current parsing point if the full sequence
3996 * is available, -1 otherwise.
3997 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003998static int
Owen Taylor3473f882001-02-23 17:55:21 +00003999htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4000 xmlChar next, xmlChar third) {
4001 int base, len;
4002 htmlParserInputPtr in;
4003 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004004 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004005
4006 in = ctxt->input;
4007 if (in == NULL) return(-1);
4008 base = in->cur - in->base;
4009 if (base < 0) return(-1);
4010 if (ctxt->checkIndex > base)
4011 base = ctxt->checkIndex;
4012 if (in->buf == NULL) {
4013 buf = in->base;
4014 len = in->length;
4015 } else {
4016 buf = in->buf->buffer->content;
4017 len = in->buf->buffer->use;
4018 }
4019 /* take into account the sequence length */
4020 if (third) len -= 2;
4021 else if (next) len --;
4022 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004023 if (!incomment && (base + 4 < len)) {
4024 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4025 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4026 incomment = 1;
4027 }
4028 /* do not increment base, some people use <!--> */
4029 }
4030 if (incomment) {
4031 if (base + 3 < len)
4032 return(-1);
4033 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4034 (buf[base + 2] == '>')) {
4035 incomment = 0;
4036 base += 2;
4037 }
4038 continue;
4039 }
Owen Taylor3473f882001-02-23 17:55:21 +00004040 if (buf[base] == first) {
4041 if (third != 0) {
4042 if ((buf[base + 1] != next) ||
4043 (buf[base + 2] != third)) continue;
4044 } else if (next != 0) {
4045 if (buf[base + 1] != next) continue;
4046 }
4047 ctxt->checkIndex = 0;
4048#ifdef DEBUG_PUSH
4049 if (next == 0)
4050 xmlGenericError(xmlGenericErrorContext,
4051 "HPP: lookup '%c' found at %d\n",
4052 first, base);
4053 else if (third == 0)
4054 xmlGenericError(xmlGenericErrorContext,
4055 "HPP: lookup '%c%c' found at %d\n",
4056 first, next, base);
4057 else
4058 xmlGenericError(xmlGenericErrorContext,
4059 "HPP: lookup '%c%c%c' found at %d\n",
4060 first, next, third, base);
4061#endif
4062 return(base - (in->cur - in->base));
4063 }
4064 }
4065 ctxt->checkIndex = base;
4066#ifdef DEBUG_PUSH
4067 if (next == 0)
4068 xmlGenericError(xmlGenericErrorContext,
4069 "HPP: lookup '%c' failed\n", first);
4070 else if (third == 0)
4071 xmlGenericError(xmlGenericErrorContext,
4072 "HPP: lookup '%c%c' failed\n", first, next);
4073 else
4074 xmlGenericError(xmlGenericErrorContext,
4075 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4076#endif
4077 return(-1);
4078}
4079
4080/**
4081 * htmlParseTryOrFinish:
4082 * @ctxt: an HTML parser context
4083 * @terminate: last chunk indicator
4084 *
4085 * Try to progress on parsing
4086 *
4087 * Returns zero if no parsing was possible
4088 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004089static int
Owen Taylor3473f882001-02-23 17:55:21 +00004090htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4091 int ret = 0;
4092 htmlParserInputPtr in;
4093 int avail = 0;
4094 xmlChar cur, next;
4095
4096#ifdef DEBUG_PUSH
4097 switch (ctxt->instate) {
4098 case XML_PARSER_EOF:
4099 xmlGenericError(xmlGenericErrorContext,
4100 "HPP: try EOF\n"); break;
4101 case XML_PARSER_START:
4102 xmlGenericError(xmlGenericErrorContext,
4103 "HPP: try START\n"); break;
4104 case XML_PARSER_MISC:
4105 xmlGenericError(xmlGenericErrorContext,
4106 "HPP: try MISC\n");break;
4107 case XML_PARSER_COMMENT:
4108 xmlGenericError(xmlGenericErrorContext,
4109 "HPP: try COMMENT\n");break;
4110 case XML_PARSER_PROLOG:
4111 xmlGenericError(xmlGenericErrorContext,
4112 "HPP: try PROLOG\n");break;
4113 case XML_PARSER_START_TAG:
4114 xmlGenericError(xmlGenericErrorContext,
4115 "HPP: try START_TAG\n");break;
4116 case XML_PARSER_CONTENT:
4117 xmlGenericError(xmlGenericErrorContext,
4118 "HPP: try CONTENT\n");break;
4119 case XML_PARSER_CDATA_SECTION:
4120 xmlGenericError(xmlGenericErrorContext,
4121 "HPP: try CDATA_SECTION\n");break;
4122 case XML_PARSER_END_TAG:
4123 xmlGenericError(xmlGenericErrorContext,
4124 "HPP: try END_TAG\n");break;
4125 case XML_PARSER_ENTITY_DECL:
4126 xmlGenericError(xmlGenericErrorContext,
4127 "HPP: try ENTITY_DECL\n");break;
4128 case XML_PARSER_ENTITY_VALUE:
4129 xmlGenericError(xmlGenericErrorContext,
4130 "HPP: try ENTITY_VALUE\n");break;
4131 case XML_PARSER_ATTRIBUTE_VALUE:
4132 xmlGenericError(xmlGenericErrorContext,
4133 "HPP: try ATTRIBUTE_VALUE\n");break;
4134 case XML_PARSER_DTD:
4135 xmlGenericError(xmlGenericErrorContext,
4136 "HPP: try DTD\n");break;
4137 case XML_PARSER_EPILOG:
4138 xmlGenericError(xmlGenericErrorContext,
4139 "HPP: try EPILOG\n");break;
4140 case XML_PARSER_PI:
4141 xmlGenericError(xmlGenericErrorContext,
4142 "HPP: try PI\n");break;
4143 case XML_PARSER_SYSTEM_LITERAL:
4144 xmlGenericError(xmlGenericErrorContext,
4145 "HPP: try SYSTEM_LITERAL\n");break;
4146 }
4147#endif
4148
4149 while (1) {
4150
4151 in = ctxt->input;
4152 if (in == NULL) break;
4153 if (in->buf == NULL)
4154 avail = in->length - (in->cur - in->base);
4155 else
4156 avail = in->buf->buffer->use - (in->cur - in->base);
4157 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004158 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004159 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4160 /*
4161 * SAX: end of the document processing.
4162 */
4163 ctxt->instate = XML_PARSER_EOF;
4164 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4165 ctxt->sax->endDocument(ctxt->userData);
4166 }
4167 }
4168 if (avail < 1)
4169 goto done;
4170 switch (ctxt->instate) {
4171 case XML_PARSER_EOF:
4172 /*
4173 * Document parsing is done !
4174 */
4175 goto done;
4176 case XML_PARSER_START:
4177 /*
4178 * Very first chars read from the document flow.
4179 */
4180 cur = in->cur[0];
4181 if (IS_BLANK(cur)) {
4182 SKIP_BLANKS;
4183 if (in->buf == NULL)
4184 avail = in->length - (in->cur - in->base);
4185 else
4186 avail = in->buf->buffer->use - (in->cur - in->base);
4187 }
4188 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4189 ctxt->sax->setDocumentLocator(ctxt->userData,
4190 &xmlDefaultSAXLocator);
4191 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4192 (!ctxt->disableSAX))
4193 ctxt->sax->startDocument(ctxt->userData);
4194
4195 cur = in->cur[0];
4196 next = in->cur[1];
4197 if ((cur == '<') && (next == '!') &&
4198 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4199 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4200 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4201 (UPP(8) == 'E')) {
4202 if ((!terminate) &&
4203 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4204 goto done;
4205#ifdef DEBUG_PUSH
4206 xmlGenericError(xmlGenericErrorContext,
4207 "HPP: Parsing internal subset\n");
4208#endif
4209 htmlParseDocTypeDecl(ctxt);
4210 ctxt->instate = XML_PARSER_PROLOG;
4211#ifdef DEBUG_PUSH
4212 xmlGenericError(xmlGenericErrorContext,
4213 "HPP: entering PROLOG\n");
4214#endif
4215 } else {
4216 ctxt->instate = XML_PARSER_MISC;
4217 }
4218#ifdef DEBUG_PUSH
4219 xmlGenericError(xmlGenericErrorContext,
4220 "HPP: entering MISC\n");
4221#endif
4222 break;
4223 case XML_PARSER_MISC:
4224 SKIP_BLANKS;
4225 if (in->buf == NULL)
4226 avail = in->length - (in->cur - in->base);
4227 else
4228 avail = in->buf->buffer->use - (in->cur - in->base);
4229 if (avail < 2)
4230 goto done;
4231 cur = in->cur[0];
4232 next = in->cur[1];
4233 if ((cur == '<') && (next == '!') &&
4234 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4235 if ((!terminate) &&
4236 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4237 goto done;
4238#ifdef DEBUG_PUSH
4239 xmlGenericError(xmlGenericErrorContext,
4240 "HPP: Parsing Comment\n");
4241#endif
4242 htmlParseComment(ctxt);
4243 ctxt->instate = XML_PARSER_MISC;
4244 } else if ((cur == '<') && (next == '!') &&
4245 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4246 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4247 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4248 (UPP(8) == 'E')) {
4249 if ((!terminate) &&
4250 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4251 goto done;
4252#ifdef DEBUG_PUSH
4253 xmlGenericError(xmlGenericErrorContext,
4254 "HPP: Parsing internal subset\n");
4255#endif
4256 htmlParseDocTypeDecl(ctxt);
4257 ctxt->instate = XML_PARSER_PROLOG;
4258#ifdef DEBUG_PUSH
4259 xmlGenericError(xmlGenericErrorContext,
4260 "HPP: entering PROLOG\n");
4261#endif
4262 } else if ((cur == '<') && (next == '!') &&
4263 (avail < 9)) {
4264 goto done;
4265 } else {
4266 ctxt->instate = XML_PARSER_START_TAG;
4267#ifdef DEBUG_PUSH
4268 xmlGenericError(xmlGenericErrorContext,
4269 "HPP: entering START_TAG\n");
4270#endif
4271 }
4272 break;
4273 case XML_PARSER_PROLOG:
4274 SKIP_BLANKS;
4275 if (in->buf == NULL)
4276 avail = in->length - (in->cur - in->base);
4277 else
4278 avail = in->buf->buffer->use - (in->cur - in->base);
4279 if (avail < 2)
4280 goto done;
4281 cur = in->cur[0];
4282 next = in->cur[1];
4283 if ((cur == '<') && (next == '!') &&
4284 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4285 if ((!terminate) &&
4286 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4287 goto done;
4288#ifdef DEBUG_PUSH
4289 xmlGenericError(xmlGenericErrorContext,
4290 "HPP: Parsing Comment\n");
4291#endif
4292 htmlParseComment(ctxt);
4293 ctxt->instate = XML_PARSER_PROLOG;
4294 } else if ((cur == '<') && (next == '!') &&
4295 (avail < 4)) {
4296 goto done;
4297 } else {
4298 ctxt->instate = XML_PARSER_START_TAG;
4299#ifdef DEBUG_PUSH
4300 xmlGenericError(xmlGenericErrorContext,
4301 "HPP: entering START_TAG\n");
4302#endif
4303 }
4304 break;
4305 case XML_PARSER_EPILOG:
4306 if (in->buf == NULL)
4307 avail = in->length - (in->cur - in->base);
4308 else
4309 avail = in->buf->buffer->use - (in->cur - in->base);
4310 if (avail < 1)
4311 goto done;
4312 cur = in->cur[0];
4313 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004314 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004315 goto done;
4316 }
4317 if (avail < 2)
4318 goto done;
4319 next = in->cur[1];
4320 if ((cur == '<') && (next == '!') &&
4321 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4322 if ((!terminate) &&
4323 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4324 goto done;
4325#ifdef DEBUG_PUSH
4326 xmlGenericError(xmlGenericErrorContext,
4327 "HPP: Parsing Comment\n");
4328#endif
4329 htmlParseComment(ctxt);
4330 ctxt->instate = XML_PARSER_EPILOG;
4331 } else if ((cur == '<') && (next == '!') &&
4332 (avail < 4)) {
4333 goto done;
4334 } else {
4335 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004336 ctxt->wellFormed = 0;
4337 ctxt->instate = XML_PARSER_EOF;
4338#ifdef DEBUG_PUSH
4339 xmlGenericError(xmlGenericErrorContext,
4340 "HPP: entering EOF\n");
4341#endif
4342 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4343 ctxt->sax->endDocument(ctxt->userData);
4344 goto done;
4345 }
4346 break;
4347 case XML_PARSER_START_TAG: {
4348 xmlChar *name, *oldname;
4349 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004350 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004351
4352 if (avail < 2)
4353 goto done;
4354 cur = in->cur[0];
4355 if (cur != '<') {
4356 ctxt->instate = XML_PARSER_CONTENT;
4357#ifdef DEBUG_PUSH
4358 xmlGenericError(xmlGenericErrorContext,
4359 "HPP: entering CONTENT\n");
4360#endif
4361 break;
4362 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004363 if (in->cur[1] == '/') {
4364 ctxt->instate = XML_PARSER_END_TAG;
4365 ctxt->checkIndex = 0;
4366#ifdef DEBUG_PUSH
4367 xmlGenericError(xmlGenericErrorContext,
4368 "HPP: entering END_TAG\n");
4369#endif
4370 break;
4371 }
Owen Taylor3473f882001-02-23 17:55:21 +00004372 if ((!terminate) &&
4373 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4374 goto done;
4375
4376 oldname = xmlStrdup(ctxt->name);
4377 htmlParseStartTag(ctxt);
4378 name = ctxt->name;
4379#ifdef DEBUG
4380 if (oldname == NULL)
4381 xmlGenericError(xmlGenericErrorContext,
4382 "Start of element %s\n", name);
4383 else if (name == NULL)
4384 xmlGenericError(xmlGenericErrorContext,
4385 "Start of element failed, was %s\n",
4386 oldname);
4387 else
4388 xmlGenericError(xmlGenericErrorContext,
4389 "Start of element %s, was %s\n",
4390 name, oldname);
4391#endif
4392 if (((depth == ctxt->nameNr) &&
4393 (xmlStrEqual(oldname, ctxt->name))) ||
4394 (name == NULL)) {
4395 if (CUR == '>')
4396 NEXT;
4397 if (oldname != NULL)
4398 xmlFree(oldname);
4399 break;
4400 }
4401 if (oldname != NULL)
4402 xmlFree(oldname);
4403
4404 /*
4405 * Lookup the info for that element.
4406 */
4407 info = htmlTagLookup(name);
4408 if (info == NULL) {
4409 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4410 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4411 name);
4412 ctxt->wellFormed = 0;
4413 } else if (info->depr) {
4414 /***************************
4415 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4416 ctxt->sax->warning(ctxt->userData,
4417 "Tag %s is deprecated\n",
4418 name);
4419 ***************************/
4420 }
4421
4422 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004423 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004424 */
4425 if ((CUR == '/') && (NXT(1) == '>')) {
4426 SKIP(2);
4427 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4428 ctxt->sax->endElement(ctxt->userData, name);
4429 oldname = htmlnamePop(ctxt);
4430#ifdef DEBUG
4431 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4432 oldname);
4433#endif
4434 if (oldname != NULL)
4435 xmlFree(oldname);
4436 ctxt->instate = XML_PARSER_CONTENT;
4437#ifdef DEBUG_PUSH
4438 xmlGenericError(xmlGenericErrorContext,
4439 "HPP: entering CONTENT\n");
4440#endif
4441 break;
4442 }
4443
4444 if (CUR == '>') {
4445 NEXT;
4446 } else {
4447 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4448 ctxt->sax->error(ctxt->userData,
4449 "Couldn't find end of Start Tag %s\n",
4450 name);
4451 ctxt->wellFormed = 0;
4452
4453 /*
4454 * end of parsing of this node.
4455 */
4456 if (xmlStrEqual(name, ctxt->name)) {
4457 nodePop(ctxt);
4458 oldname = htmlnamePop(ctxt);
4459#ifdef DEBUG
4460 xmlGenericError(xmlGenericErrorContext,
4461 "End of start tag problem: popping out %s\n", oldname);
4462#endif
4463 if (oldname != NULL)
4464 xmlFree(oldname);
4465 }
4466
4467 ctxt->instate = XML_PARSER_CONTENT;
4468#ifdef DEBUG_PUSH
4469 xmlGenericError(xmlGenericErrorContext,
4470 "HPP: entering CONTENT\n");
4471#endif
4472 break;
4473 }
4474
4475 /*
4476 * Check for an Empty Element from DTD definition
4477 */
4478 if ((info != NULL) && (info->empty)) {
4479 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4480 ctxt->sax->endElement(ctxt->userData, name);
4481 oldname = htmlnamePop(ctxt);
4482#ifdef DEBUG
4483 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4484#endif
4485 if (oldname != NULL)
4486 xmlFree(oldname);
4487 }
4488 ctxt->instate = XML_PARSER_CONTENT;
4489#ifdef DEBUG_PUSH
4490 xmlGenericError(xmlGenericErrorContext,
4491 "HPP: entering CONTENT\n");
4492#endif
4493 break;
4494 }
4495 case XML_PARSER_CONTENT: {
4496 long cons;
4497 /*
4498 * Handle preparsed entities and charRef
4499 */
4500 if (ctxt->token != 0) {
4501 xmlChar chr[2] = { 0 , 0 } ;
4502
4503 chr[0] = (xmlChar) ctxt->token;
4504 htmlCheckParagraph(ctxt);
4505 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4506 ctxt->sax->characters(ctxt->userData, chr, 1);
4507 ctxt->token = 0;
4508 ctxt->checkIndex = 0;
4509 }
4510 if ((avail == 1) && (terminate)) {
4511 cur = in->cur[0];
4512 if ((cur != '<') && (cur != '&')) {
4513 if (ctxt->sax != NULL) {
4514 if (IS_BLANK(cur)) {
4515 if (ctxt->sax->ignorableWhitespace != NULL)
4516 ctxt->sax->ignorableWhitespace(
4517 ctxt->userData, &cur, 1);
4518 } else {
4519 htmlCheckParagraph(ctxt);
4520 if (ctxt->sax->characters != NULL)
4521 ctxt->sax->characters(
4522 ctxt->userData, &cur, 1);
4523 }
4524 }
4525 ctxt->token = 0;
4526 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004527 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004528 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004529 }
Owen Taylor3473f882001-02-23 17:55:21 +00004530 }
4531 if (avail < 2)
4532 goto done;
4533 cur = in->cur[0];
4534 next = in->cur[1];
4535 cons = ctxt->nbChars;
4536 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4537 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4538 /*
4539 * Handle SCRIPT/STYLE separately
4540 */
4541 if ((!terminate) &&
4542 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4543 goto done;
4544 htmlParseScript(ctxt);
4545 if ((cur == '<') && (next == '/')) {
4546 ctxt->instate = XML_PARSER_END_TAG;
4547 ctxt->checkIndex = 0;
4548#ifdef DEBUG_PUSH
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: entering END_TAG\n");
4551#endif
4552 break;
4553 }
4554 } else {
4555 /*
4556 * Sometimes DOCTYPE arrives in the middle of the document
4557 */
4558 if ((cur == '<') && (next == '!') &&
4559 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4560 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4561 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4562 (UPP(8) == 'E')) {
4563 if ((!terminate) &&
4564 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4565 goto done;
4566 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4567 ctxt->sax->error(ctxt->userData,
4568 "Misplaced DOCTYPE declaration\n");
4569 ctxt->wellFormed = 0;
4570 htmlParseDocTypeDecl(ctxt);
4571 } else if ((cur == '<') && (next == '!') &&
4572 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4573 if ((!terminate) &&
4574 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4575 goto done;
4576#ifdef DEBUG_PUSH
4577 xmlGenericError(xmlGenericErrorContext,
4578 "HPP: Parsing Comment\n");
4579#endif
4580 htmlParseComment(ctxt);
4581 ctxt->instate = XML_PARSER_CONTENT;
4582 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4583 goto done;
4584 } else if ((cur == '<') && (next == '/')) {
4585 ctxt->instate = XML_PARSER_END_TAG;
4586 ctxt->checkIndex = 0;
4587#ifdef DEBUG_PUSH
4588 xmlGenericError(xmlGenericErrorContext,
4589 "HPP: entering END_TAG\n");
4590#endif
4591 break;
4592 } else if (cur == '<') {
4593 ctxt->instate = XML_PARSER_START_TAG;
4594 ctxt->checkIndex = 0;
4595#ifdef DEBUG_PUSH
4596 xmlGenericError(xmlGenericErrorContext,
4597 "HPP: entering START_TAG\n");
4598#endif
4599 break;
4600 } else if (cur == '&') {
4601 if ((!terminate) &&
4602 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4603 goto done;
4604#ifdef DEBUG_PUSH
4605 xmlGenericError(xmlGenericErrorContext,
4606 "HPP: Parsing Reference\n");
4607#endif
4608 /* TODO: check generation of subtrees if noent !!! */
4609 htmlParseReference(ctxt);
4610 } else {
4611 /* TODO Avoid the extra copy, handle directly !!!!!! */
4612 /*
Daniel Veillard01c13b52002-12-10 15:19:08 +00004613 * Goal of the following test is:
Owen Taylor3473f882001-02-23 17:55:21 +00004614 * - minimize calls to the SAX 'character' callback
4615 * when they are mergeable
4616 */
4617 if ((ctxt->inputNr == 1) &&
4618 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4619 if ((!terminate) &&
4620 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4621 goto done;
4622 }
4623 ctxt->checkIndex = 0;
4624#ifdef DEBUG_PUSH
4625 xmlGenericError(xmlGenericErrorContext,
4626 "HPP: Parsing char data\n");
4627#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004628 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004629 }
4630 }
4631 if (cons == ctxt->nbChars) {
4632 if (ctxt->node != NULL) {
4633 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4634 ctxt->sax->error(ctxt->userData,
4635 "detected an error in element content\n");
4636 ctxt->wellFormed = 0;
4637 }
4638 NEXT;
4639 break;
4640 }
4641
4642 break;
4643 }
4644 case XML_PARSER_END_TAG:
4645 if (avail < 2)
4646 goto done;
4647 if ((!terminate) &&
4648 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4649 goto done;
4650 htmlParseEndTag(ctxt);
4651 if (ctxt->nameNr == 0) {
4652 ctxt->instate = XML_PARSER_EPILOG;
4653 } else {
4654 ctxt->instate = XML_PARSER_CONTENT;
4655 }
4656 ctxt->checkIndex = 0;
4657#ifdef DEBUG_PUSH
4658 xmlGenericError(xmlGenericErrorContext,
4659 "HPP: entering CONTENT\n");
4660#endif
4661 break;
4662 case XML_PARSER_CDATA_SECTION:
4663 xmlGenericError(xmlGenericErrorContext,
4664 "HPP: internal error, state == CDATA\n");
4665 ctxt->instate = XML_PARSER_CONTENT;
4666 ctxt->checkIndex = 0;
4667#ifdef DEBUG_PUSH
4668 xmlGenericError(xmlGenericErrorContext,
4669 "HPP: entering CONTENT\n");
4670#endif
4671 break;
4672 case XML_PARSER_DTD:
4673 xmlGenericError(xmlGenericErrorContext,
4674 "HPP: internal error, state == DTD\n");
4675 ctxt->instate = XML_PARSER_CONTENT;
4676 ctxt->checkIndex = 0;
4677#ifdef DEBUG_PUSH
4678 xmlGenericError(xmlGenericErrorContext,
4679 "HPP: entering CONTENT\n");
4680#endif
4681 break;
4682 case XML_PARSER_COMMENT:
4683 xmlGenericError(xmlGenericErrorContext,
4684 "HPP: internal error, state == COMMENT\n");
4685 ctxt->instate = XML_PARSER_CONTENT;
4686 ctxt->checkIndex = 0;
4687#ifdef DEBUG_PUSH
4688 xmlGenericError(xmlGenericErrorContext,
4689 "HPP: entering CONTENT\n");
4690#endif
4691 break;
4692 case XML_PARSER_PI:
4693 xmlGenericError(xmlGenericErrorContext,
4694 "HPP: internal error, state == PI\n");
4695 ctxt->instate = XML_PARSER_CONTENT;
4696 ctxt->checkIndex = 0;
4697#ifdef DEBUG_PUSH
4698 xmlGenericError(xmlGenericErrorContext,
4699 "HPP: entering CONTENT\n");
4700#endif
4701 break;
4702 case XML_PARSER_ENTITY_DECL:
4703 xmlGenericError(xmlGenericErrorContext,
4704 "HPP: internal error, state == ENTITY_DECL\n");
4705 ctxt->instate = XML_PARSER_CONTENT;
4706 ctxt->checkIndex = 0;
4707#ifdef DEBUG_PUSH
4708 xmlGenericError(xmlGenericErrorContext,
4709 "HPP: entering CONTENT\n");
4710#endif
4711 break;
4712 case XML_PARSER_ENTITY_VALUE:
4713 xmlGenericError(xmlGenericErrorContext,
4714 "HPP: internal error, state == ENTITY_VALUE\n");
4715 ctxt->instate = XML_PARSER_CONTENT;
4716 ctxt->checkIndex = 0;
4717#ifdef DEBUG_PUSH
4718 xmlGenericError(xmlGenericErrorContext,
4719 "HPP: entering DTD\n");
4720#endif
4721 break;
4722 case XML_PARSER_ATTRIBUTE_VALUE:
4723 xmlGenericError(xmlGenericErrorContext,
4724 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4725 ctxt->instate = XML_PARSER_START_TAG;
4726 ctxt->checkIndex = 0;
4727#ifdef DEBUG_PUSH
4728 xmlGenericError(xmlGenericErrorContext,
4729 "HPP: entering START_TAG\n");
4730#endif
4731 break;
4732 case XML_PARSER_SYSTEM_LITERAL:
4733 xmlGenericError(xmlGenericErrorContext,
4734 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4735 ctxt->instate = XML_PARSER_CONTENT;
4736 ctxt->checkIndex = 0;
4737#ifdef DEBUG_PUSH
4738 xmlGenericError(xmlGenericErrorContext,
4739 "HPP: entering CONTENT\n");
4740#endif
4741 break;
4742 case XML_PARSER_IGNORE:
4743 xmlGenericError(xmlGenericErrorContext,
4744 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4745 ctxt->instate = XML_PARSER_CONTENT;
4746 ctxt->checkIndex = 0;
4747#ifdef DEBUG_PUSH
4748 xmlGenericError(xmlGenericErrorContext,
4749 "HPP: entering CONTENT\n");
4750#endif
4751 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004752 case XML_PARSER_PUBLIC_LITERAL:
4753 xmlGenericError(xmlGenericErrorContext,
4754 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4755 ctxt->instate = XML_PARSER_CONTENT;
4756 ctxt->checkIndex = 0;
4757#ifdef DEBUG_PUSH
4758 xmlGenericError(xmlGenericErrorContext,
4759 "HPP: entering CONTENT\n");
4760#endif
4761 break;
4762
Owen Taylor3473f882001-02-23 17:55:21 +00004763 }
4764 }
4765done:
4766 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004767 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004768 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4769 /*
4770 * SAX: end of the document processing.
4771 */
4772 ctxt->instate = XML_PARSER_EOF;
4773 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4774 ctxt->sax->endDocument(ctxt->userData);
4775 }
4776 }
4777 if ((ctxt->myDoc != NULL) &&
4778 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4779 (ctxt->instate == XML_PARSER_EPILOG))) {
4780 xmlDtdPtr dtd;
4781 dtd = xmlGetIntSubset(ctxt->myDoc);
4782 if (dtd == NULL)
4783 ctxt->myDoc->intSubset =
4784 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4785 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4786 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4787 }
4788#ifdef DEBUG_PUSH
4789 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4790#endif
4791 return(ret);
4792}
4793
4794/**
Owen Taylor3473f882001-02-23 17:55:21 +00004795 * htmlParseChunk:
4796 * @ctxt: an XML parser context
4797 * @chunk: an char array
4798 * @size: the size in byte of the chunk
4799 * @terminate: last chunk indicator
4800 *
4801 * Parse a Chunk of memory
4802 *
4803 * Returns zero if no error, the xmlParserErrors otherwise.
4804 */
4805int
4806htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4807 int terminate) {
4808 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4809 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4810 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4811 int cur = ctxt->input->cur - ctxt->input->base;
4812
4813 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4814 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4815 ctxt->input->cur = ctxt->input->base + cur;
4816#ifdef DEBUG_PUSH
4817 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4818#endif
4819
4820 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4821 htmlParseTryOrFinish(ctxt, terminate);
4822 } else if (ctxt->instate != XML_PARSER_EOF) {
4823 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4824 htmlParseTryOrFinish(ctxt, terminate);
4825 }
4826 if (terminate) {
4827 if ((ctxt->instate != XML_PARSER_EOF) &&
4828 (ctxt->instate != XML_PARSER_EPILOG) &&
4829 (ctxt->instate != XML_PARSER_MISC)) {
4830 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004831 ctxt->wellFormed = 0;
4832 }
4833 if (ctxt->instate != XML_PARSER_EOF) {
4834 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4835 ctxt->sax->endDocument(ctxt->userData);
4836 }
4837 ctxt->instate = XML_PARSER_EOF;
4838 }
4839 return((xmlParserErrors) ctxt->errNo);
4840}
4841
4842/************************************************************************
4843 * *
4844 * User entry points *
4845 * *
4846 ************************************************************************/
4847
4848/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004849 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004850 * @sax: a SAX handler
4851 * @user_data: The user data returned on SAX callbacks
4852 * @chunk: a pointer to an array of chars
4853 * @size: number of chars in the array
4854 * @filename: an optional file name or URI
4855 * @enc: an optional encoding
4856 *
4857 * Create a parser context for using the HTML parser in push mode
4858 * To allow content encoding detection, @size should be >= 4
4859 * The value of @filename is used for fetching external entities
4860 * and error/warning reports.
4861 *
4862 * Returns the new parser context or NULL
4863 */
4864htmlParserCtxtPtr
4865htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4866 const char *chunk, int size, const char *filename,
4867 xmlCharEncoding enc) {
4868 htmlParserCtxtPtr ctxt;
4869 htmlParserInputPtr inputStream;
4870 xmlParserInputBufferPtr buf;
4871
Daniel Veillardd0463562001-10-13 09:15:48 +00004872 xmlInitParser();
4873
Owen Taylor3473f882001-02-23 17:55:21 +00004874 buf = xmlAllocParserInputBuffer(enc);
4875 if (buf == NULL) return(NULL);
4876
4877 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4878 if (ctxt == NULL) {
4879 xmlFree(buf);
4880 return(NULL);
4881 }
4882 memset(ctxt, 0, sizeof(htmlParserCtxt));
4883 htmlInitParserCtxt(ctxt);
4884 if (sax != NULL) {
4885 if (ctxt->sax != &htmlDefaultSAXHandler)
4886 xmlFree(ctxt->sax);
4887 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4888 if (ctxt->sax == NULL) {
4889 xmlFree(buf);
4890 xmlFree(ctxt);
4891 return(NULL);
4892 }
4893 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4894 if (user_data != NULL)
4895 ctxt->userData = user_data;
4896 }
4897 if (filename == NULL) {
4898 ctxt->directory = NULL;
4899 } else {
4900 ctxt->directory = xmlParserGetDirectory(filename);
4901 }
4902
4903 inputStream = htmlNewInputStream(ctxt);
4904 if (inputStream == NULL) {
4905 xmlFreeParserCtxt(ctxt);
4906 return(NULL);
4907 }
4908
4909 if (filename == NULL)
4910 inputStream->filename = NULL;
4911 else
4912 inputStream->filename = xmlMemStrdup(filename);
4913 inputStream->buf = buf;
4914 inputStream->base = inputStream->buf->buffer->content;
4915 inputStream->cur = inputStream->buf->buffer->content;
4916
4917 inputPush(ctxt, inputStream);
4918
4919 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4920 (ctxt->input->buf != NULL)) {
4921 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4922#ifdef DEBUG_PUSH
4923 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4924#endif
4925 }
4926
4927 return(ctxt);
4928}
4929
4930/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004931 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00004932 * @cur: a pointer to an array of xmlChar
4933 * @encoding: a free form C string describing the HTML document encoding, or NULL
4934 * @sax: the SAX handler block
4935 * @userData: if using SAX, this pointer will be provided on callbacks.
4936 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004937 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4938 * to handle parse events. If sax is NULL, fallback to the default DOM
4939 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004940 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004941 * Returns the resulting document tree unless SAX is NULL or the document is
4942 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004943 */
4944
4945htmlDocPtr
4946htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4947 htmlDocPtr ret;
4948 htmlParserCtxtPtr ctxt;
4949
Daniel Veillardd0463562001-10-13 09:15:48 +00004950 xmlInitParser();
4951
Owen Taylor3473f882001-02-23 17:55:21 +00004952 if (cur == NULL) return(NULL);
4953
4954
4955 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4956 if (ctxt == NULL) return(NULL);
4957 if (sax != NULL) {
4958 ctxt->sax = sax;
4959 ctxt->userData = userData;
4960 }
4961
4962 htmlParseDocument(ctxt);
4963 ret = ctxt->myDoc;
4964 if (sax != NULL) {
4965 ctxt->sax = NULL;
4966 ctxt->userData = NULL;
4967 }
4968 htmlFreeParserCtxt(ctxt);
4969
4970 return(ret);
4971}
4972
4973/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004974 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00004975 * @cur: a pointer to an array of xmlChar
4976 * @encoding: a free form C string describing the HTML document encoding, or NULL
4977 *
4978 * parse an HTML in-memory document and build a tree.
4979 *
4980 * Returns the resulting document tree
4981 */
4982
4983htmlDocPtr
4984htmlParseDoc(xmlChar *cur, const char *encoding) {
4985 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4986}
4987
4988
4989/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004990 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004991 * @filename: the filename
4992 * @encoding: a free form C string describing the HTML document encoding, or NULL
4993 *
4994 * Create a parser context for a file content.
4995 * Automatic support for ZLIB/Compress compressed document is provided
4996 * by default if found at compile-time.
4997 *
4998 * Returns the new parser context or NULL
4999 */
5000htmlParserCtxtPtr
5001htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5002{
5003 htmlParserCtxtPtr ctxt;
5004 htmlParserInputPtr inputStream;
5005 xmlParserInputBufferPtr buf;
5006 /* htmlCharEncoding enc; */
5007 xmlChar *content, *content_line = (xmlChar *) "charset=";
5008
5009 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
5010 if (buf == NULL) return(NULL);
5011
5012 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
5013 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005014 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005015 return(NULL);
5016 }
5017 memset(ctxt, 0, sizeof(htmlParserCtxt));
5018 htmlInitParserCtxt(ctxt);
5019 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
5020 if (inputStream == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00005021 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005022 xmlFree(ctxt);
5023 return(NULL);
5024 }
5025 memset(inputStream, 0, sizeof(htmlParserInput));
5026
Daniel Veillarda646cfd2002-09-17 21:50:03 +00005027 inputStream->filename = (char *)
5028 xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005029 inputStream->line = 1;
5030 inputStream->col = 1;
5031 inputStream->buf = buf;
5032 inputStream->directory = NULL;
5033
5034 inputStream->base = inputStream->buf->buffer->content;
5035 inputStream->cur = inputStream->buf->buffer->content;
5036 inputStream->free = NULL;
5037
5038 inputPush(ctxt, inputStream);
5039
5040 /* set encoding */
5041 if (encoding) {
5042 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
5043 if (content) {
5044 strcpy ((char *)content, (char *)content_line);
5045 strcat ((char *)content, (char *)encoding);
5046 htmlCheckEncoding (ctxt, content);
5047 xmlFree (content);
5048 }
5049 }
5050
5051 return(ctxt);
5052}
5053
5054/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005055 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005056 * @filename: the filename
5057 * @encoding: a free form C string describing the HTML document encoding, or NULL
5058 * @sax: the SAX handler block
5059 * @userData: if using SAX, this pointer will be provided on callbacks.
5060 *
5061 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5062 * compressed document is provided by default if found at compile-time.
5063 * It use the given SAX function block to handle the parsing callback.
5064 * If sax is NULL, fallback to the default DOM tree building routines.
5065 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005066 * Returns the resulting document tree unless SAX is NULL or the document is
5067 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005068 */
5069
5070htmlDocPtr
5071htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5072 void *userData) {
5073 htmlDocPtr ret;
5074 htmlParserCtxtPtr ctxt;
5075 htmlSAXHandlerPtr oldsax = NULL;
5076
Daniel Veillardd0463562001-10-13 09:15:48 +00005077 xmlInitParser();
5078
Owen Taylor3473f882001-02-23 17:55:21 +00005079 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5080 if (ctxt == NULL) return(NULL);
5081 if (sax != NULL) {
5082 oldsax = ctxt->sax;
5083 ctxt->sax = sax;
5084 ctxt->userData = userData;
5085 }
5086
5087 htmlParseDocument(ctxt);
5088
5089 ret = ctxt->myDoc;
5090 if (sax != NULL) {
5091 ctxt->sax = oldsax;
5092 ctxt->userData = NULL;
5093 }
5094 htmlFreeParserCtxt(ctxt);
5095
5096 return(ret);
5097}
5098
5099/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005100 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005101 * @filename: the filename
5102 * @encoding: a free form C string describing the HTML document encoding, or NULL
5103 *
5104 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5105 * compressed document is provided by default if found at compile-time.
5106 *
5107 * Returns the resulting document tree
5108 */
5109
5110htmlDocPtr
5111htmlParseFile(const char *filename, const char *encoding) {
5112 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5113}
5114
5115/**
5116 * htmlHandleOmittedElem:
5117 * @val: int 0 or 1
5118 *
5119 * Set and return the previous value for handling HTML omitted tags.
5120 *
5121 * Returns the last value for 0 for no handling, 1 for auto insertion.
5122 */
5123
5124int
5125htmlHandleOmittedElem(int val) {
5126 int old = htmlOmittedDefaultValue;
5127
5128 htmlOmittedDefaultValue = val;
5129 return(old);
5130}
5131
5132#endif /* LIBXML_HTML_ENABLED */