blob: 7db3e9e3a8fa8478965467a6843843705f678ef6 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
46#define HTML_MAX_NAMELEN 1000
47#define HTML_PARSER_BIG_BUFFER_SIZE 1000
48#define HTML_PARSER_BUFFER_SIZE 100
49
50/* #define DEBUG */
51/* #define DEBUG_PUSH */
52
Daniel Veillard22090732001-07-16 00:06:07 +000053static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000054
Daniel Veillard56a4cb82001-03-24 17:00:36 +000055xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000057static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
59/************************************************************************
60 * *
Owen Taylor3473f882001-02-23 17:55:21 +000061 * Parser stacks related functions and macros *
62 * *
63 ************************************************************************/
64
Daniel Veillard1c732d22002-11-30 11:22:59 +000065/**
66 * htmlnamePush:
67 * @ctxt: an HTML parser context
68 * @value: the element name
69 *
70 * Pushes a new element name on top of the name stack
71 *
72 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +000073 */
Daniel Veillard1c732d22002-11-30 11:22:59 +000074static int
75htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
76{
77 if (ctxt->nameNr >= ctxt->nameMax) {
78 ctxt->nameMax *= 2;
79 ctxt->nameTab =
80 (xmlChar * *)xmlRealloc(ctxt->nameTab,
81 ctxt->nameMax *
82 sizeof(ctxt->nameTab[0]));
83 if (ctxt->nameTab == NULL) {
84 xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
85 return (0);
86 }
87 }
88 ctxt->nameTab[ctxt->nameNr] = value;
89 ctxt->name = value;
90 return (ctxt->nameNr++);
91}
92/**
93 * htmlnamePop:
94 * @ctxt: an HTML parser context
95 *
96 * Pops the top element name from the name stack
97 *
98 * Returns the name just removed
99 */
100static xmlChar *
101htmlnamePop(htmlParserCtxtPtr ctxt)
102{
103 xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000104
Daniel Veillard1c732d22002-11-30 11:22:59 +0000105 if (ctxt->nameNr <= 0)
106 return (0);
107 ctxt->nameNr--;
108 if (ctxt->nameNr < 0)
109 return (0);
110 if (ctxt->nameNr > 0)
111 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
112 else
113 ctxt->name = NULL;
114 ret = ctxt->nameTab[ctxt->nameNr];
115 ctxt->nameTab[ctxt->nameNr] = 0;
116 return (ret);
117}
Owen Taylor3473f882001-02-23 17:55:21 +0000118
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  All of them expect a variable named `ctxt' to be in
 * scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value.  This should be used
 *           internally by the parser only to compare to ASCII values,
 *           otherwise it would break when running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as
 *           CUR, it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context
 *
 *   CURRENT Returns the current byte as an int.
 *   NEXT    Skip to the next character via xmlNextChar().
 *   NEXTL(l) Advance by l bytes, maintaining the line/column counters.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* Token-aware alternative, kept for reference:
 * #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Advance the input by l bytes.  The line/column bookkeeping looks only at
 * the byte under the cursor before moving, and nbChars grows by one.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &(l))

/*
 * Append the character v (encoded on l bytes) to buffer b at index i.
 * NOTE(review): this expands to a bare if/else, so it must be used as a
 * complete statement followed by ';' and never as the body of an outer
 * if that has its own else.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
195
196/**
197 * htmlCurrentChar:
198 * @ctxt: the HTML parser context
199 * @len: pointer to the length of the char read
200 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000201 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000202 * bytes in the input buffer. Implement the end of line normalization:
203 * 2.11 End-of-Line Handling
204 * If the encoding is unspecified, in the case we find an ISO-Latin-1
205 * char, then the encoding converter is plugged in automatically.
206 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000207 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000208 */
209
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000210static int
Owen Taylor3473f882001-02-23 17:55:21 +0000211htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
212 if (ctxt->instate == XML_PARSER_EOF)
213 return(0);
214
215 if (ctxt->token != 0) {
216 *len = 0;
217 return(ctxt->token);
218 }
219 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
220 /*
221 * We are supposed to handle UTF8, check it's valid
222 * From rfc2044: encoding of the Unicode values on UTF-8:
223 *
224 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
225 * 0000 0000-0000 007F 0xxxxxxx
226 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
227 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
228 *
229 * Check for the 0x110000 limit too
230 */
231 const unsigned char *cur = ctxt->input->cur;
232 unsigned char c;
233 unsigned int val;
234
235 c = *cur;
236 if (c & 0x80) {
237 if (cur[1] == 0)
238 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
239 if ((cur[1] & 0xc0) != 0x80)
240 goto encoding_error;
241 if ((c & 0xe0) == 0xe0) {
242
243 if (cur[2] == 0)
244 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
245 if ((cur[2] & 0xc0) != 0x80)
246 goto encoding_error;
247 if ((c & 0xf0) == 0xf0) {
248 if (cur[3] == 0)
249 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
250 if (((c & 0xf8) != 0xf0) ||
251 ((cur[3] & 0xc0) != 0x80))
252 goto encoding_error;
253 /* 4-byte code */
254 *len = 4;
255 val = (cur[0] & 0x7) << 18;
256 val |= (cur[1] & 0x3f) << 12;
257 val |= (cur[2] & 0x3f) << 6;
258 val |= cur[3] & 0x3f;
259 } else {
260 /* 3-byte code */
261 *len = 3;
262 val = (cur[0] & 0xf) << 12;
263 val |= (cur[1] & 0x3f) << 6;
264 val |= cur[2] & 0x3f;
265 }
266 } else {
267 /* 2-byte code */
268 *len = 2;
269 val = (cur[0] & 0x1f) << 6;
270 val |= cur[1] & 0x3f;
271 }
272 if (!IS_CHAR(val)) {
273 ctxt->errNo = XML_ERR_INVALID_ENCODING;
274 if ((ctxt->sax != NULL) &&
275 (ctxt->sax->error != NULL))
276 ctxt->sax->error(ctxt->userData,
277 "Char 0x%X out of allowed range\n", val);
278 ctxt->wellFormed = 0;
Daniel Veillarddad3f682002-11-17 16:47:27 +0000279 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor3473f882001-02-23 17:55:21 +0000280 }
281 return(val);
282 } else {
283 /* 1-byte code */
284 *len = 1;
285 return((int) *ctxt->input->cur);
286 }
287 }
288 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000289 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000290 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000291 * XML constructs only use < 128 chars
292 */
293 *len = 1;
294 if ((int) *ctxt->input->cur < 0x80)
295 return((int) *ctxt->input->cur);
296
297 /*
298 * Humm this is bad, do an automatic flow conversion
299 */
300 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
301 ctxt->charset = XML_CHAR_ENCODING_UTF8;
302 return(xmlCurrentChar(ctxt, len));
303
304encoding_error:
305 /*
306 * If we detect an UTF8 error that probably mean that the
307 * input encoding didn't get properly advertized in the
308 * declaration header. Report the error and switch the encoding
309 * to ISO-Latin-1 (if you don't like this policy, just declare the
310 * encoding !)
311 */
312 ctxt->errNo = XML_ERR_INVALID_ENCODING;
313 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
314 ctxt->sax->error(ctxt->userData,
315 "Input is not proper UTF-8, indicate encoding !\n");
316 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
317 ctxt->input->cur[0], ctxt->input->cur[1],
318 ctxt->input->cur[2], ctxt->input->cur[3]);
319 }
320
321 ctxt->charset = XML_CHAR_ENCODING_8859_1;
322 *len = 1;
323 return((int) *ctxt->input->cur);
324}
325
326/**
Owen Taylor3473f882001-02-23 17:55:21 +0000327 * htmlSkipBlankChars:
328 * @ctxt: the HTML parser context
329 *
330 * skip all blanks character found at that point in the input streams.
331 *
332 * Returns the number of space chars skipped
333 */
334
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000335static int
Owen Taylor3473f882001-02-23 17:55:21 +0000336htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
337 int res = 0;
338
339 while (IS_BLANK(*(ctxt->input->cur))) {
340 if ((*ctxt->input->cur == 0) &&
341 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
342 xmlPopInput(ctxt);
343 } else {
344 if (*(ctxt->input->cur) == '\n') {
345 ctxt->input->line++; ctxt->input->col = 1;
346 } else ctxt->input->col++;
347 ctxt->input->cur++;
348 ctxt->nbChars++;
349 if (*ctxt->input->cur == 0)
350 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
351 }
352 res++;
353 }
354 return(res);
355}
356
357
358
359/************************************************************************
360 * *
361 * The list of HTML elements and their properties *
362 * *
363 ************************************************************************/
364
365/*
366 * Start Tag: 1 means the start tag can be ommited
367 * End Tag: 1 means the end tag can be ommited
368 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000369 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000370 * Depr: this element is deprecated
371 * DTD: 1 means that this element is valid only in the Loose DTD
372 * 2 means that this element is valid only in the Frameset DTD
373 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000374 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000375 */
Daniel Veillard22090732001-07-16 00:06:07 +0000376static const htmlElemDesc
377html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000378{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
379{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
380{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
381{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
382{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
383{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
384{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
385{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
386{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
387{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
388{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
389{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
390{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
391{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
392{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
393{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
394{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
395{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
396{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
397{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
398{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
399{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
400{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
401{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
402{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
403{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
404{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
405{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
406{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
407{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
408{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
409{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
410{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
411{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
412{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
413{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
414{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
415{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
416{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
417{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
418{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
419{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
420{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
421{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
422{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
423{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
424{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
425{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
426{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
427{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
428{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
429{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
430{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
431{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
432{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
433{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
434{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
435{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
436{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
437{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
438{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
439{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
440{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
Daniel Veillardfee408f2002-11-22 13:18:30 +0000441{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph " },
Daniel Veillard02bb1702001-06-13 21:11:59 +0000442{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
443{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
444{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
445{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
446{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
447{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
448{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
449{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
450{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
451{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
452{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
453{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
454{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
455{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
456{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
457{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
458{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
459{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
460{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
461{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
462{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
463{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
464{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
465{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
466{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
467{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
468{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000469};
470
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups.  The first string of
 * a group is the newly opened tag, the remaining strings are the open
 * elements that tag implicitly closes.  An extra NULL ends the table.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
530
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
544
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).  The array has no NULL
 * terminator; iterate using its element count.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
570
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end-tag priority) pair. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/* The NULL-named last row terminates the table and holds the default. */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000598
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000599static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000600static int htmlStartCloseIndexinitialized = 0;
601
602/************************************************************************
603 * *
604 * functions to handle HTML specific data *
605 * *
606 ************************************************************************/
607
608/**
609 * htmlInitAutoClose:
610 *
611 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
612 * This is not reentrant. Call xmlInitParser() once before processing in
613 * case of use in multithreaded programs.
614 */
615void
616htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000617 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000618
619 if (htmlStartCloseIndexinitialized) return;
620
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000621 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
622 indx = 0;
623 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
624 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000625 while (htmlStartClose[i] != NULL) i++;
626 i++;
627 }
628 htmlStartCloseIndexinitialized = 1;
629}
630
631/**
632 * htmlTagLookup:
633 * @tag: The tag name in lowercase
634 *
635 * Lookup the HTML tag in the ElementTable
636 *
637 * Returns the related htmlElemDescPtr or NULL if not found.
638 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000639const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000640htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000641 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000642
643 for (i = 0; i < (sizeof(html40ElementTable) /
644 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000645 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000646 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000647 }
648 return(NULL);
649}
650
651/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000652 * htmlGetEndPriority:
653 * @name: The name of the element to look up the priority for.
654 *
655 * Return value: The "endtag" priority.
656 **/
657static int
658htmlGetEndPriority (const xmlChar *name) {
659 int i = 0;
660
661 while ((htmlEndPriority[i].name != NULL) &&
662 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
663 i++;
664
665 return(htmlEndPriority[i].priority);
666}
667
668/**
Owen Taylor3473f882001-02-23 17:55:21 +0000669 * htmlCheckAutoClose:
670 * @newtag: The new tag name
671 * @oldtag: The old tag name
672 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000673 * Checks whether the new tag is one of the registered valid tags for
674 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000675 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
676 *
677 * Returns 0 if no, 1 if yes.
678 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000679static int
Owen Taylor3473f882001-02-23 17:55:21 +0000680htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000681 int i, indx;
682 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000683
684 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
685
686 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000687 for (indx = 0; indx < 100;indx++) {
688 closed = htmlStartCloseIndex[indx];
689 if (closed == NULL) return(0);
690 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000691 }
692
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000693 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000694 i++;
695 while (htmlStartClose[i] != NULL) {
696 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
697 return(1);
698 }
699 i++;
700 }
701 return(0);
702}
703
704/**
705 * htmlAutoCloseOnClose:
706 * @ctxt: an HTML parser context
707 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000708 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000709 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000710 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000711 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000712static void
Owen Taylor3473f882001-02-23 17:55:21 +0000713htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000714 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000715 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000716 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000717
718#ifdef DEBUG
719 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
720 for (i = 0;i < ctxt->nameNr;i++)
721 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
722#endif
723
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000724 priority = htmlGetEndPriority (newtag);
725
Owen Taylor3473f882001-02-23 17:55:21 +0000726 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000727
Owen Taylor3473f882001-02-23 17:55:21 +0000728 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000729 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000730 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000731 * or equal priority, so if we find an element with higher
732 * priority before we find an element with
733 * matching name, we just ignore this endtag
734 */
735 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000736 }
737 if (i < 0) return;
738
739 while (!xmlStrEqual(newtag, ctxt->name)) {
740 info = htmlTagLookup(ctxt->name);
741 if ((info == NULL) || (info->endTag == 1)) {
742#ifdef DEBUG
743 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
744#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000745 } else if (info->endTag == 3) {
746#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000747 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000748
Daniel Veillard56098d42001-04-24 12:51:09 +0000749#endif
750 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
751 ctxt->sax->error(ctxt->userData,
752 "Opening and ending tag mismatch: %s and %s\n",
753 newtag, ctxt->name);
754 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000755 }
756 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
757 ctxt->sax->endElement(ctxt->userData, ctxt->name);
758 oldname = htmlnamePop(ctxt);
759 if (oldname != NULL) {
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
762#endif
763 xmlFree(oldname);
764 }
765 }
766}
767
768/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000769 * htmlAutoCloseOnEnd:
770 * @ctxt: an HTML parser context
771 *
772 * Close all remaining tags at the end of the stream
773 */
774static void
775htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
776 xmlChar *oldname;
777 int i;
778
779 if (ctxt->nameNr == 0)
780 return;
781#ifdef DEBUG
782 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
783#endif
784
785 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
786#ifdef DEBUG
787 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
788#endif
789 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
790 ctxt->sax->endElement(ctxt->userData, ctxt->name);
791 oldname = htmlnamePop(ctxt);
792 if (oldname != NULL) {
793#ifdef DEBUG
794 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
795#endif
796 xmlFree(oldname);
797 }
798 }
799}
800
801/**
Owen Taylor3473f882001-02-23 17:55:21 +0000802 * htmlAutoClose:
803 * @ctxt: an HTML parser context
804 * @newtag: The new tag name or NULL
805 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000806 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000807 * The list is kept in htmlStartClose array. This function is
808 * called when a new tag has been detected and generates the
809 * appropriates closes if possible/needed.
810 * If newtag is NULL this mean we are at the end of the resource
811 * and we should check
812 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000813static void
Owen Taylor3473f882001-02-23 17:55:21 +0000814htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
815 xmlChar *oldname;
816 while ((newtag != NULL) && (ctxt->name != NULL) &&
817 (htmlCheckAutoClose(newtag, ctxt->name))) {
818#ifdef DEBUG
819 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
820#endif
821 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
822 ctxt->sax->endElement(ctxt->userData, ctxt->name);
823 oldname = htmlnamePop(ctxt);
824 if (oldname != NULL) {
825#ifdef DEBUG
826 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
827#endif
828 xmlFree(oldname);
829 }
830 }
831 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000832 htmlAutoCloseOnEnd(ctxt);
833 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000834 }
835 while ((newtag == NULL) && (ctxt->name != NULL) &&
836 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
837 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
838 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
839#ifdef DEBUG
840 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
841#endif
842 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
843 ctxt->sax->endElement(ctxt->userData, ctxt->name);
844 oldname = htmlnamePop(ctxt);
845 if (oldname != NULL) {
846#ifdef DEBUG
847 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
848#endif
849 xmlFree(oldname);
850 }
851 }
852
853}
854
855/**
856 * htmlAutoCloseTag:
857 * @doc: the HTML document
858 * @name: The tag name
859 * @elem: the HTML element
860 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000861 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000862 * The list is kept in htmlStartClose array. This function checks
863 * if the element or one of it's children would autoclose the
864 * given tag.
865 *
866 * Returns 1 if autoclose, 0 otherwise
867 */
868int
869htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
870 htmlNodePtr child;
871
872 if (elem == NULL) return(1);
873 if (xmlStrEqual(name, elem->name)) return(0);
874 if (htmlCheckAutoClose(elem->name, name)) return(1);
875 child = elem->children;
876 while (child != NULL) {
877 if (htmlAutoCloseTag(doc, name, child)) return(1);
878 child = child->next;
879 }
880 return(0);
881}
882
883/**
884 * htmlIsAutoClosed:
885 * @doc: the HTML document
886 * @elem: the HTML element
887 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000888 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000889 * The list is kept in htmlStartClose array. This function checks
890 * if a tag is autoclosed by one of it's child
891 *
892 * Returns 1 if autoclosed, 0 otherwise
893 */
894int
895htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
896 htmlNodePtr child;
897
898 if (elem == NULL) return(1);
899 child = elem->children;
900 while (child != NULL) {
901 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
902 child = child->next;
903 }
904 return(0);
905}
906
907/**
908 * htmlCheckImplied:
909 * @ctxt: an HTML parser context
910 * @newtag: The new tag name
911 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000912 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000913 * called when a new tag has been detected and generates the
914 * appropriates implicit tags if missing
915 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000916static void
Owen Taylor3473f882001-02-23 17:55:21 +0000917htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
918 if (!htmlOmittedDefaultValue)
919 return;
920 if (xmlStrEqual(newtag, BAD_CAST"html"))
921 return;
922 if (ctxt->nameNr <= 0) {
923#ifdef DEBUG
924 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
925#endif
926 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
927 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
928 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
929 }
930 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
931 return;
932 if ((ctxt->nameNr <= 1) &&
933 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
934 (xmlStrEqual(newtag, BAD_CAST"style")) ||
935 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
936 (xmlStrEqual(newtag, BAD_CAST"link")) ||
937 (xmlStrEqual(newtag, BAD_CAST"title")) ||
938 (xmlStrEqual(newtag, BAD_CAST"base")))) {
939 /*
940 * dropped OBJECT ... i you put it first BODY will be
941 * assumed !
942 */
943#ifdef DEBUG
944 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
945#endif
946 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
947 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
948 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
949 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
950 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
951 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
952 int i;
953 for (i = 0;i < ctxt->nameNr;i++) {
954 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
955 return;
956 }
957 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
958 return;
959 }
960 }
961
962#ifdef DEBUG
963 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
964#endif
965 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
966 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
967 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
968 }
969}
970
971/**
972 * htmlCheckParagraph
973 * @ctxt: an HTML parser context
974 *
975 * Check whether a p element need to be implied before inserting
976 * characters in the current element.
977 *
978 * Returns 1 if a paragraph has been inserted, 0 if not and -1
979 * in case of error.
980 */
981
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000982static int
Owen Taylor3473f882001-02-23 17:55:21 +0000983htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
984 const xmlChar *tag;
985 int i;
986
987 if (ctxt == NULL)
988 return(-1);
989 tag = ctxt->name;
990 if (tag == NULL) {
991 htmlAutoClose(ctxt, BAD_CAST"p");
992 htmlCheckImplied(ctxt, BAD_CAST"p");
993 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
994 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
995 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
996 return(1);
997 }
998 if (!htmlOmittedDefaultValue)
999 return(0);
1000 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1001 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1002#ifdef DEBUG
1003 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
1004#endif
1005 htmlAutoClose(ctxt, BAD_CAST"p");
1006 htmlCheckImplied(ctxt, BAD_CAST"p");
1007 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1008 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1009 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1010 return(1);
1011 }
1012 }
1013 return(0);
1014}
1015
1016/**
1017 * htmlIsScriptAttribute:
1018 * @name: an attribute name
1019 *
1020 * Check if an attribute is of content type Script
1021 *
1022 * Returns 1 is the attribute is a script 0 otherwise
1023 */
1024int
1025htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001026 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001027
1028 if (name == NULL)
1029 return(0);
1030 /*
1031 * all script attributes start with 'on'
1032 */
1033 if ((name[0] != 'o') || (name[1] != 'n'))
1034 return(0);
1035 for (i = 0;
1036 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1037 i++) {
1038 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1039 return(1);
1040 }
1041 return(0);
1042}
1043
1044/************************************************************************
1045 * *
1046 * The list of HTML predefined entities *
1047 * *
1048 ************************************************************************/
1049
1050
/*
 * Table of the predefined HTML entities. Each row holds the code point
 * value, the entity name (without the leading '&' and trailing ';') and
 * a human-readable description.
 *
 * The rows must stay sorted by ascending value: htmlEntityValueLookup()
 * stops scanning at the first row whose value is greater than the one
 * it is looking for.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range
 * Replacing them depend really on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entities references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1333
1334/************************************************************************
1335 * *
1336 * Commodity functions to handle entities *
1337 * *
1338 ************************************************************************/
1339
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The result of xmlRealloc() is kept in a temporary so that, when the
 * reallocation fails, the previous block is still reachable and can be
 * released before the enclosing function returns NULL (assigning the
 * result straight to @buffer would leak it).
 *
 * The macro expands to a brace block, not a do/while(0), and executes
 * return(NULL) on failure: it can only be used in functions returning
 * a pointer, with a local named <buffer>_size in scope.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
                               buffer##_size * sizeof(xmlChar));	\
    if (buffer##_tmp == NULL) {						\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1351
1352/**
1353 * htmlEntityLookup:
1354 * @name: the entity name
1355 *
1356 * Lookup the given entity in EntitiesTable
1357 *
1358 * TODO: the linear scan is really ugly, an hash table is really needed.
1359 *
1360 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1361 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001362const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001363htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001364 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001365
1366 for (i = 0;i < (sizeof(html40EntitiesTable)/
1367 sizeof(html40EntitiesTable[0]));i++) {
1368 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1369#ifdef DEBUG
1370 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1371#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001372 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001373 }
1374 }
1375 return(NULL);
1376}
1377
1378/**
1379 * htmlEntityValueLookup:
1380 * @value: the entity's unicode value
1381 *
1382 * Lookup the given entity in EntitiesTable
1383 *
1384 * TODO: the linear scan is really ugly, an hash table is really needed.
1385 *
1386 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1387 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001388const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001389htmlEntityValueLookup(unsigned int value) {
1390 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001391#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001392 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001393#endif
1394
1395 for (i = 0;i < (sizeof(html40EntitiesTable)/
1396 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001397 if (html40EntitiesTable[i].value >= value) {
1398 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001399 break;
1400#ifdef DEBUG
1401 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1402#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001403 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001404 }
1405#ifdef DEBUG
1406 if (lv > html40EntitiesTable[i].value) {
1407 xmlGenericError(xmlGenericErrorContext,
1408 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1409 lv, html40EntitiesTable[i].value);
1410 }
1411 lv = html40EntitiesTable[i].value;
1412#endif
1413 }
1414 return(NULL);
1415}
1416
1417/**
1418 * UTF8ToHtml:
1419 * @out: a pointer to an array of bytes to store the result
1420 * @outlen: the length of @out
1421 * @in: a pointer to an array of UTF-8 chars
1422 * @inlen: the length of @in
1423 *
1424 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1425 * plus HTML entities block of chars out.
1426 *
1427 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1428 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001429 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001430 * The value of @outlen after return is the number of octets consumed.
1431 */
1432int
1433UTF8ToHtml(unsigned char* out, int *outlen,
1434 const unsigned char* in, int *inlen) {
1435 const unsigned char* processed = in;
1436 const unsigned char* outend;
1437 const unsigned char* outstart = out;
1438 const unsigned char* instart = in;
1439 const unsigned char* inend;
1440 unsigned int c, d;
1441 int trailing;
1442
1443 if (in == NULL) {
1444 /*
1445 * initialization nothing to do
1446 */
1447 *outlen = 0;
1448 *inlen = 0;
1449 return(0);
1450 }
1451 inend = in + (*inlen);
1452 outend = out + (*outlen);
1453 while (in < inend) {
1454 d = *in++;
1455 if (d < 0x80) { c= d; trailing= 0; }
1456 else if (d < 0xC0) {
1457 /* trailing byte in leading position */
1458 *outlen = out - outstart;
1459 *inlen = processed - instart;
1460 return(-2);
1461 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1462 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1463 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1464 else {
1465 /* no chance for this in Ascii */
1466 *outlen = out - outstart;
1467 *inlen = processed - instart;
1468 return(-2);
1469 }
1470
1471 if (inend - in < trailing) {
1472 break;
1473 }
1474
1475 for ( ; trailing; trailing--) {
1476 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1477 break;
1478 c <<= 6;
1479 c |= d & 0x3F;
1480 }
1481
1482 /* assertion: c is a single UTF-4 value */
1483 if (c < 0x80) {
1484 if (out + 1 >= outend)
1485 break;
1486 *out++ = c;
1487 } else {
1488 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001489 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001490
1491 /*
1492 * Try to lookup a predefined HTML entity for it
1493 */
1494
1495 ent = htmlEntityValueLookup(c);
1496 if (ent == NULL) {
1497 /* no chance for this in Ascii */
1498 *outlen = out - outstart;
1499 *inlen = processed - instart;
1500 return(-2);
1501 }
1502 len = strlen(ent->name);
1503 if (out + 2 + len >= outend)
1504 break;
1505 *out++ = '&';
1506 memcpy(out, ent->name, len);
1507 out += len;
1508 *out++ = ';';
1509 }
1510 processed = in;
1511 }
1512 *outlen = out - outstart;
1513 *inlen = processed - instart;
1514 return(0);
1515}
1516
1517/**
1518 * htmlEncodeEntities:
1519 * @out: a pointer to an array of bytes to store the result
1520 * @outlen: the length of @out
1521 * @in: a pointer to an array of UTF-8 chars
1522 * @inlen: the length of @in
1523 * @quoteChar: the quote character to escape (' or ") or zero.
1524 *
1525 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1526 * plus HTML entities block of chars out.
1527 *
1528 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1529 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001530 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001531 * The value of @outlen after return is the number of octets consumed.
1532 */
1533int
1534htmlEncodeEntities(unsigned char* out, int *outlen,
1535 const unsigned char* in, int *inlen, int quoteChar) {
1536 const unsigned char* processed = in;
1537 const unsigned char* outend = out + (*outlen);
1538 const unsigned char* outstart = out;
1539 const unsigned char* instart = in;
1540 const unsigned char* inend = in + (*inlen);
1541 unsigned int c, d;
1542 int trailing;
1543
1544 while (in < inend) {
1545 d = *in++;
1546 if (d < 0x80) { c= d; trailing= 0; }
1547 else if (d < 0xC0) {
1548 /* trailing byte in leading position */
1549 *outlen = out - outstart;
1550 *inlen = processed - instart;
1551 return(-2);
1552 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1553 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1554 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1555 else {
1556 /* no chance for this in Ascii */
1557 *outlen = out - outstart;
1558 *inlen = processed - instart;
1559 return(-2);
1560 }
1561
1562 if (inend - in < trailing)
1563 break;
1564
1565 while (trailing--) {
1566 if (((d= *in++) & 0xC0) != 0x80) {
1567 *outlen = out - outstart;
1568 *inlen = processed - instart;
1569 return(-2);
1570 }
1571 c <<= 6;
1572 c |= d & 0x3F;
1573 }
1574
1575 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001576 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1577 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001578 if (out >= outend)
1579 break;
1580 *out++ = c;
1581 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001582 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001583 const char *cp;
1584 char nbuf[16];
1585 int len;
1586
1587 /*
1588 * Try to lookup a predefined HTML entity for it
1589 */
1590 ent = htmlEntityValueLookup(c);
1591 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001592 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001593 cp = nbuf;
1594 }
1595 else
1596 cp = ent->name;
1597 len = strlen(cp);
1598 if (out + 2 + len > outend)
1599 break;
1600 *out++ = '&';
1601 memcpy(out, cp, len);
1602 out += len;
1603 *out++ = ';';
1604 }
1605 processed = in;
1606 }
1607 *outlen = out - outstart;
1608 *inlen = processed - instart;
1609 return(0);
1610}
1611
1612/**
1613 * htmlDecodeEntities:
1614 * @ctxt: the parser context
1615 * @len: the len to decode (in bytes !), -1 for no size limit
1616 * @end: an end marker xmlChar, 0 if none
1617 * @end2: an end marker xmlChar, 0 if none
1618 * @end3: an end marker xmlChar, 0 if none
1619 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001620 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001621 *
1622 * DEPRECATED !!!!
1623 *
1624 * Returns A newly allocated string with the substitution done. The caller
1625 * must deallocate it !
1626 */
1627xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001628htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1629 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001630 static int deprecated = 0;
1631 if (!deprecated) {
1632 xmlGenericError(xmlGenericErrorContext,
1633 "htmlDecodeEntities() deprecated function reached\n");
1634 deprecated = 1;
1635 }
1636 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001637}
1638
1639/************************************************************************
1640 * *
1641 * Commodity functions to handle streams *
1642 * *
1643 ************************************************************************/
1644
1645/**
Owen Taylor3473f882001-02-23 17:55:21 +00001646 * htmlNewInputStream:
1647 * @ctxt: an HTML parser context
1648 *
1649 * Create a new input stream structure
1650 * Returns the new input stream or NULL
1651 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001652static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001653htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1654 htmlParserInputPtr input;
1655
1656 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1657 if (input == NULL) {
1658 ctxt->errNo = XML_ERR_NO_MEMORY;
1659 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1660 ctxt->sax->error(ctxt->userData,
1661 "malloc: couldn't allocate a new input stream\n");
1662 return(NULL);
1663 }
1664 memset(input, 0, sizeof(htmlParserInput));
1665 input->filename = NULL;
1666 input->directory = NULL;
1667 input->base = NULL;
1668 input->cur = NULL;
1669 input->buf = NULL;
1670 input->line = 1;
1671 input->col = 1;
1672 input->buf = NULL;
1673 input->free = NULL;
1674 input->version = NULL;
1675 input->consumed = 0;
1676 input->length = 0;
1677 return(input);
1678}
1679
1680
1681/************************************************************************
1682 * *
1683 * Commodity functions, cleanup needed ? *
1684 * *
1685 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001700
1701/**
1702 * areBlanks:
1703 * @ctxt: an HTML parser context
1704 * @str: a xmlChar *
1705 * @len: the size of @str
1706 *
1707 * Is this a sequence of blank chars that one can ignore ?
1708 *
1709 * Returns 1 if ignorable 0 otherwise.
1710 */
1711
1712static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001713 unsigned int i;
1714 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001715 xmlNodePtr lastChild;
1716
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001717 for (j = 0;j < len;j++)
1718 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001719
1720 if (CUR == 0) return(1);
1721 if (CUR != '<') return(0);
1722 if (ctxt->name == NULL)
1723 return(1);
1724 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1725 return(1);
1726 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1727 return(1);
1728 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1729 return(1);
1730 if (ctxt->node == NULL) return(0);
1731 lastChild = xmlGetLastChild(ctxt->node);
1732 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001733 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1734 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001735 /* keep ws in constructs like ...<b> </b>...
1736 for all tags "b" allowing PCDATA */
1737 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1738 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1739 return(0);
1740 }
1741 }
Owen Taylor3473f882001-02-23 17:55:21 +00001742 } else if (xmlNodeIsText(lastChild)) {
1743 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001744 } else {
1745 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1746 for all tags "p" allowing PCDATA */
1747 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1748 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1749 return(0);
1750 }
1751 }
Owen Taylor3473f882001-02-23 17:55:21 +00001752 }
1753 return(1);
1754}
1755
1756/**
Owen Taylor3473f882001-02-23 17:55:21 +00001757 * htmlNewDocNoDtD:
1758 * @URI: URI for the dtd, or NULL
1759 * @ExternalID: the external ID of the DTD, or NULL
1760 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001761 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1762 * are NULL
1763 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001764 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001765 */
1766htmlDocPtr
1767htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1768 xmlDocPtr cur;
1769
1770 /*
1771 * Allocate a new document and fill the fields.
1772 */
1773 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1774 if (cur == NULL) {
1775 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001776 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001777 return(NULL);
1778 }
1779 memset(cur, 0, sizeof(xmlDoc));
1780
1781 cur->type = XML_HTML_DOCUMENT_NODE;
1782 cur->version = NULL;
1783 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001784 cur->doc = cur;
1785 cur->name = NULL;
1786 cur->children = NULL;
1787 cur->extSubset = NULL;
1788 cur->oldNs = NULL;
1789 cur->encoding = NULL;
1790 cur->standalone = 1;
1791 cur->compression = 0;
1792 cur->ids = NULL;
1793 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001794 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001795 if ((ExternalID != NULL) ||
1796 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001797 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001798 return(cur);
1799}
1800
1801/**
1802 * htmlNewDoc:
1803 * @URI: URI for the dtd, or NULL
1804 * @ExternalID: the external ID of the DTD, or NULL
1805 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001806 * Creates a new HTML document
1807 *
Owen Taylor3473f882001-02-23 17:55:21 +00001808 * Returns a new document
1809 */
1810htmlDocPtr
1811htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1812 if ((URI == NULL) && (ExternalID == NULL))
1813 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001814 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1815 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001816
1817 return(htmlNewDocNoDtD(URI, ExternalID));
1818}
1819
1820
1821/************************************************************************
1822 * *
1823 * The parser itself *
1824 * Relates to http://www.w3.org/TR/html40 *
1825 * *
1826 ************************************************************************/
1827
1828/************************************************************************
1829 * *
1830 * The parser itself *
1831 * *
1832 ************************************************************************/
1833
1834/**
1835 * htmlParseHTMLName:
1836 * @ctxt: an HTML parser context
1837 *
1838 * parse an HTML tag or attribute name, note that we convert it to lowercase
1839 * since HTML names are not case-sensitive.
1840 *
1841 * Returns the Tag Name parsed or NULL
1842 */
1843
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001844static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001845htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1846 xmlChar *ret = NULL;
1847 int i = 0;
1848 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1849
1850 if (!IS_LETTER(CUR) && (CUR != '_') &&
1851 (CUR != ':')) return(NULL);
1852
1853 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1854 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1855 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1856 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1857 else loc[i] = CUR;
1858 i++;
1859
1860 NEXT;
1861 }
1862
1863 ret = xmlStrndup(loc, i);
1864
1865 return(ret);
1866}
1867
1868/**
1869 * htmlParseName:
1870 * @ctxt: an HTML parser context
1871 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001872 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001873 *
1874 * Returns the Name parsed or NULL
1875 */
1876
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001877static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001878htmlParseName(htmlParserCtxtPtr ctxt) {
1879 xmlChar buf[HTML_MAX_NAMELEN];
1880 int len = 0;
1881
1882 GROW;
1883 if (!IS_LETTER(CUR) && (CUR != '_')) {
1884 return(NULL);
1885 }
1886
1887 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1888 (CUR == '.') || (CUR == '-') ||
1889 (CUR == '_') || (CUR == ':') ||
1890 (IS_COMBINING(CUR)) ||
1891 (IS_EXTENDER(CUR))) {
1892 buf[len++] = CUR;
1893 NEXT;
1894 if (len >= HTML_MAX_NAMELEN) {
1895 xmlGenericError(xmlGenericErrorContext,
1896 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1897 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1898 (CUR == '.') || (CUR == '-') ||
1899 (CUR == '_') || (CUR == ':') ||
1900 (IS_COMBINING(CUR)) ||
1901 (IS_EXTENDER(CUR)))
1902 NEXT;
1903 break;
1904 }
1905 }
1906 return(xmlStrndup(buf, len));
1907}
1908
1909/**
1910 * htmlParseHTMLAttribute:
1911 * @ctxt: an HTML parser context
1912 * @stop: a char stop value
1913 *
1914 * parse an HTML attribute value till the stop (quote), if
1915 * stop is 0 then it stops at the first space
1916 *
1917 * Returns the attribute parsed or NULL
1918 */
1919
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001920static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001921htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1922 xmlChar *buffer = NULL;
1923 int buffer_size = 0;
1924 xmlChar *out = NULL;
1925 xmlChar *name = NULL;
1926
1927 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001928 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001929
1930 /*
1931 * allocate a translation buffer.
1932 */
1933 buffer_size = HTML_PARSER_BUFFER_SIZE;
1934 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1935 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00001936 xmlGenericError(xmlGenericErrorContext,
1937 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001938 return(NULL);
1939 }
1940 out = buffer;
1941
1942 /*
1943 * Ok loop until we reach one of the ending chars
1944 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00001945 while ((CUR != 0) && (CUR != stop)) {
1946 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001947 if ((stop == 0) && (IS_BLANK(CUR))) break;
1948 if (CUR == '&') {
1949 if (NXT(1) == '#') {
1950 unsigned int c;
1951 int bits;
1952
1953 c = htmlParseCharRef(ctxt);
1954 if (c < 0x80)
1955 { *out++ = c; bits= -6; }
1956 else if (c < 0x800)
1957 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1958 else if (c < 0x10000)
1959 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1960 else
1961 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1962
1963 for ( ; bits >= 0; bits-= 6) {
1964 *out++ = ((c >> bits) & 0x3F) | 0x80;
1965 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00001966
1967 if (out - buffer > buffer_size - 100) {
1968 int indx = out - buffer;
1969
1970 growBuffer(buffer);
1971 out = &buffer[indx];
1972 }
Owen Taylor3473f882001-02-23 17:55:21 +00001973 } else {
1974 ent = htmlParseEntityRef(ctxt, &name);
1975 if (name == NULL) {
1976 *out++ = '&';
1977 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001978 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001979
1980 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001981 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001982 }
1983 } else if (ent == NULL) {
1984 *out++ = '&';
1985 cur = name;
1986 while (*cur != 0) {
1987 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001988 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001989
1990 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001991 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001992 }
1993 *out++ = *cur++;
1994 }
1995 xmlFree(name);
1996 } else {
1997 unsigned int c;
1998 int bits;
1999
2000 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002001 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002002
2003 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002004 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002005 }
2006 c = (xmlChar)ent->value;
2007 if (c < 0x80)
2008 { *out++ = c; bits= -6; }
2009 else if (c < 0x800)
2010 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2011 else if (c < 0x10000)
2012 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2013 else
2014 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2015
2016 for ( ; bits >= 0; bits-= 6) {
2017 *out++ = ((c >> bits) & 0x3F) | 0x80;
2018 }
2019 xmlFree(name);
2020 }
2021 }
2022 } else {
2023 unsigned int c;
2024 int bits, l;
2025
2026 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002027 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002028
2029 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002030 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002031 }
2032 c = CUR_CHAR(l);
2033 if (c < 0x80)
2034 { *out++ = c; bits= -6; }
2035 else if (c < 0x800)
2036 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2037 else if (c < 0x10000)
2038 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2039 else
2040 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2041
2042 for ( ; bits >= 0; bits-= 6) {
2043 *out++ = ((c >> bits) & 0x3F) | 0x80;
2044 }
2045 NEXT;
2046 }
2047 }
2048 *out++ = 0;
2049 return(buffer);
2050}
2051
2052/**
Owen Taylor3473f882001-02-23 17:55:21 +00002053 * htmlParseEntityRef:
2054 * @ctxt: an HTML parser context
2055 * @str: location to store the entity name
2056 *
2057 * parse an HTML ENTITY references
2058 *
2059 * [68] EntityRef ::= '&' Name ';'
2060 *
2061 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2062 * if non-NULL *str will have to be freed by the caller.
2063 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002064const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002065htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2066 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002067 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002068 *str = NULL;
2069
2070 if (CUR == '&') {
2071 NEXT;
2072 name = htmlParseName(ctxt);
2073 if (name == NULL) {
2074 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2075 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2076 ctxt->wellFormed = 0;
2077 } else {
2078 GROW;
2079 if (CUR == ';') {
2080 *str = name;
2081
2082 /*
2083 * Lookup the entity in the table.
2084 */
2085 ent = htmlEntityLookup(name);
2086 if (ent != NULL) /* OK that's ugly !!! */
2087 NEXT;
2088 } else {
2089 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2090 ctxt->sax->error(ctxt->userData,
2091 "htmlParseEntityRef: expecting ';'\n");
2092 *str = name;
2093 }
2094 }
2095 }
2096 return(ent);
2097}
2098
2099/**
2100 * htmlParseAttValue:
2101 * @ctxt: an HTML parser context
2102 *
2103 * parse a value for an attribute
2104 * Note: the parser won't do substitution of entities here, this
2105 * will be handled later in xmlStringGetNodeList, unless it was
2106 * asked for ctxt->replaceEntities != 0
2107 *
2108 * Returns the AttValue parsed or NULL.
2109 */
2110
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002111static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002112htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2113 xmlChar *ret = NULL;
2114
2115 if (CUR == '"') {
2116 NEXT;
2117 ret = htmlParseHTMLAttribute(ctxt, '"');
2118 if (CUR != '"') {
2119 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2120 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2121 ctxt->wellFormed = 0;
2122 } else
2123 NEXT;
2124 } else if (CUR == '\'') {
2125 NEXT;
2126 ret = htmlParseHTMLAttribute(ctxt, '\'');
2127 if (CUR != '\'') {
2128 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2129 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2130 ctxt->wellFormed = 0;
2131 } else
2132 NEXT;
2133 } else {
2134 /*
2135 * That's an HTMLism, the attribute value may not be quoted
2136 */
2137 ret = htmlParseHTMLAttribute(ctxt, 0);
2138 if (ret == NULL) {
2139 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2140 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2141 ctxt->wellFormed = 0;
2142 }
2143 }
2144 return(ret);
2145}
2146
2147/**
2148 * htmlParseSystemLiteral:
2149 * @ctxt: an HTML parser context
2150 *
2151 * parse an HTML Literal
2152 *
2153 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2154 *
2155 * Returns the SystemLiteral parsed or NULL
2156 */
2157
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002158static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002159htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2160 const xmlChar *q;
2161 xmlChar *ret = NULL;
2162
2163 if (CUR == '"') {
2164 NEXT;
2165 q = CUR_PTR;
2166 while ((IS_CHAR(CUR)) && (CUR != '"'))
2167 NEXT;
2168 if (!IS_CHAR(CUR)) {
2169 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2170 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2171 ctxt->wellFormed = 0;
2172 } else {
2173 ret = xmlStrndup(q, CUR_PTR - q);
2174 NEXT;
2175 }
2176 } else if (CUR == '\'') {
2177 NEXT;
2178 q = CUR_PTR;
2179 while ((IS_CHAR(CUR)) && (CUR != '\''))
2180 NEXT;
2181 if (!IS_CHAR(CUR)) {
2182 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2183 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2184 ctxt->wellFormed = 0;
2185 } else {
2186 ret = xmlStrndup(q, CUR_PTR - q);
2187 NEXT;
2188 }
2189 } else {
2190 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2191 ctxt->sax->error(ctxt->userData,
2192 "SystemLiteral \" or ' expected\n");
2193 ctxt->wellFormed = 0;
2194 }
2195
2196 return(ret);
2197}
2198
2199/**
2200 * htmlParsePubidLiteral:
2201 * @ctxt: an HTML parser context
2202 *
2203 * parse an HTML public literal
2204 *
2205 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2206 *
2207 * Returns the PubidLiteral parsed or NULL.
2208 */
2209
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002210static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002211htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2212 const xmlChar *q;
2213 xmlChar *ret = NULL;
2214 /*
2215 * Name ::= (Letter | '_') (NameChar)*
2216 */
2217 if (CUR == '"') {
2218 NEXT;
2219 q = CUR_PTR;
2220 while (IS_PUBIDCHAR(CUR)) NEXT;
2221 if (CUR != '"') {
2222 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2223 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2224 ctxt->wellFormed = 0;
2225 } else {
2226 ret = xmlStrndup(q, CUR_PTR - q);
2227 NEXT;
2228 }
2229 } else if (CUR == '\'') {
2230 NEXT;
2231 q = CUR_PTR;
2232 while ((IS_LETTER(CUR)) && (CUR != '\''))
2233 NEXT;
2234 if (!IS_LETTER(CUR)) {
2235 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2236 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2237 ctxt->wellFormed = 0;
2238 } else {
2239 ret = xmlStrndup(q, CUR_PTR - q);
2240 NEXT;
2241 }
2242 } else {
2243 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2244 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2245 ctxt->wellFormed = 0;
2246 }
2247
2248 return(ret);
2249}
2250
2251/**
2252 * htmlParseScript:
2253 * @ctxt: an HTML parser context
2254 *
2255 * parse the content of an HTML SCRIPT or STYLE element
2256 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2257 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2258 * http://www.w3.org/TR/html4/types.html#type-script
2259 * http://www.w3.org/TR/html4/types.html#h-6.15
2260 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2261 *
2262 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2263 * element and the value of intrinsic event attributes. User agents must
2264 * not evaluate script data as HTML markup but instead must pass it on as
2265 * data to a script engine.
2266 * NOTES:
2267 * - The content is passed like CDATA
2268 * - the attributes for style and scripting "onXXX" are also described
2269 * as CDATA but SGML allows entities references in attributes so their
2270 * processing is identical as other attributes
2271 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002272static void
Owen Taylor3473f882001-02-23 17:55:21 +00002273htmlParseScript(htmlParserCtxtPtr ctxt) {
2274 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2275 int nbchar = 0;
2276 xmlChar cur;
2277
2278 SHRINK;
2279 cur = CUR;
2280 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002281 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2282 (NXT(3) == '-')) {
2283 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2284 if (ctxt->sax->cdataBlock!= NULL) {
2285 /*
2286 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2287 */
2288 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2289 }
2290 }
2291 nbchar = 0;
2292 htmlParseComment(ctxt);
2293 cur = CUR;
2294 continue;
2295 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002296 /*
2297 * One should break here, the specification is clear:
2298 * Authors should therefore escape "</" within the content.
2299 * Escape mechanisms are specific to each scripting or
2300 * style sheet language.
2301 */
2302 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2303 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2304 break; /* while */
2305 }
2306 buf[nbchar++] = cur;
2307 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2308 if (ctxt->sax->cdataBlock!= NULL) {
2309 /*
2310 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2311 */
2312 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2313 }
2314 nbchar = 0;
2315 }
2316 NEXT;
2317 cur = CUR;
2318 }
2319 if (!(IS_CHAR(cur))) {
2320 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2321 ctxt->sax->error(ctxt->userData,
2322 "Invalid char in CDATA 0x%X\n", cur);
2323 ctxt->wellFormed = 0;
2324 NEXT;
2325 }
2326
2327 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2328 if (ctxt->sax->cdataBlock!= NULL) {
2329 /*
2330 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2331 */
2332 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2333 }
2334 }
2335}
2336
2337
2338/**
2339 * htmlParseCharData:
2340 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002341 *
2342 * parse a CharData section.
2343 * if we are within a CDATA section ']]>' marks an end of section.
2344 *
2345 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2346 */
2347
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002348static void
2349htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002350 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2351 int nbchar = 0;
2352 int cur, l;
2353
2354 SHRINK;
2355 cur = CUR_CHAR(l);
2356 while (((cur != '<') || (ctxt->token == '<')) &&
2357 ((cur != '&') || (ctxt->token == '&')) &&
2358 (IS_CHAR(cur))) {
2359 COPY_BUF(l,buf,nbchar,cur);
2360 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2361 /*
2362 * Ok the segment is to be consumed as chars.
2363 */
2364 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2365 if (areBlanks(ctxt, buf, nbchar)) {
2366 if (ctxt->sax->ignorableWhitespace != NULL)
2367 ctxt->sax->ignorableWhitespace(ctxt->userData,
2368 buf, nbchar);
2369 } else {
2370 htmlCheckParagraph(ctxt);
2371 if (ctxt->sax->characters != NULL)
2372 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2373 }
2374 }
2375 nbchar = 0;
2376 }
2377 NEXTL(l);
2378 cur = CUR_CHAR(l);
2379 }
2380 if (nbchar != 0) {
2381 /*
2382 * Ok the segment is to be consumed as chars.
2383 */
2384 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2385 if (areBlanks(ctxt, buf, nbchar)) {
2386 if (ctxt->sax->ignorableWhitespace != NULL)
2387 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2388 } else {
2389 htmlCheckParagraph(ctxt);
2390 if (ctxt->sax->characters != NULL)
2391 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2392 }
2393 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002394 } else {
2395 /*
2396 * Loop detection
2397 */
2398 if (cur == 0)
2399 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002400 }
2401}
2402
2403/**
2404 * htmlParseExternalID:
2405 * @ctxt: an HTML parser context
2406 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002407 *
2408 * Parse an External ID or a Public ID
2409 *
Owen Taylor3473f882001-02-23 17:55:21 +00002410 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2411 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2412 *
2413 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2414 *
2415 * Returns the function returns SystemLiteral and in the second
2416 * case publicID receives PubidLiteral, is strict is off
2417 * it is possible to return NULL and have publicID set.
2418 */
2419
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002420static xmlChar *
2421htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002422 xmlChar *URI = NULL;
2423
2424 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2425 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2426 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2427 SKIP(6);
2428 if (!IS_BLANK(CUR)) {
2429 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2430 ctxt->sax->error(ctxt->userData,
2431 "Space required after 'SYSTEM'\n");
2432 ctxt->wellFormed = 0;
2433 }
2434 SKIP_BLANKS;
2435 URI = htmlParseSystemLiteral(ctxt);
2436 if (URI == NULL) {
2437 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2438 ctxt->sax->error(ctxt->userData,
2439 "htmlParseExternalID: SYSTEM, no URI\n");
2440 ctxt->wellFormed = 0;
2441 }
2442 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2443 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2444 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2445 SKIP(6);
2446 if (!IS_BLANK(CUR)) {
2447 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2448 ctxt->sax->error(ctxt->userData,
2449 "Space required after 'PUBLIC'\n");
2450 ctxt->wellFormed = 0;
2451 }
2452 SKIP_BLANKS;
2453 *publicID = htmlParsePubidLiteral(ctxt);
2454 if (*publicID == NULL) {
2455 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2456 ctxt->sax->error(ctxt->userData,
2457 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2458 ctxt->wellFormed = 0;
2459 }
2460 SKIP_BLANKS;
2461 if ((CUR == '"') || (CUR == '\'')) {
2462 URI = htmlParseSystemLiteral(ctxt);
2463 }
2464 }
2465 return(URI);
2466}
2467
2468/**
2469 * htmlParseComment:
2470 * @ctxt: an HTML parser context
2471 *
2472 * Parse an XML (SGML) comment <!-- .... -->
2473 *
2474 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2475 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002476static void
Owen Taylor3473f882001-02-23 17:55:21 +00002477htmlParseComment(htmlParserCtxtPtr ctxt) {
2478 xmlChar *buf = NULL;
2479 int len;
2480 int size = HTML_PARSER_BUFFER_SIZE;
2481 int q, ql;
2482 int r, rl;
2483 int cur, l;
2484 xmlParserInputState state;
2485
2486 /*
2487 * Check that there is a comment right here.
2488 */
2489 if ((RAW != '<') || (NXT(1) != '!') ||
2490 (NXT(2) != '-') || (NXT(3) != '-')) return;
2491
2492 state = ctxt->instate;
2493 ctxt->instate = XML_PARSER_COMMENT;
2494 SHRINK;
2495 SKIP(4);
2496 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2497 if (buf == NULL) {
2498 xmlGenericError(xmlGenericErrorContext,
2499 "malloc of %d byte failed\n", size);
2500 ctxt->instate = state;
2501 return;
2502 }
2503 q = CUR_CHAR(ql);
2504 NEXTL(ql);
2505 r = CUR_CHAR(rl);
2506 NEXTL(rl);
2507 cur = CUR_CHAR(l);
2508 len = 0;
2509 while (IS_CHAR(cur) &&
2510 ((cur != '>') ||
2511 (r != '-') || (q != '-'))) {
2512 if (len + 5 >= size) {
2513 size *= 2;
2514 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2515 if (buf == NULL) {
2516 xmlGenericError(xmlGenericErrorContext,
2517 "realloc of %d byte failed\n", size);
2518 ctxt->instate = state;
2519 return;
2520 }
2521 }
2522 COPY_BUF(ql,buf,len,q);
2523 q = r;
2524 ql = rl;
2525 r = cur;
2526 rl = l;
2527 NEXTL(l);
2528 cur = CUR_CHAR(l);
2529 if (cur == 0) {
2530 SHRINK;
2531 GROW;
2532 cur = CUR_CHAR(l);
2533 }
2534 }
2535 buf[len] = 0;
2536 if (!IS_CHAR(cur)) {
2537 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2538 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2539 ctxt->sax->error(ctxt->userData,
2540 "Comment not terminated \n<!--%.50s\n", buf);
2541 ctxt->wellFormed = 0;
2542 xmlFree(buf);
2543 } else {
2544 NEXT;
2545 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2546 (!ctxt->disableSAX))
2547 ctxt->sax->comment(ctxt->userData, buf);
2548 xmlFree(buf);
2549 }
2550 ctxt->instate = state;
2551}
2552
2553/**
2554 * htmlParseCharRef:
2555 * @ctxt: an HTML parser context
2556 *
2557 * parse Reference declarations
2558 *
2559 * [66] CharRef ::= '&#' [0-9]+ ';' |
2560 * '&#x' [0-9a-fA-F]+ ';'
2561 *
2562 * Returns the value parsed (as an int)
2563 */
2564int
2565htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2566 int val = 0;
2567
2568 if ((CUR == '&') && (NXT(1) == '#') &&
2569 (NXT(2) == 'x')) {
2570 SKIP(3);
2571 while (CUR != ';') {
2572 if ((CUR >= '0') && (CUR <= '9'))
2573 val = val * 16 + (CUR - '0');
2574 else if ((CUR >= 'a') && (CUR <= 'f'))
2575 val = val * 16 + (CUR - 'a') + 10;
2576 else if ((CUR >= 'A') && (CUR <= 'F'))
2577 val = val * 16 + (CUR - 'A') + 10;
2578 else {
2579 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2580 ctxt->sax->error(ctxt->userData,
2581 "htmlParseCharRef: invalid hexadecimal value\n");
2582 ctxt->wellFormed = 0;
2583 return(0);
2584 }
2585 NEXT;
2586 }
2587 if (CUR == ';')
2588 NEXT;
2589 } else if ((CUR == '&') && (NXT(1) == '#')) {
2590 SKIP(2);
2591 while (CUR != ';') {
2592 if ((CUR >= '0') && (CUR <= '9'))
2593 val = val * 10 + (CUR - '0');
2594 else {
2595 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2596 ctxt->sax->error(ctxt->userData,
2597 "htmlParseCharRef: invalid decimal value\n");
2598 ctxt->wellFormed = 0;
2599 return(0);
2600 }
2601 NEXT;
2602 }
2603 if (CUR == ';')
2604 NEXT;
2605 } else {
2606 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2607 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2608 ctxt->wellFormed = 0;
2609 }
2610 /*
2611 * Check the value IS_CHAR ...
2612 */
2613 if (IS_CHAR(val)) {
2614 return(val);
2615 } else {
2616 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2617 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2618 val);
2619 ctxt->wellFormed = 0;
2620 }
2621 return(0);
2622}
2623
2624
2625/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002626 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002627 * @ctxt: an HTML parser context
2628 *
2629 * parse a DOCTYPE declaration
2630 *
2631 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2632 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2633 */
2634
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002635static void
Owen Taylor3473f882001-02-23 17:55:21 +00002636htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2637 xmlChar *name;
2638 xmlChar *ExternalID = NULL;
2639 xmlChar *URI = NULL;
2640
2641 /*
2642 * We know that '<!DOCTYPE' has been detected.
2643 */
2644 SKIP(9);
2645
2646 SKIP_BLANKS;
2647
2648 /*
2649 * Parse the DOCTYPE name.
2650 */
2651 name = htmlParseName(ctxt);
2652 if (name == NULL) {
2653 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2654 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2655 ctxt->wellFormed = 0;
2656 }
2657 /*
2658 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2659 */
2660
2661 SKIP_BLANKS;
2662
2663 /*
2664 * Check for SystemID and ExternalID
2665 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002666 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002667 SKIP_BLANKS;
2668
2669 /*
2670 * We should be at the end of the DOCTYPE declaration.
2671 */
2672 if (CUR != '>') {
2673 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002674 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002675 ctxt->wellFormed = 0;
2676 /* We shouldn't try to resynchronize ... */
2677 }
2678 NEXT;
2679
2680 /*
2681 * Create or update the document accordingly to the DOCTYPE
2682 */
2683 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2684 (!ctxt->disableSAX))
2685 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2686
2687 /*
2688 * Cleanup, since we don't use all those identifiers
2689 */
2690 if (URI != NULL) xmlFree(URI);
2691 if (ExternalID != NULL) xmlFree(ExternalID);
2692 if (name != NULL) xmlFree(name);
2693}
2694
2695/**
2696 * htmlParseAttribute:
2697 * @ctxt: an HTML parser context
2698 * @value: a xmlChar ** used to store the value of the attribute
2699 *
2700 * parse an attribute
2701 *
2702 * [41] Attribute ::= Name Eq AttValue
2703 *
2704 * [25] Eq ::= S? '=' S?
2705 *
2706 * With namespace:
2707 *
2708 * [NS 11] Attribute ::= QName Eq AttValue
2709 *
2710 * Also the case QName == xmlns:??? is handled independently as a namespace
2711 * definition.
2712 *
2713 * Returns the attribute name, and the value in *value.
2714 */
2715
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002716static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002717htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2718 xmlChar *name, *val = NULL;
2719
2720 *value = NULL;
2721 name = htmlParseHTMLName(ctxt);
2722 if (name == NULL) {
2723 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2724 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2725 ctxt->wellFormed = 0;
2726 return(NULL);
2727 }
2728
2729 /*
2730 * read the value
2731 */
2732 SKIP_BLANKS;
2733 if (CUR == '=') {
2734 NEXT;
2735 SKIP_BLANKS;
2736 val = htmlParseAttValue(ctxt);
2737 /******
2738 } else {
2739 * TODO : some attribute must have values, some may not
2740 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2741 ctxt->sax->warning(ctxt->userData,
2742 "No value for attribute %s\n", name); */
2743 }
2744
2745 *value = val;
2746 return(name);
2747}
2748
2749/**
2750 * htmlCheckEncoding:
2751 * @ctxt: an HTML parser context
2752 * @attvalue: the attribute value
2753 *
2754 * Checks an http-equiv attribute from a Meta tag to detect
2755 * the encoding
2756 * If a new encoding is detected the parser is switched to decode
2757 * it and pass UTF8
2758 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002759static void
Owen Taylor3473f882001-02-23 17:55:21 +00002760htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2761 const xmlChar *encoding;
2762
2763 if ((ctxt == NULL) || (attvalue == NULL))
2764 return;
2765
2766 /* do not change encoding */
2767 if (ctxt->input->encoding != NULL)
2768 return;
2769
2770 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2771 if (encoding != NULL) {
2772 encoding += 8;
2773 } else {
2774 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2775 if (encoding != NULL)
2776 encoding += 9;
2777 }
2778 if (encoding != NULL) {
2779 xmlCharEncoding enc;
2780 xmlCharEncodingHandlerPtr handler;
2781
2782 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2783
2784 if (ctxt->input->encoding != NULL)
2785 xmlFree((xmlChar *) ctxt->input->encoding);
2786 ctxt->input->encoding = xmlStrdup(encoding);
2787
2788 enc = xmlParseCharEncoding((const char *) encoding);
2789 /*
2790 * registered set of known encodings
2791 */
2792 if (enc != XML_CHAR_ENCODING_ERROR) {
2793 xmlSwitchEncoding(ctxt, enc);
2794 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2795 } else {
2796 /*
2797 * fallback for unknown encodings
2798 */
2799 handler = xmlFindCharEncodingHandler((const char *) encoding);
2800 if (handler != NULL) {
2801 xmlSwitchToEncoding(ctxt, handler);
2802 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2803 } else {
2804 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2805 }
2806 }
2807
2808 if ((ctxt->input->buf != NULL) &&
2809 (ctxt->input->buf->encoder != NULL) &&
2810 (ctxt->input->buf->raw != NULL) &&
2811 (ctxt->input->buf->buffer != NULL)) {
2812 int nbchars;
2813 int processed;
2814
2815 /*
2816 * convert as much as possible to the parser reading buffer.
2817 */
2818 processed = ctxt->input->cur - ctxt->input->base;
2819 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2820 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2821 ctxt->input->buf->buffer,
2822 ctxt->input->buf->raw);
2823 if (nbchars < 0) {
2824 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2825 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2826 ctxt->sax->error(ctxt->userData,
2827 "htmlCheckEncoding: encoder error\n");
2828 }
2829 ctxt->input->base =
2830 ctxt->input->cur = ctxt->input->buf->buffer->content;
2831 }
2832 }
2833}
2834
2835/**
2836 * htmlCheckMeta:
2837 * @ctxt: an HTML parser context
2838 * @atts: the attributes values
2839 *
2840 * Checks an attributes from a Meta tag
2841 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002842static void
Owen Taylor3473f882001-02-23 17:55:21 +00002843htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2844 int i;
2845 const xmlChar *att, *value;
2846 int http = 0;
2847 const xmlChar *content = NULL;
2848
2849 if ((ctxt == NULL) || (atts == NULL))
2850 return;
2851
2852 i = 0;
2853 att = atts[i++];
2854 while (att != NULL) {
2855 value = atts[i++];
2856 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2857 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2858 http = 1;
2859 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2860 content = value;
2861 att = atts[i++];
2862 }
2863 if ((http) && (content != NULL))
2864 htmlCheckEncoding(ctxt, content);
2865
2866}
2867
2868/**
2869 * htmlParseStartTag:
2870 * @ctxt: an HTML parser context
2871 *
2872 * parse a start of tag either for rule element or
2873 * EmptyElement. In both case we don't parse the tag closing chars.
2874 *
2875 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2876 *
2877 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2878 *
2879 * With namespace:
2880 *
2881 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2882 *
2883 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2884 *
2885 */
2886
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002887static void
Owen Taylor3473f882001-02-23 17:55:21 +00002888htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2889 xmlChar *name;
2890 xmlChar *attname;
2891 xmlChar *attvalue;
2892 const xmlChar **atts = NULL;
2893 int nbatts = 0;
2894 int maxatts = 0;
2895 int meta = 0;
2896 int i;
2897
2898 if (CUR != '<') return;
2899 NEXT;
2900
2901 GROW;
2902 name = htmlParseHTMLName(ctxt);
2903 if (name == NULL) {
2904 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2905 ctxt->sax->error(ctxt->userData,
2906 "htmlParseStartTag: invalid element name\n");
2907 ctxt->wellFormed = 0;
2908 /* Dump the bogus tag like browsers do */
2909 while ((IS_CHAR(CUR)) && (CUR != '>'))
2910 NEXT;
2911 return;
2912 }
2913 if (xmlStrEqual(name, BAD_CAST"meta"))
2914 meta = 1;
2915
2916 /*
2917 * Check for auto-closure of HTML elements.
2918 */
2919 htmlAutoClose(ctxt, name);
2920
2921 /*
2922 * Check for implied HTML elements.
2923 */
2924 htmlCheckImplied(ctxt, name);
2925
2926 /*
2927 * Avoid html at any level > 0, head at any level != 1
2928 * or any attempt to recurse body
2929 */
2930 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2931 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2932 ctxt->sax->error(ctxt->userData,
2933 "htmlParseStartTag: misplaced <html> tag\n");
2934 ctxt->wellFormed = 0;
2935 xmlFree(name);
2936 return;
2937 }
2938 if ((ctxt->nameNr != 1) &&
2939 (xmlStrEqual(name, BAD_CAST"head"))) {
2940 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2941 ctxt->sax->error(ctxt->userData,
2942 "htmlParseStartTag: misplaced <head> tag\n");
2943 ctxt->wellFormed = 0;
2944 xmlFree(name);
2945 return;
2946 }
2947 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002948 int indx;
2949 for (indx = 0;indx < ctxt->nameNr;indx++) {
2950 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002951 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2952 ctxt->sax->error(ctxt->userData,
2953 "htmlParseStartTag: misplaced <body> tag\n");
2954 ctxt->wellFormed = 0;
2955 xmlFree(name);
2956 return;
2957 }
2958 }
2959 }
2960
2961 /*
2962 * Now parse the attributes, it ends up with the ending
2963 *
2964 * (S Attribute)* S?
2965 */
2966 SKIP_BLANKS;
2967 while ((IS_CHAR(CUR)) &&
2968 (CUR != '>') &&
2969 ((CUR != '/') || (NXT(1) != '>'))) {
2970 long cons = ctxt->nbChars;
2971
2972 GROW;
2973 attname = htmlParseAttribute(ctxt, &attvalue);
2974 if (attname != NULL) {
2975
2976 /*
2977 * Well formedness requires at most one declaration of an attribute
2978 */
2979 for (i = 0; i < nbatts;i += 2) {
2980 if (xmlStrEqual(atts[i], attname)) {
2981 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2982 ctxt->sax->error(ctxt->userData,
2983 "Attribute %s redefined\n",
2984 attname);
2985 ctxt->wellFormed = 0;
2986 xmlFree(attname);
2987 if (attvalue != NULL)
2988 xmlFree(attvalue);
2989 goto failed;
2990 }
2991 }
2992
2993 /*
2994 * Add the pair to atts
2995 */
2996 if (atts == NULL) {
2997 maxatts = 10;
2998 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2999 if (atts == NULL) {
3000 xmlGenericError(xmlGenericErrorContext,
3001 "malloc of %ld byte failed\n",
3002 maxatts * (long)sizeof(xmlChar *));
3003 if (name != NULL) xmlFree(name);
3004 return;
3005 }
3006 } else if (nbatts + 4 > maxatts) {
3007 maxatts *= 2;
3008 atts = (const xmlChar **) xmlRealloc((void *) atts,
3009 maxatts * sizeof(xmlChar *));
3010 if (atts == NULL) {
3011 xmlGenericError(xmlGenericErrorContext,
3012 "realloc of %ld byte failed\n",
3013 maxatts * (long)sizeof(xmlChar *));
3014 if (name != NULL) xmlFree(name);
3015 return;
3016 }
3017 }
3018 atts[nbatts++] = attname;
3019 atts[nbatts++] = attvalue;
3020 atts[nbatts] = NULL;
3021 atts[nbatts + 1] = NULL;
3022 }
3023 else {
3024 /* Dump the bogus attribute string up to the next blank or
3025 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003026 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3027 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003028 NEXT;
3029 }
3030
3031failed:
3032 SKIP_BLANKS;
3033 if (cons == ctxt->nbChars) {
3034 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3035 ctxt->sax->error(ctxt->userData,
3036 "htmlParseStartTag: problem parsing attributes\n");
3037 ctxt->wellFormed = 0;
3038 break;
3039 }
3040 }
3041
3042 /*
3043 * Handle specific association to the META tag
3044 */
3045 if (meta)
3046 htmlCheckMeta(ctxt, atts);
3047
3048 /*
3049 * SAX: Start of Element !
3050 */
3051 htmlnamePush(ctxt, xmlStrdup(name));
3052#ifdef DEBUG
3053 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3054#endif
3055 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3056 ctxt->sax->startElement(ctxt->userData, name, atts);
3057
3058 if (atts != NULL) {
3059 for (i = 0;i < nbatts;i++) {
3060 if (atts[i] != NULL)
3061 xmlFree((xmlChar *) atts[i]);
3062 }
3063 xmlFree((void *) atts);
3064 }
3065 if (name != NULL) xmlFree(name);
3066}
3067
3068/**
3069 * htmlParseEndTag:
3070 * @ctxt: an HTML parser context
3071 *
3072 * parse an end of tag
3073 *
3074 * [42] ETag ::= '</' Name S? '>'
3075 *
3076 * With namespace
3077 *
3078 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003079 *
3080 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003081 */
3082
Daniel Veillardf420ac52001-07-04 16:04:09 +00003083static int
Owen Taylor3473f882001-02-23 17:55:21 +00003084htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3085 xmlChar *name;
3086 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003087 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003088
3089 if ((CUR != '<') || (NXT(1) != '/')) {
3090 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3091 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3092 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003093 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003094 }
3095 SKIP(2);
3096
3097 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003098 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003099
3100 /*
3101 * We should definitely be at the ending "S? '>'" part
3102 */
3103 SKIP_BLANKS;
3104 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3105 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3106 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3107 ctxt->wellFormed = 0;
3108 } else
3109 NEXT;
3110
3111 /*
3112 * If the name read is not one of the element in the parsing stack
3113 * then return, it's just an error.
3114 */
3115 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3116 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3117 }
3118 if (i < 0) {
3119 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3120 ctxt->sax->error(ctxt->userData,
3121 "Unexpected end tag : %s\n", name);
3122 xmlFree(name);
3123 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003124 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003125 }
3126
3127
3128 /*
3129 * Check for auto-closure of HTML elements.
3130 */
3131
3132 htmlAutoCloseOnClose(ctxt, name);
3133
3134 /*
3135 * Well formedness constraints, opening and closing must match.
3136 * With the exception that the autoclose may have popped stuff out
3137 * of the stack.
3138 */
3139 if (!xmlStrEqual(name, ctxt->name)) {
3140#ifdef DEBUG
3141 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3142#endif
3143 if ((ctxt->name != NULL) &&
3144 (!xmlStrEqual(ctxt->name, name))) {
3145 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3146 ctxt->sax->error(ctxt->userData,
3147 "Opening and ending tag mismatch: %s and %s\n",
3148 name, ctxt->name);
3149 ctxt->wellFormed = 0;
3150 }
3151 }
3152
3153 /*
3154 * SAX: End of Tag
3155 */
3156 oldname = ctxt->name;
3157 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3158 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3159 ctxt->sax->endElement(ctxt->userData, name);
3160 oldname = htmlnamePop(ctxt);
3161 if (oldname != NULL) {
3162#ifdef DEBUG
3163 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3164#endif
3165 xmlFree(oldname);
3166#ifdef DEBUG
3167 } else {
3168 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3169#endif
3170 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003171 ret = 1;
3172 } else {
3173 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003174 }
3175
3176 if (name != NULL)
3177 xmlFree(name);
3178
Daniel Veillardf420ac52001-07-04 16:04:09 +00003179 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003180}
3181
3182
3183/**
3184 * htmlParseReference:
3185 * @ctxt: an HTML parser context
3186 *
3187 * parse and handle entity references in content,
3188 * this will end-up in a call to character() since this is either a
3189 * CharRef, or a predefined entity.
3190 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003191static void
Owen Taylor3473f882001-02-23 17:55:21 +00003192htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003193 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003194 xmlChar out[6];
3195 xmlChar *name;
3196 if (CUR != '&') return;
3197
3198 if (NXT(1) == '#') {
3199 unsigned int c;
3200 int bits, i = 0;
3201
3202 c = htmlParseCharRef(ctxt);
3203 if (c == 0)
3204 return;
3205
3206 if (c < 0x80) { out[i++]= c; bits= -6; }
3207 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3208 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3209 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3210
3211 for ( ; bits >= 0; bits-= 6) {
3212 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3213 }
3214 out[i] = 0;
3215
3216 htmlCheckParagraph(ctxt);
3217 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3218 ctxt->sax->characters(ctxt->userData, out, i);
3219 } else {
3220 ent = htmlParseEntityRef(ctxt, &name);
3221 if (name == NULL) {
3222 htmlCheckParagraph(ctxt);
3223 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3224 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3225 return;
3226 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003227 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003228 htmlCheckParagraph(ctxt);
3229 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3230 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3231 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3232 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3233 }
3234 } else {
3235 unsigned int c;
3236 int bits, i = 0;
3237
3238 c = ent->value;
3239 if (c < 0x80)
3240 { out[i++]= c; bits= -6; }
3241 else if (c < 0x800)
3242 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3243 else if (c < 0x10000)
3244 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3245 else
3246 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3247
3248 for ( ; bits >= 0; bits-= 6) {
3249 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3250 }
3251 out[i] = 0;
3252
3253 htmlCheckParagraph(ctxt);
3254 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3255 ctxt->sax->characters(ctxt->userData, out, i);
3256 }
3257 xmlFree(name);
3258 }
3259}
3260
3261/**
3262 * htmlParseContent:
3263 * @ctxt: an HTML parser context
3264 * @name: the node name
3265 *
3266 * Parse a content: comment, sub-element, reference or text.
3267 *
3268 */
3269
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003270static void
Owen Taylor3473f882001-02-23 17:55:21 +00003271htmlParseContent(htmlParserCtxtPtr ctxt) {
3272 xmlChar *currentNode;
3273 int depth;
3274
3275 currentNode = xmlStrdup(ctxt->name);
3276 depth = ctxt->nameNr;
3277 while (1) {
3278 long cons = ctxt->nbChars;
3279
3280 GROW;
3281 /*
3282 * Our tag or one of it's parent or children is ending.
3283 */
3284 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003285 if (htmlParseEndTag(ctxt) &&
3286 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3287 if (currentNode != NULL)
3288 xmlFree(currentNode);
3289 return;
3290 }
3291 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003292 }
3293
3294 /*
3295 * Has this node been popped out during parsing of
3296 * the next element
3297 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003298 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3299 (!xmlStrEqual(currentNode, ctxt->name)))
3300 {
Owen Taylor3473f882001-02-23 17:55:21 +00003301 if (currentNode != NULL) xmlFree(currentNode);
3302 return;
3303 }
3304
Daniel Veillardf9533d12001-03-03 10:04:57 +00003305 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3306 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003307 /*
3308 * Handle SCRIPT/STYLE separately
3309 */
3310 htmlParseScript(ctxt);
3311 } else {
3312 /*
3313 * Sometimes DOCTYPE arrives in the middle of the document
3314 */
3315 if ((CUR == '<') && (NXT(1) == '!') &&
3316 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3317 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3318 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3319 (UPP(8) == 'E')) {
3320 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3321 ctxt->sax->error(ctxt->userData,
3322 "Misplaced DOCTYPE declaration\n");
3323 ctxt->wellFormed = 0;
3324 htmlParseDocTypeDecl(ctxt);
3325 }
3326
3327 /*
3328 * First case : a comment
3329 */
3330 if ((CUR == '<') && (NXT(1) == '!') &&
3331 (NXT(2) == '-') && (NXT(3) == '-')) {
3332 htmlParseComment(ctxt);
3333 }
3334
3335 /*
3336 * Second case : a sub-element.
3337 */
3338 else if (CUR == '<') {
3339 htmlParseElement(ctxt);
3340 }
3341
3342 /*
3343 * Third case : a reference. If if has not been resolved,
3344 * parsing returns it's Name, create the node
3345 */
3346 else if (CUR == '&') {
3347 htmlParseReference(ctxt);
3348 }
3349
3350 /*
3351 * Fourth : end of the resource
3352 */
3353 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003354 htmlAutoCloseOnEnd(ctxt);
3355 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003356 }
3357
3358 /*
3359 * Last case, text. Note that References are handled directly.
3360 */
3361 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003362 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003363 }
3364
3365 if (cons == ctxt->nbChars) {
3366 if (ctxt->node != NULL) {
3367 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3368 ctxt->sax->error(ctxt->userData,
3369 "detected an error in element content\n");
3370 ctxt->wellFormed = 0;
3371 }
3372 break;
3373 }
3374 }
3375 GROW;
3376 }
3377 if (currentNode != NULL) xmlFree(currentNode);
3378}
3379
3380/**
3381 * htmlParseElement:
3382 * @ctxt: an HTML parser context
3383 *
3384 * parse an HTML element, this is highly recursive
3385 *
3386 * [39] element ::= EmptyElemTag | STag content ETag
3387 *
3388 * [41] Attribute ::= Name Eq AttValue
3389 */
3390
3391void
3392htmlParseElement(htmlParserCtxtPtr ctxt) {
3393 xmlChar *name;
3394 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003395 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003396 htmlParserNodeInfo node_info;
3397 xmlChar *oldname;
3398 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003399 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003400
3401 /* Capture start position */
3402 if (ctxt->record_info) {
3403 node_info.begin_pos = ctxt->input->consumed +
3404 (CUR_PTR - ctxt->input->base);
3405 node_info.begin_line = ctxt->input->line;
3406 }
3407
3408 oldname = xmlStrdup(ctxt->name);
3409 htmlParseStartTag(ctxt);
3410 name = ctxt->name;
3411#ifdef DEBUG
3412 if (oldname == NULL)
3413 xmlGenericError(xmlGenericErrorContext,
3414 "Start of element %s\n", name);
3415 else if (name == NULL)
3416 xmlGenericError(xmlGenericErrorContext,
3417 "Start of element failed, was %s\n", oldname);
3418 else
3419 xmlGenericError(xmlGenericErrorContext,
3420 "Start of element %s, was %s\n", name, oldname);
3421#endif
3422 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3423 (name == NULL)) {
3424 if (CUR == '>')
3425 NEXT;
3426 if (oldname != NULL)
3427 xmlFree(oldname);
3428 return;
3429 }
3430 if (oldname != NULL)
3431 xmlFree(oldname);
3432
3433 /*
3434 * Lookup the info for that element.
3435 */
3436 info = htmlTagLookup(name);
3437 if (info == NULL) {
3438 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3439 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3440 name);
3441 ctxt->wellFormed = 0;
3442 } else if (info->depr) {
3443/***************************
3444 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3445 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3446 name);
3447 ***************************/
3448 }
3449
3450 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003451 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003452 */
3453 if ((CUR == '/') && (NXT(1) == '>')) {
3454 SKIP(2);
3455 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3456 ctxt->sax->endElement(ctxt->userData, name);
3457 oldname = htmlnamePop(ctxt);
3458#ifdef DEBUG
3459 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3460#endif
3461 if (oldname != NULL)
3462 xmlFree(oldname);
3463 return;
3464 }
3465
3466 if (CUR == '>') {
3467 NEXT;
3468 } else {
3469 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3470 ctxt->sax->error(ctxt->userData,
3471 "Couldn't find end of Start Tag %s\n",
3472 name);
3473 ctxt->wellFormed = 0;
3474
3475 /*
3476 * end of parsing of this node.
3477 */
3478 if (xmlStrEqual(name, ctxt->name)) {
3479 nodePop(ctxt);
3480 oldname = htmlnamePop(ctxt);
3481#ifdef DEBUG
3482 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3483#endif
3484 if (oldname != NULL)
3485 xmlFree(oldname);
3486 }
3487
3488 /*
3489 * Capture end position and add node
3490 */
3491 if ( currentNode != NULL && ctxt->record_info ) {
3492 node_info.end_pos = ctxt->input->consumed +
3493 (CUR_PTR - ctxt->input->base);
3494 node_info.end_line = ctxt->input->line;
3495 node_info.node = ctxt->node;
3496 xmlParserAddNodeInfo(ctxt, &node_info);
3497 }
3498 return;
3499 }
3500
3501 /*
3502 * Check for an Empty Element from DTD definition
3503 */
3504 if ((info != NULL) && (info->empty)) {
3505 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3506 ctxt->sax->endElement(ctxt->userData, name);
3507 oldname = htmlnamePop(ctxt);
3508#ifdef DEBUG
3509 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3510#endif
3511 if (oldname != NULL)
3512 xmlFree(oldname);
3513 return;
3514 }
3515
3516 /*
3517 * Parse the content of the element:
3518 */
3519 currentNode = xmlStrdup(ctxt->name);
3520 depth = ctxt->nameNr;
3521 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003522 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003523 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003524 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003525 if (ctxt->nameNr < depth) break;
3526 }
3527
Owen Taylor3473f882001-02-23 17:55:21 +00003528 /*
3529 * Capture end position and add node
3530 */
3531 if ( currentNode != NULL && ctxt->record_info ) {
3532 node_info.end_pos = ctxt->input->consumed +
3533 (CUR_PTR - ctxt->input->base);
3534 node_info.end_line = ctxt->input->line;
3535 node_info.node = ctxt->node;
3536 xmlParserAddNodeInfo(ctxt, &node_info);
3537 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003538 if (!IS_CHAR(CUR)) {
3539 htmlAutoCloseOnEnd(ctxt);
3540 }
3541
Owen Taylor3473f882001-02-23 17:55:21 +00003542 if (currentNode != NULL)
3543 xmlFree(currentNode);
3544}
3545
3546/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003547 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003548 * @ctxt: an HTML parser context
3549 *
3550 * parse an HTML document (and build a tree if using the standard SAX
3551 * interface).
3552 *
3553 * Returns 0, -1 in case of error. the parser context is augmented
3554 * as a result of the parsing.
3555 */
3556
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003557int
Owen Taylor3473f882001-02-23 17:55:21 +00003558htmlParseDocument(htmlParserCtxtPtr ctxt) {
3559 xmlDtdPtr dtd;
3560
Daniel Veillardd0463562001-10-13 09:15:48 +00003561 xmlInitParser();
3562
Owen Taylor3473f882001-02-23 17:55:21 +00003563 htmlDefaultSAXHandlerInit();
3564 ctxt->html = 1;
3565
3566 GROW;
3567 /*
3568 * SAX: beginning of the document processing.
3569 */
3570 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3571 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3572
3573 /*
3574 * Wipe out everything which is before the first '<'
3575 */
3576 SKIP_BLANKS;
3577 if (CUR == 0) {
3578 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3579 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3580 ctxt->wellFormed = 0;
3581 }
3582
3583 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3584 ctxt->sax->startDocument(ctxt->userData);
3585
3586
3587 /*
3588 * Parse possible comments before any content
3589 */
3590 while ((CUR == '<') && (NXT(1) == '!') &&
3591 (NXT(2) == '-') && (NXT(3) == '-')) {
3592 htmlParseComment(ctxt);
3593 SKIP_BLANKS;
3594 }
3595
3596
3597 /*
3598 * Then possibly doc type declaration(s) and more Misc
3599 * (doctypedecl Misc*)?
3600 */
3601 if ((CUR == '<') && (NXT(1) == '!') &&
3602 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3603 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3604 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3605 (UPP(8) == 'E')) {
3606 htmlParseDocTypeDecl(ctxt);
3607 }
3608 SKIP_BLANKS;
3609
3610 /*
3611 * Parse possible comments before any content
3612 */
3613 while ((CUR == '<') && (NXT(1) == '!') &&
3614 (NXT(2) == '-') && (NXT(3) == '-')) {
3615 htmlParseComment(ctxt);
3616 SKIP_BLANKS;
3617 }
3618
3619 /*
3620 * Time to start parsing the tree itself
3621 */
3622 htmlParseContent(ctxt);
3623
3624 /*
3625 * autoclose
3626 */
3627 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003628 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003629
3630
3631 /*
3632 * SAX: end of the document processing.
3633 */
3634 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3635 ctxt->sax->endDocument(ctxt->userData);
3636
3637 if (ctxt->myDoc != NULL) {
3638 dtd = xmlGetIntSubset(ctxt->myDoc);
3639 if (dtd == NULL)
3640 ctxt->myDoc->intSubset =
3641 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3642 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3643 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3644 }
3645 if (! ctxt->wellFormed) return(-1);
3646 return(0);
3647}
3648
3649
3650/************************************************************************
3651 * *
3652 * Parser contexts handling *
3653 * *
3654 ************************************************************************/
3655
3656/**
3657 * xmlInitParserCtxt:
3658 * @ctxt: an HTML parser context
3659 *
3660 * Initialize a parser context
3661 */
3662
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003663static void
Owen Taylor3473f882001-02-23 17:55:21 +00003664htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3665{
3666 htmlSAXHandler *sax;
3667
3668 if (ctxt == NULL) return;
3669 memset(ctxt, 0, sizeof(htmlParserCtxt));
3670
3671 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3672 if (sax == NULL) {
3673 xmlGenericError(xmlGenericErrorContext,
3674 "htmlInitParserCtxt: out of memory\n");
3675 }
3676 else
3677 memset(sax, 0, sizeof(htmlSAXHandler));
3678
3679 /* Allocate the Input stack */
3680 ctxt->inputTab = (htmlParserInputPtr *)
3681 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3682 if (ctxt->inputTab == NULL) {
3683 xmlGenericError(xmlGenericErrorContext,
3684 "htmlInitParserCtxt: out of memory\n");
3685 ctxt->inputNr = 0;
3686 ctxt->inputMax = 0;
3687 ctxt->input = NULL;
3688 return;
3689 }
3690 ctxt->inputNr = 0;
3691 ctxt->inputMax = 5;
3692 ctxt->input = NULL;
3693 ctxt->version = NULL;
3694 ctxt->encoding = NULL;
3695 ctxt->standalone = -1;
3696 ctxt->instate = XML_PARSER_START;
3697
3698 /* Allocate the Node stack */
3699 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3700 if (ctxt->nodeTab == NULL) {
3701 xmlGenericError(xmlGenericErrorContext,
3702 "htmlInitParserCtxt: out of memory\n");
3703 ctxt->nodeNr = 0;
3704 ctxt->nodeMax = 0;
3705 ctxt->node = NULL;
3706 ctxt->inputNr = 0;
3707 ctxt->inputMax = 0;
3708 ctxt->input = NULL;
3709 return;
3710 }
3711 ctxt->nodeNr = 0;
3712 ctxt->nodeMax = 10;
3713 ctxt->node = NULL;
3714
3715 /* Allocate the Name stack */
3716 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3717 if (ctxt->nameTab == NULL) {
3718 xmlGenericError(xmlGenericErrorContext,
3719 "htmlInitParserCtxt: out of memory\n");
3720 ctxt->nameNr = 0;
3721 ctxt->nameMax = 10;
3722 ctxt->name = NULL;
3723 ctxt->nodeNr = 0;
3724 ctxt->nodeMax = 0;
3725 ctxt->node = NULL;
3726 ctxt->inputNr = 0;
3727 ctxt->inputMax = 0;
3728 ctxt->input = NULL;
3729 return;
3730 }
3731 ctxt->nameNr = 0;
3732 ctxt->nameMax = 10;
3733 ctxt->name = NULL;
3734
3735 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3736 else {
3737 ctxt->sax = sax;
3738 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3739 }
3740 ctxt->userData = ctxt;
3741 ctxt->myDoc = NULL;
3742 ctxt->wellFormed = 1;
3743 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003744 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003745 ctxt->html = 1;
3746 ctxt->record_info = 0;
3747 ctxt->validate = 0;
3748 ctxt->nbChars = 0;
3749 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003750 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003751 xmlInitNodeInfoSeq(&ctxt->node_seq);
3752}
3753
3754/**
3755 * htmlFreeParserCtxt:
3756 * @ctxt: an HTML parser context
3757 *
3758 * Free all the memory used by a parser context. However the parsed
3759 * document in ctxt->myDoc is not freed.
3760 */
3761
3762void
3763htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3764{
3765 xmlFreeParserCtxt(ctxt);
3766}
3767
3768/**
Daniel Veillard1d995272002-07-22 16:43:32 +00003769 * htmlNewParserCtxt:
3770 *
3771 * Allocate and initialize a new parser context.
3772 *
3773 * Returns the xmlParserCtxtPtr or NULL
3774 */
3775
3776static htmlParserCtxtPtr
3777htmlNewParserCtxt(void)
3778{
3779 xmlParserCtxtPtr ctxt;
3780
3781 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3782 if (ctxt == NULL) {
3783 xmlGenericError(xmlGenericErrorContext,
3784 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00003785 return(NULL);
3786 }
3787 memset(ctxt, 0, sizeof(xmlParserCtxt));
3788 htmlInitParserCtxt(ctxt);
3789 return(ctxt);
3790}
3791
3792/**
3793 * htmlCreateMemoryParserCtxt:
3794 * @buffer: a pointer to a char array
3795 * @size: the size of the array
3796 *
3797 * Create a parser context for an HTML in-memory document.
3798 *
3799 * Returns the new parser context or NULL
3800 */
3801static htmlParserCtxtPtr
3802htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3803 xmlParserCtxtPtr ctxt;
3804 xmlParserInputPtr input;
3805 xmlParserInputBufferPtr buf;
3806
3807 if (buffer == NULL)
3808 return(NULL);
3809 if (size <= 0)
3810 return(NULL);
3811
3812 ctxt = htmlNewParserCtxt();
3813 if (ctxt == NULL)
3814 return(NULL);
3815
3816 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3817 if (buf == NULL) return(NULL);
3818
3819 input = xmlNewInputStream(ctxt);
3820 if (input == NULL) {
3821 xmlFreeParserCtxt(ctxt);
3822 return(NULL);
3823 }
3824
3825 input->filename = NULL;
3826 input->buf = buf;
3827 input->base = input->buf->buffer->content;
3828 input->cur = input->buf->buffer->content;
3829 input->end = &input->buf->buffer->content[input->buf->buffer->use];
3830
3831 inputPush(ctxt, input);
3832 return(ctxt);
3833}
3834
3835/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003836 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00003837 * @cur: a pointer to an array of xmlChar
3838 * @encoding: a free form C string describing the HTML document encoding, or NULL
3839 *
3840 * Create a parser context for an HTML document.
3841 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003842 * TODO: check the need to add encoding handling there
3843 *
Owen Taylor3473f882001-02-23 17:55:21 +00003844 * Returns the new parser context or NULL
3845 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003846static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003847htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00003848 int len;
Owen Taylor3473f882001-02-23 17:55:21 +00003849
Daniel Veillard1d995272002-07-22 16:43:32 +00003850 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00003851 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00003852 len = xmlStrlen(cur);
3853 return(htmlCreateMemoryParserCtxt((char *)cur, len));
Owen Taylor3473f882001-02-23 17:55:21 +00003854}
3855
3856/************************************************************************
3857 * *
3858 * Progressive parsing interfaces *
3859 * *
3860 ************************************************************************/
3861
3862/**
3863 * htmlParseLookupSequence:
3864 * @ctxt: an HTML parser context
3865 * @first: the first char to lookup
3866 * @next: the next char to lookup or zero
3867 * @third: the next char to lookup or zero
3868 *
3869 * Try to find if a sequence (first, next, third) or just (first next) or
3870 * (first) is available in the input stream.
3871 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3872 * to avoid rescanning sequences of bytes, it DOES change the state of the
3873 * parser, do not use liberally.
3874 * This is basically similar to xmlParseLookupSequence()
3875 *
3876 * Returns the index to the current parsing point if the full sequence
3877 * is available, -1 otherwise.
3878 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003879static int
Owen Taylor3473f882001-02-23 17:55:21 +00003880htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3881 xmlChar next, xmlChar third) {
3882 int base, len;
3883 htmlParserInputPtr in;
3884 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003885 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003886
3887 in = ctxt->input;
3888 if (in == NULL) return(-1);
3889 base = in->cur - in->base;
3890 if (base < 0) return(-1);
3891 if (ctxt->checkIndex > base)
3892 base = ctxt->checkIndex;
3893 if (in->buf == NULL) {
3894 buf = in->base;
3895 len = in->length;
3896 } else {
3897 buf = in->buf->buffer->content;
3898 len = in->buf->buffer->use;
3899 }
3900 /* take into account the sequence length */
3901 if (third) len -= 2;
3902 else if (next) len --;
3903 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003904 if (!incomment && (base + 4 < len)) {
3905 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3906 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3907 incomment = 1;
3908 }
3909 /* do not increment base, some people use <!--> */
3910 }
3911 if (incomment) {
3912 if (base + 3 < len)
3913 return(-1);
3914 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3915 (buf[base + 2] == '>')) {
3916 incomment = 0;
3917 base += 2;
3918 }
3919 continue;
3920 }
Owen Taylor3473f882001-02-23 17:55:21 +00003921 if (buf[base] == first) {
3922 if (third != 0) {
3923 if ((buf[base + 1] != next) ||
3924 (buf[base + 2] != third)) continue;
3925 } else if (next != 0) {
3926 if (buf[base + 1] != next) continue;
3927 }
3928 ctxt->checkIndex = 0;
3929#ifdef DEBUG_PUSH
3930 if (next == 0)
3931 xmlGenericError(xmlGenericErrorContext,
3932 "HPP: lookup '%c' found at %d\n",
3933 first, base);
3934 else if (third == 0)
3935 xmlGenericError(xmlGenericErrorContext,
3936 "HPP: lookup '%c%c' found at %d\n",
3937 first, next, base);
3938 else
3939 xmlGenericError(xmlGenericErrorContext,
3940 "HPP: lookup '%c%c%c' found at %d\n",
3941 first, next, third, base);
3942#endif
3943 return(base - (in->cur - in->base));
3944 }
3945 }
3946 ctxt->checkIndex = base;
3947#ifdef DEBUG_PUSH
3948 if (next == 0)
3949 xmlGenericError(xmlGenericErrorContext,
3950 "HPP: lookup '%c' failed\n", first);
3951 else if (third == 0)
3952 xmlGenericError(xmlGenericErrorContext,
3953 "HPP: lookup '%c%c' failed\n", first, next);
3954 else
3955 xmlGenericError(xmlGenericErrorContext,
3956 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3957#endif
3958 return(-1);
3959}
3960
3961/**
3962 * htmlParseTryOrFinish:
3963 * @ctxt: an HTML parser context
3964 * @terminate: last chunk indicator
3965 *
3966 * Try to progress on parsing
3967 *
3968 * Returns zero if no parsing was possible
3969 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003970static int
Owen Taylor3473f882001-02-23 17:55:21 +00003971htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3972 int ret = 0;
3973 htmlParserInputPtr in;
3974 int avail = 0;
3975 xmlChar cur, next;
3976
3977#ifdef DEBUG_PUSH
3978 switch (ctxt->instate) {
3979 case XML_PARSER_EOF:
3980 xmlGenericError(xmlGenericErrorContext,
3981 "HPP: try EOF\n"); break;
3982 case XML_PARSER_START:
3983 xmlGenericError(xmlGenericErrorContext,
3984 "HPP: try START\n"); break;
3985 case XML_PARSER_MISC:
3986 xmlGenericError(xmlGenericErrorContext,
3987 "HPP: try MISC\n");break;
3988 case XML_PARSER_COMMENT:
3989 xmlGenericError(xmlGenericErrorContext,
3990 "HPP: try COMMENT\n");break;
3991 case XML_PARSER_PROLOG:
3992 xmlGenericError(xmlGenericErrorContext,
3993 "HPP: try PROLOG\n");break;
3994 case XML_PARSER_START_TAG:
3995 xmlGenericError(xmlGenericErrorContext,
3996 "HPP: try START_TAG\n");break;
3997 case XML_PARSER_CONTENT:
3998 xmlGenericError(xmlGenericErrorContext,
3999 "HPP: try CONTENT\n");break;
4000 case XML_PARSER_CDATA_SECTION:
4001 xmlGenericError(xmlGenericErrorContext,
4002 "HPP: try CDATA_SECTION\n");break;
4003 case XML_PARSER_END_TAG:
4004 xmlGenericError(xmlGenericErrorContext,
4005 "HPP: try END_TAG\n");break;
4006 case XML_PARSER_ENTITY_DECL:
4007 xmlGenericError(xmlGenericErrorContext,
4008 "HPP: try ENTITY_DECL\n");break;
4009 case XML_PARSER_ENTITY_VALUE:
4010 xmlGenericError(xmlGenericErrorContext,
4011 "HPP: try ENTITY_VALUE\n");break;
4012 case XML_PARSER_ATTRIBUTE_VALUE:
4013 xmlGenericError(xmlGenericErrorContext,
4014 "HPP: try ATTRIBUTE_VALUE\n");break;
4015 case XML_PARSER_DTD:
4016 xmlGenericError(xmlGenericErrorContext,
4017 "HPP: try DTD\n");break;
4018 case XML_PARSER_EPILOG:
4019 xmlGenericError(xmlGenericErrorContext,
4020 "HPP: try EPILOG\n");break;
4021 case XML_PARSER_PI:
4022 xmlGenericError(xmlGenericErrorContext,
4023 "HPP: try PI\n");break;
4024 case XML_PARSER_SYSTEM_LITERAL:
4025 xmlGenericError(xmlGenericErrorContext,
4026 "HPP: try SYSTEM_LITERAL\n");break;
4027 }
4028#endif
4029
4030 while (1) {
4031
4032 in = ctxt->input;
4033 if (in == NULL) break;
4034 if (in->buf == NULL)
4035 avail = in->length - (in->cur - in->base);
4036 else
4037 avail = in->buf->buffer->use - (in->cur - in->base);
4038 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004039 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004040 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4041 /*
4042 * SAX: end of the document processing.
4043 */
4044 ctxt->instate = XML_PARSER_EOF;
4045 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4046 ctxt->sax->endDocument(ctxt->userData);
4047 }
4048 }
4049 if (avail < 1)
4050 goto done;
4051 switch (ctxt->instate) {
4052 case XML_PARSER_EOF:
4053 /*
4054 * Document parsing is done !
4055 */
4056 goto done;
4057 case XML_PARSER_START:
4058 /*
4059 * Very first chars read from the document flow.
4060 */
4061 cur = in->cur[0];
4062 if (IS_BLANK(cur)) {
4063 SKIP_BLANKS;
4064 if (in->buf == NULL)
4065 avail = in->length - (in->cur - in->base);
4066 else
4067 avail = in->buf->buffer->use - (in->cur - in->base);
4068 }
4069 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4070 ctxt->sax->setDocumentLocator(ctxt->userData,
4071 &xmlDefaultSAXLocator);
4072 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4073 (!ctxt->disableSAX))
4074 ctxt->sax->startDocument(ctxt->userData);
4075
4076 cur = in->cur[0];
4077 next = in->cur[1];
4078 if ((cur == '<') && (next == '!') &&
4079 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4080 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4081 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4082 (UPP(8) == 'E')) {
4083 if ((!terminate) &&
4084 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4085 goto done;
4086#ifdef DEBUG_PUSH
4087 xmlGenericError(xmlGenericErrorContext,
4088 "HPP: Parsing internal subset\n");
4089#endif
4090 htmlParseDocTypeDecl(ctxt);
4091 ctxt->instate = XML_PARSER_PROLOG;
4092#ifdef DEBUG_PUSH
4093 xmlGenericError(xmlGenericErrorContext,
4094 "HPP: entering PROLOG\n");
4095#endif
4096 } else {
4097 ctxt->instate = XML_PARSER_MISC;
4098 }
4099#ifdef DEBUG_PUSH
4100 xmlGenericError(xmlGenericErrorContext,
4101 "HPP: entering MISC\n");
4102#endif
4103 break;
4104 case XML_PARSER_MISC:
4105 SKIP_BLANKS;
4106 if (in->buf == NULL)
4107 avail = in->length - (in->cur - in->base);
4108 else
4109 avail = in->buf->buffer->use - (in->cur - in->base);
4110 if (avail < 2)
4111 goto done;
4112 cur = in->cur[0];
4113 next = in->cur[1];
4114 if ((cur == '<') && (next == '!') &&
4115 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4116 if ((!terminate) &&
4117 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4118 goto done;
4119#ifdef DEBUG_PUSH
4120 xmlGenericError(xmlGenericErrorContext,
4121 "HPP: Parsing Comment\n");
4122#endif
4123 htmlParseComment(ctxt);
4124 ctxt->instate = XML_PARSER_MISC;
4125 } else if ((cur == '<') && (next == '!') &&
4126 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4127 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4128 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4129 (UPP(8) == 'E')) {
4130 if ((!terminate) &&
4131 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4132 goto done;
4133#ifdef DEBUG_PUSH
4134 xmlGenericError(xmlGenericErrorContext,
4135 "HPP: Parsing internal subset\n");
4136#endif
4137 htmlParseDocTypeDecl(ctxt);
4138 ctxt->instate = XML_PARSER_PROLOG;
4139#ifdef DEBUG_PUSH
4140 xmlGenericError(xmlGenericErrorContext,
4141 "HPP: entering PROLOG\n");
4142#endif
4143 } else if ((cur == '<') && (next == '!') &&
4144 (avail < 9)) {
4145 goto done;
4146 } else {
4147 ctxt->instate = XML_PARSER_START_TAG;
4148#ifdef DEBUG_PUSH
4149 xmlGenericError(xmlGenericErrorContext,
4150 "HPP: entering START_TAG\n");
4151#endif
4152 }
4153 break;
4154 case XML_PARSER_PROLOG:
4155 SKIP_BLANKS;
4156 if (in->buf == NULL)
4157 avail = in->length - (in->cur - in->base);
4158 else
4159 avail = in->buf->buffer->use - (in->cur - in->base);
4160 if (avail < 2)
4161 goto done;
4162 cur = in->cur[0];
4163 next = in->cur[1];
4164 if ((cur == '<') && (next == '!') &&
4165 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4166 if ((!terminate) &&
4167 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4168 goto done;
4169#ifdef DEBUG_PUSH
4170 xmlGenericError(xmlGenericErrorContext,
4171 "HPP: Parsing Comment\n");
4172#endif
4173 htmlParseComment(ctxt);
4174 ctxt->instate = XML_PARSER_PROLOG;
4175 } else if ((cur == '<') && (next == '!') &&
4176 (avail < 4)) {
4177 goto done;
4178 } else {
4179 ctxt->instate = XML_PARSER_START_TAG;
4180#ifdef DEBUG_PUSH
4181 xmlGenericError(xmlGenericErrorContext,
4182 "HPP: entering START_TAG\n");
4183#endif
4184 }
4185 break;
4186 case XML_PARSER_EPILOG:
4187 if (in->buf == NULL)
4188 avail = in->length - (in->cur - in->base);
4189 else
4190 avail = in->buf->buffer->use - (in->cur - in->base);
4191 if (avail < 1)
4192 goto done;
4193 cur = in->cur[0];
4194 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004195 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004196 goto done;
4197 }
4198 if (avail < 2)
4199 goto done;
4200 next = in->cur[1];
4201 if ((cur == '<') && (next == '!') &&
4202 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4203 if ((!terminate) &&
4204 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4205 goto done;
4206#ifdef DEBUG_PUSH
4207 xmlGenericError(xmlGenericErrorContext,
4208 "HPP: Parsing Comment\n");
4209#endif
4210 htmlParseComment(ctxt);
4211 ctxt->instate = XML_PARSER_EPILOG;
4212 } else if ((cur == '<') && (next == '!') &&
4213 (avail < 4)) {
4214 goto done;
4215 } else {
4216 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004217 ctxt->wellFormed = 0;
4218 ctxt->instate = XML_PARSER_EOF;
4219#ifdef DEBUG_PUSH
4220 xmlGenericError(xmlGenericErrorContext,
4221 "HPP: entering EOF\n");
4222#endif
4223 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4224 ctxt->sax->endDocument(ctxt->userData);
4225 goto done;
4226 }
4227 break;
4228 case XML_PARSER_START_TAG: {
4229 xmlChar *name, *oldname;
4230 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004231 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004232
4233 if (avail < 2)
4234 goto done;
4235 cur = in->cur[0];
4236 if (cur != '<') {
4237 ctxt->instate = XML_PARSER_CONTENT;
4238#ifdef DEBUG_PUSH
4239 xmlGenericError(xmlGenericErrorContext,
4240 "HPP: entering CONTENT\n");
4241#endif
4242 break;
4243 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004244 if (in->cur[1] == '/') {
4245 ctxt->instate = XML_PARSER_END_TAG;
4246 ctxt->checkIndex = 0;
4247#ifdef DEBUG_PUSH
4248 xmlGenericError(xmlGenericErrorContext,
4249 "HPP: entering END_TAG\n");
4250#endif
4251 break;
4252 }
Owen Taylor3473f882001-02-23 17:55:21 +00004253 if ((!terminate) &&
4254 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4255 goto done;
4256
4257 oldname = xmlStrdup(ctxt->name);
4258 htmlParseStartTag(ctxt);
4259 name = ctxt->name;
4260#ifdef DEBUG
4261 if (oldname == NULL)
4262 xmlGenericError(xmlGenericErrorContext,
4263 "Start of element %s\n", name);
4264 else if (name == NULL)
4265 xmlGenericError(xmlGenericErrorContext,
4266 "Start of element failed, was %s\n",
4267 oldname);
4268 else
4269 xmlGenericError(xmlGenericErrorContext,
4270 "Start of element %s, was %s\n",
4271 name, oldname);
4272#endif
4273 if (((depth == ctxt->nameNr) &&
4274 (xmlStrEqual(oldname, ctxt->name))) ||
4275 (name == NULL)) {
4276 if (CUR == '>')
4277 NEXT;
4278 if (oldname != NULL)
4279 xmlFree(oldname);
4280 break;
4281 }
4282 if (oldname != NULL)
4283 xmlFree(oldname);
4284
4285 /*
4286 * Lookup the info for that element.
4287 */
4288 info = htmlTagLookup(name);
4289 if (info == NULL) {
4290 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4291 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4292 name);
4293 ctxt->wellFormed = 0;
4294 } else if (info->depr) {
4295 /***************************
4296 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4297 ctxt->sax->warning(ctxt->userData,
4298 "Tag %s is deprecated\n",
4299 name);
4300 ***************************/
4301 }
4302
4303 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004304 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004305 */
4306 if ((CUR == '/') && (NXT(1) == '>')) {
4307 SKIP(2);
4308 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4309 ctxt->sax->endElement(ctxt->userData, name);
4310 oldname = htmlnamePop(ctxt);
4311#ifdef DEBUG
4312 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4313 oldname);
4314#endif
4315 if (oldname != NULL)
4316 xmlFree(oldname);
4317 ctxt->instate = XML_PARSER_CONTENT;
4318#ifdef DEBUG_PUSH
4319 xmlGenericError(xmlGenericErrorContext,
4320 "HPP: entering CONTENT\n");
4321#endif
4322 break;
4323 }
4324
4325 if (CUR == '>') {
4326 NEXT;
4327 } else {
4328 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4329 ctxt->sax->error(ctxt->userData,
4330 "Couldn't find end of Start Tag %s\n",
4331 name);
4332 ctxt->wellFormed = 0;
4333
4334 /*
4335 * end of parsing of this node.
4336 */
4337 if (xmlStrEqual(name, ctxt->name)) {
4338 nodePop(ctxt);
4339 oldname = htmlnamePop(ctxt);
4340#ifdef DEBUG
4341 xmlGenericError(xmlGenericErrorContext,
4342 "End of start tag problem: popping out %s\n", oldname);
4343#endif
4344 if (oldname != NULL)
4345 xmlFree(oldname);
4346 }
4347
4348 ctxt->instate = XML_PARSER_CONTENT;
4349#ifdef DEBUG_PUSH
4350 xmlGenericError(xmlGenericErrorContext,
4351 "HPP: entering CONTENT\n");
4352#endif
4353 break;
4354 }
4355
4356 /*
4357 * Check for an Empty Element from DTD definition
4358 */
4359 if ((info != NULL) && (info->empty)) {
4360 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4361 ctxt->sax->endElement(ctxt->userData, name);
4362 oldname = htmlnamePop(ctxt);
4363#ifdef DEBUG
4364 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4365#endif
4366 if (oldname != NULL)
4367 xmlFree(oldname);
4368 }
4369 ctxt->instate = XML_PARSER_CONTENT;
4370#ifdef DEBUG_PUSH
4371 xmlGenericError(xmlGenericErrorContext,
4372 "HPP: entering CONTENT\n");
4373#endif
4374 break;
4375 }
4376 case XML_PARSER_CONTENT: {
4377 long cons;
4378 /*
4379 * Handle preparsed entities and charRef
4380 */
4381 if (ctxt->token != 0) {
4382 xmlChar chr[2] = { 0 , 0 } ;
4383
4384 chr[0] = (xmlChar) ctxt->token;
4385 htmlCheckParagraph(ctxt);
4386 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4387 ctxt->sax->characters(ctxt->userData, chr, 1);
4388 ctxt->token = 0;
4389 ctxt->checkIndex = 0;
4390 }
4391 if ((avail == 1) && (terminate)) {
4392 cur = in->cur[0];
4393 if ((cur != '<') && (cur != '&')) {
4394 if (ctxt->sax != NULL) {
4395 if (IS_BLANK(cur)) {
4396 if (ctxt->sax->ignorableWhitespace != NULL)
4397 ctxt->sax->ignorableWhitespace(
4398 ctxt->userData, &cur, 1);
4399 } else {
4400 htmlCheckParagraph(ctxt);
4401 if (ctxt->sax->characters != NULL)
4402 ctxt->sax->characters(
4403 ctxt->userData, &cur, 1);
4404 }
4405 }
4406 ctxt->token = 0;
4407 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004408 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004409 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004410 }
Owen Taylor3473f882001-02-23 17:55:21 +00004411 }
4412 if (avail < 2)
4413 goto done;
4414 cur = in->cur[0];
4415 next = in->cur[1];
4416 cons = ctxt->nbChars;
4417 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4418 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4419 /*
4420 * Handle SCRIPT/STYLE separately
4421 */
4422 if ((!terminate) &&
4423 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4424 goto done;
4425 htmlParseScript(ctxt);
4426 if ((cur == '<') && (next == '/')) {
4427 ctxt->instate = XML_PARSER_END_TAG;
4428 ctxt->checkIndex = 0;
4429#ifdef DEBUG_PUSH
4430 xmlGenericError(xmlGenericErrorContext,
4431 "HPP: entering END_TAG\n");
4432#endif
4433 break;
4434 }
4435 } else {
4436 /*
4437 * Sometimes DOCTYPE arrives in the middle of the document
4438 */
4439 if ((cur == '<') && (next == '!') &&
4440 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4441 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4442 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4443 (UPP(8) == 'E')) {
4444 if ((!terminate) &&
4445 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4446 goto done;
4447 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4448 ctxt->sax->error(ctxt->userData,
4449 "Misplaced DOCTYPE declaration\n");
4450 ctxt->wellFormed = 0;
4451 htmlParseDocTypeDecl(ctxt);
4452 } else if ((cur == '<') && (next == '!') &&
4453 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4454 if ((!terminate) &&
4455 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4456 goto done;
4457#ifdef DEBUG_PUSH
4458 xmlGenericError(xmlGenericErrorContext,
4459 "HPP: Parsing Comment\n");
4460#endif
4461 htmlParseComment(ctxt);
4462 ctxt->instate = XML_PARSER_CONTENT;
4463 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4464 goto done;
4465 } else if ((cur == '<') && (next == '/')) {
4466 ctxt->instate = XML_PARSER_END_TAG;
4467 ctxt->checkIndex = 0;
4468#ifdef DEBUG_PUSH
4469 xmlGenericError(xmlGenericErrorContext,
4470 "HPP: entering END_TAG\n");
4471#endif
4472 break;
4473 } else if (cur == '<') {
4474 ctxt->instate = XML_PARSER_START_TAG;
4475 ctxt->checkIndex = 0;
4476#ifdef DEBUG_PUSH
4477 xmlGenericError(xmlGenericErrorContext,
4478 "HPP: entering START_TAG\n");
4479#endif
4480 break;
4481 } else if (cur == '&') {
4482 if ((!terminate) &&
4483 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4484 goto done;
4485#ifdef DEBUG_PUSH
4486 xmlGenericError(xmlGenericErrorContext,
4487 "HPP: Parsing Reference\n");
4488#endif
4489 /* TODO: check generation of subtrees if noent !!! */
4490 htmlParseReference(ctxt);
4491 } else {
4492 /* TODO Avoid the extra copy, handle directly !!!!!! */
4493 /*
Daniel Veillard01c13b52002-12-10 15:19:08 +00004494 * Goal of the following test is:
Owen Taylor3473f882001-02-23 17:55:21 +00004495 * - minimize calls to the SAX 'character' callback
4496 * when they are mergeable
4497 */
4498 if ((ctxt->inputNr == 1) &&
4499 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4500 if ((!terminate) &&
4501 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4502 goto done;
4503 }
4504 ctxt->checkIndex = 0;
4505#ifdef DEBUG_PUSH
4506 xmlGenericError(xmlGenericErrorContext,
4507 "HPP: Parsing char data\n");
4508#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004509 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004510 }
4511 }
4512 if (cons == ctxt->nbChars) {
4513 if (ctxt->node != NULL) {
4514 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4515 ctxt->sax->error(ctxt->userData,
4516 "detected an error in element content\n");
4517 ctxt->wellFormed = 0;
4518 }
4519 NEXT;
4520 break;
4521 }
4522
4523 break;
4524 }
4525 case XML_PARSER_END_TAG:
4526 if (avail < 2)
4527 goto done;
4528 if ((!terminate) &&
4529 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4530 goto done;
4531 htmlParseEndTag(ctxt);
4532 if (ctxt->nameNr == 0) {
4533 ctxt->instate = XML_PARSER_EPILOG;
4534 } else {
4535 ctxt->instate = XML_PARSER_CONTENT;
4536 }
4537 ctxt->checkIndex = 0;
4538#ifdef DEBUG_PUSH
4539 xmlGenericError(xmlGenericErrorContext,
4540 "HPP: entering CONTENT\n");
4541#endif
4542 break;
4543 case XML_PARSER_CDATA_SECTION:
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: internal error, state == CDATA\n");
4546 ctxt->instate = XML_PARSER_CONTENT;
4547 ctxt->checkIndex = 0;
4548#ifdef DEBUG_PUSH
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: entering CONTENT\n");
4551#endif
4552 break;
4553 case XML_PARSER_DTD:
4554 xmlGenericError(xmlGenericErrorContext,
4555 "HPP: internal error, state == DTD\n");
4556 ctxt->instate = XML_PARSER_CONTENT;
4557 ctxt->checkIndex = 0;
4558#ifdef DEBUG_PUSH
4559 xmlGenericError(xmlGenericErrorContext,
4560 "HPP: entering CONTENT\n");
4561#endif
4562 break;
4563 case XML_PARSER_COMMENT:
4564 xmlGenericError(xmlGenericErrorContext,
4565 "HPP: internal error, state == COMMENT\n");
4566 ctxt->instate = XML_PARSER_CONTENT;
4567 ctxt->checkIndex = 0;
4568#ifdef DEBUG_PUSH
4569 xmlGenericError(xmlGenericErrorContext,
4570 "HPP: entering CONTENT\n");
4571#endif
4572 break;
4573 case XML_PARSER_PI:
4574 xmlGenericError(xmlGenericErrorContext,
4575 "HPP: internal error, state == PI\n");
4576 ctxt->instate = XML_PARSER_CONTENT;
4577 ctxt->checkIndex = 0;
4578#ifdef DEBUG_PUSH
4579 xmlGenericError(xmlGenericErrorContext,
4580 "HPP: entering CONTENT\n");
4581#endif
4582 break;
4583 case XML_PARSER_ENTITY_DECL:
4584 xmlGenericError(xmlGenericErrorContext,
4585 "HPP: internal error, state == ENTITY_DECL\n");
4586 ctxt->instate = XML_PARSER_CONTENT;
4587 ctxt->checkIndex = 0;
4588#ifdef DEBUG_PUSH
4589 xmlGenericError(xmlGenericErrorContext,
4590 "HPP: entering CONTENT\n");
4591#endif
4592 break;
4593 case XML_PARSER_ENTITY_VALUE:
4594 xmlGenericError(xmlGenericErrorContext,
4595 "HPP: internal error, state == ENTITY_VALUE\n");
4596 ctxt->instate = XML_PARSER_CONTENT;
4597 ctxt->checkIndex = 0;
4598#ifdef DEBUG_PUSH
4599 xmlGenericError(xmlGenericErrorContext,
4600 "HPP: entering DTD\n");
4601#endif
4602 break;
4603 case XML_PARSER_ATTRIBUTE_VALUE:
4604 xmlGenericError(xmlGenericErrorContext,
4605 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4606 ctxt->instate = XML_PARSER_START_TAG;
4607 ctxt->checkIndex = 0;
4608#ifdef DEBUG_PUSH
4609 xmlGenericError(xmlGenericErrorContext,
4610 "HPP: entering START_TAG\n");
4611#endif
4612 break;
4613 case XML_PARSER_SYSTEM_LITERAL:
4614 xmlGenericError(xmlGenericErrorContext,
4615 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4616 ctxt->instate = XML_PARSER_CONTENT;
4617 ctxt->checkIndex = 0;
4618#ifdef DEBUG_PUSH
4619 xmlGenericError(xmlGenericErrorContext,
4620 "HPP: entering CONTENT\n");
4621#endif
4622 break;
4623 case XML_PARSER_IGNORE:
4624 xmlGenericError(xmlGenericErrorContext,
4625 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4626 ctxt->instate = XML_PARSER_CONTENT;
4627 ctxt->checkIndex = 0;
4628#ifdef DEBUG_PUSH
4629 xmlGenericError(xmlGenericErrorContext,
4630 "HPP: entering CONTENT\n");
4631#endif
4632 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004633 case XML_PARSER_PUBLIC_LITERAL:
4634 xmlGenericError(xmlGenericErrorContext,
4635 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4636 ctxt->instate = XML_PARSER_CONTENT;
4637 ctxt->checkIndex = 0;
4638#ifdef DEBUG_PUSH
4639 xmlGenericError(xmlGenericErrorContext,
4640 "HPP: entering CONTENT\n");
4641#endif
4642 break;
4643
Owen Taylor3473f882001-02-23 17:55:21 +00004644 }
4645 }
4646done:
4647 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004648 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004649 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4650 /*
4651 * SAX: end of the document processing.
4652 */
4653 ctxt->instate = XML_PARSER_EOF;
4654 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4655 ctxt->sax->endDocument(ctxt->userData);
4656 }
4657 }
4658 if ((ctxt->myDoc != NULL) &&
4659 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4660 (ctxt->instate == XML_PARSER_EPILOG))) {
4661 xmlDtdPtr dtd;
4662 dtd = xmlGetIntSubset(ctxt->myDoc);
4663 if (dtd == NULL)
4664 ctxt->myDoc->intSubset =
4665 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4666 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4667 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4668 }
4669#ifdef DEBUG_PUSH
4670 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4671#endif
4672 return(ret);
4673}
4674
4675/**
Owen Taylor3473f882001-02-23 17:55:21 +00004676 * htmlParseChunk:
4677 * @ctxt: an XML parser context
4678 * @chunk: an char array
4679 * @size: the size in byte of the chunk
4680 * @terminate: last chunk indicator
4681 *
4682 * Parse a Chunk of memory
4683 *
4684 * Returns zero if no error, the xmlParserErrors otherwise.
4685 */
4686int
4687htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4688 int terminate) {
4689 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4690 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4691 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4692 int cur = ctxt->input->cur - ctxt->input->base;
4693
4694 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4695 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4696 ctxt->input->cur = ctxt->input->base + cur;
4697#ifdef DEBUG_PUSH
4698 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4699#endif
4700
4701 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4702 htmlParseTryOrFinish(ctxt, terminate);
4703 } else if (ctxt->instate != XML_PARSER_EOF) {
4704 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4705 htmlParseTryOrFinish(ctxt, terminate);
4706 }
4707 if (terminate) {
4708 if ((ctxt->instate != XML_PARSER_EOF) &&
4709 (ctxt->instate != XML_PARSER_EPILOG) &&
4710 (ctxt->instate != XML_PARSER_MISC)) {
4711 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004712 ctxt->wellFormed = 0;
4713 }
4714 if (ctxt->instate != XML_PARSER_EOF) {
4715 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4716 ctxt->sax->endDocument(ctxt->userData);
4717 }
4718 ctxt->instate = XML_PARSER_EOF;
4719 }
4720 return((xmlParserErrors) ctxt->errNo);
4721}
4722
4723/************************************************************************
4724 * *
4725 * User entry points *
4726 * *
4727 ************************************************************************/
4728
4729/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004730 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004731 * @sax: a SAX handler
4732 * @user_data: The user data returned on SAX callbacks
4733 * @chunk: a pointer to an array of chars
4734 * @size: number of chars in the array
4735 * @filename: an optional file name or URI
4736 * @enc: an optional encoding
4737 *
4738 * Create a parser context for using the HTML parser in push mode
4739 * To allow content encoding detection, @size should be >= 4
4740 * The value of @filename is used for fetching external entities
4741 * and error/warning reports.
4742 *
4743 * Returns the new parser context or NULL
4744 */
4745htmlParserCtxtPtr
4746htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4747 const char *chunk, int size, const char *filename,
4748 xmlCharEncoding enc) {
4749 htmlParserCtxtPtr ctxt;
4750 htmlParserInputPtr inputStream;
4751 xmlParserInputBufferPtr buf;
4752
Daniel Veillardd0463562001-10-13 09:15:48 +00004753 xmlInitParser();
4754
Owen Taylor3473f882001-02-23 17:55:21 +00004755 buf = xmlAllocParserInputBuffer(enc);
4756 if (buf == NULL) return(NULL);
4757
4758 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4759 if (ctxt == NULL) {
4760 xmlFree(buf);
4761 return(NULL);
4762 }
4763 memset(ctxt, 0, sizeof(htmlParserCtxt));
4764 htmlInitParserCtxt(ctxt);
4765 if (sax != NULL) {
4766 if (ctxt->sax != &htmlDefaultSAXHandler)
4767 xmlFree(ctxt->sax);
4768 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4769 if (ctxt->sax == NULL) {
4770 xmlFree(buf);
4771 xmlFree(ctxt);
4772 return(NULL);
4773 }
4774 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4775 if (user_data != NULL)
4776 ctxt->userData = user_data;
4777 }
4778 if (filename == NULL) {
4779 ctxt->directory = NULL;
4780 } else {
4781 ctxt->directory = xmlParserGetDirectory(filename);
4782 }
4783
4784 inputStream = htmlNewInputStream(ctxt);
4785 if (inputStream == NULL) {
4786 xmlFreeParserCtxt(ctxt);
4787 return(NULL);
4788 }
4789
4790 if (filename == NULL)
4791 inputStream->filename = NULL;
4792 else
4793 inputStream->filename = xmlMemStrdup(filename);
4794 inputStream->buf = buf;
4795 inputStream->base = inputStream->buf->buffer->content;
4796 inputStream->cur = inputStream->buf->buffer->content;
4797
4798 inputPush(ctxt, inputStream);
4799
4800 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4801 (ctxt->input->buf != NULL)) {
4802 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4803#ifdef DEBUG_PUSH
4804 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4805#endif
4806 }
4807
4808 return(ctxt);
4809}
4810
4811/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004812 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00004813 * @cur: a pointer to an array of xmlChar
4814 * @encoding: a free form C string describing the HTML document encoding, or NULL
4815 * @sax: the SAX handler block
4816 * @userData: if using SAX, this pointer will be provided on callbacks.
4817 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004818 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4819 * to handle parse events. If sax is NULL, fallback to the default DOM
4820 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004821 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004822 * Returns the resulting document tree unless SAX is NULL or the document is
4823 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004824 */
4825
4826htmlDocPtr
4827htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4828 htmlDocPtr ret;
4829 htmlParserCtxtPtr ctxt;
4830
Daniel Veillardd0463562001-10-13 09:15:48 +00004831 xmlInitParser();
4832
Owen Taylor3473f882001-02-23 17:55:21 +00004833 if (cur == NULL) return(NULL);
4834
4835
4836 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4837 if (ctxt == NULL) return(NULL);
4838 if (sax != NULL) {
4839 ctxt->sax = sax;
4840 ctxt->userData = userData;
4841 }
4842
4843 htmlParseDocument(ctxt);
4844 ret = ctxt->myDoc;
4845 if (sax != NULL) {
4846 ctxt->sax = NULL;
4847 ctxt->userData = NULL;
4848 }
4849 htmlFreeParserCtxt(ctxt);
4850
4851 return(ret);
4852}
4853
4854/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004855 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00004856 * @cur: a pointer to an array of xmlChar
4857 * @encoding: a free form C string describing the HTML document encoding, or NULL
4858 *
4859 * parse an HTML in-memory document and build a tree.
4860 *
4861 * Returns the resulting document tree
4862 */
4863
4864htmlDocPtr
4865htmlParseDoc(xmlChar *cur, const char *encoding) {
4866 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4867}
4868
4869
4870/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004871 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004872 * @filename: the filename
4873 * @encoding: a free form C string describing the HTML document encoding, or NULL
4874 *
4875 * Create a parser context for a file content.
4876 * Automatic support for ZLIB/Compress compressed document is provided
4877 * by default if found at compile-time.
4878 *
4879 * Returns the new parser context or NULL
4880 */
4881htmlParserCtxtPtr
4882htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4883{
4884 htmlParserCtxtPtr ctxt;
4885 htmlParserInputPtr inputStream;
4886 xmlParserInputBufferPtr buf;
4887 /* htmlCharEncoding enc; */
4888 xmlChar *content, *content_line = (xmlChar *) "charset=";
4889
4890 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4891 if (buf == NULL) return(NULL);
4892
4893 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4894 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00004895 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004896 return(NULL);
4897 }
4898 memset(ctxt, 0, sizeof(htmlParserCtxt));
4899 htmlInitParserCtxt(ctxt);
4900 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4901 if (inputStream == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00004902 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004903 xmlFree(ctxt);
4904 return(NULL);
4905 }
4906 memset(inputStream, 0, sizeof(htmlParserInput));
4907
Daniel Veillarda646cfd2002-09-17 21:50:03 +00004908 inputStream->filename = (char *)
4909 xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor3473f882001-02-23 17:55:21 +00004910 inputStream->line = 1;
4911 inputStream->col = 1;
4912 inputStream->buf = buf;
4913 inputStream->directory = NULL;
4914
4915 inputStream->base = inputStream->buf->buffer->content;
4916 inputStream->cur = inputStream->buf->buffer->content;
4917 inputStream->free = NULL;
4918
4919 inputPush(ctxt, inputStream);
4920
4921 /* set encoding */
4922 if (encoding) {
4923 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4924 if (content) {
4925 strcpy ((char *)content, (char *)content_line);
4926 strcat ((char *)content, (char *)encoding);
4927 htmlCheckEncoding (ctxt, content);
4928 xmlFree (content);
4929 }
4930 }
4931
4932 return(ctxt);
4933}
4934
4935/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004936 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00004937 * @filename: the filename
4938 * @encoding: a free form C string describing the HTML document encoding, or NULL
4939 * @sax: the SAX handler block
4940 * @userData: if using SAX, this pointer will be provided on callbacks.
4941 *
4942 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4943 * compressed document is provided by default if found at compile-time.
4944 * It use the given SAX function block to handle the parsing callback.
4945 * If sax is NULL, fallback to the default DOM tree building routines.
4946 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004947 * Returns the resulting document tree unless SAX is NULL or the document is
4948 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004949 */
4950
4951htmlDocPtr
4952htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4953 void *userData) {
4954 htmlDocPtr ret;
4955 htmlParserCtxtPtr ctxt;
4956 htmlSAXHandlerPtr oldsax = NULL;
4957
Daniel Veillardd0463562001-10-13 09:15:48 +00004958 xmlInitParser();
4959
Owen Taylor3473f882001-02-23 17:55:21 +00004960 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4961 if (ctxt == NULL) return(NULL);
4962 if (sax != NULL) {
4963 oldsax = ctxt->sax;
4964 ctxt->sax = sax;
4965 ctxt->userData = userData;
4966 }
4967
4968 htmlParseDocument(ctxt);
4969
4970 ret = ctxt->myDoc;
4971 if (sax != NULL) {
4972 ctxt->sax = oldsax;
4973 ctxt->userData = NULL;
4974 }
4975 htmlFreeParserCtxt(ctxt);
4976
4977 return(ret);
4978}
4979
4980/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004981 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00004982 * @filename: the filename
4983 * @encoding: a free form C string describing the HTML document encoding, or NULL
4984 *
4985 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4986 * compressed document is provided by default if found at compile-time.
4987 *
4988 * Returns the resulting document tree
4989 */
4990
4991htmlDocPtr
4992htmlParseFile(const char *filename, const char *encoding) {
4993 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4994}
4995
4996/**
4997 * htmlHandleOmittedElem:
4998 * @val: int 0 or 1
4999 *
5000 * Set and return the previous value for handling HTML omitted tags.
5001 *
5002 * Returns the last value for 0 for no handling, 1 for auto insertion.
5003 */
5004
5005int
5006htmlHandleOmittedElem(int val) {
5007 int old = htmlOmittedDefaultValue;
5008
5009 htmlOmittedDefaultValue = val;
5010 return(old);
5011}
5012
5013#endif /* LIBXML_HTML_ENABLED */