blob: 27207cdd66d32e3cfc29955b6397b6ec08b34e19 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
/*
 * Size limits used by the HTML parser.
 * NOTE(review): the code using these constants is outside this part of the
 * file; presumably they bound name lengths and scratch buffers - confirm.
 */
46#define HTML_MAX_NAMELEN 1000
47#define HTML_PARSER_BIG_BUFFER_SIZE 1000
48#define HTML_PARSER_BUFFER_SIZE 100
49
/* Uncomment to compile in the xmlGenericError() traces guarded by DEBUG. */
50/* #define DEBUG */
51/* #define DEBUG_PUSH */
52
/*
 * File-local flag, initialised to 1.
 * NOTE(review): its readers are not visible here - confirm its meaning.
 */
Daniel Veillard22090732001-07-16 00:06:07 +000053static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000054
/* Forward declarations of functions used before their definition. */
Daniel Veillard56a4cb82001-03-24 17:00:36 +000055xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000057static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
59/************************************************************************
60 * *
Owen Taylor3473f882001-02-23 17:55:21 +000061 * Parser stacks related functions and macros *
62 * *
63 ************************************************************************/
64
65/*
66 * Generic function for accessing stacks in the Parser Context
67 */
68
69#define PUSH_AND_POP(scope, type, name) \
70scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
71 if (ctxt->name##Nr >= ctxt->name##Max) { \
72 ctxt->name##Max *= 2; \
73 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75 if (ctxt->name##Tab == NULL) { \
76 xmlGenericError(xmlGenericErrorContext, \
77 "realloc failed !\n"); \
78 return(0); \
79 } \
80 } \
81 ctxt->name##Tab[ctxt->name##Nr] = value; \
82 ctxt->name = value; \
83 return(ctxt->name##Nr++); \
84} \
85scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
86 type ret; \
87 if (ctxt->name##Nr < 0) return(0); \
88 ctxt->name##Nr--; \
89 if (ctxt->name##Nr < 0) return(0); \
90 if (ctxt->name##Nr > 0) \
91 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
92 else \
93 ctxt->name = NULL; \
94 ret = ctxt->name##Tab[ctxt->name##Nr]; \
95 ctxt->name##Tab[ctxt->name##Nr] = 0; \
96 return(ret); \
97} \
98
Daniel Veillard56a4cb82001-03-24 17:00:36 +000099/* PUSH_AND_POP(static, xmlNodePtr, node) */
100PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000101
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
129
/*
 * Input access macros.  All of them expand to expressions on a variable
 * named "ctxt" that must be in scope at the point of use.
 *
 * Each macro is defined exactly once, expression macros are fully
 * parenthesised and statement macros are wrapped in do { } while (0), so
 * they behave like a single expression/statement wherever they are used.
 * SKIP() and NEXTL() evaluate their argument as written; do not pass
 * expressions with side effects.
 */

/* Current byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes and account for them in nbChars. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* Byte located val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself (usable as an lvalue). */
#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* Current byte, or -1 while a pending token is set on the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Consume the current character, which occupies l bytes: update the
 * line/column position, clear any pending token, advance the input by
 * l bytes and count one character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/* Decode the current character; its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v, whose encoded length is l, to buffer b at index i
 * and advance i by the number of bytes written.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
178
179/**
180 * htmlCurrentChar:
181 * @ctxt: the HTML parser context
182 * @len: pointer to the length of the char read
183 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000184 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000185 * bytes in the input buffer. Implement the end of line normalization:
186 * 2.11 End-of-Line Handling
187 * If the encoding is unspecified, in the case we find an ISO-Latin-1
188 * char, then the encoding converter is plugged in automatically.
189 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000190 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000191 */
192
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000193static int
Owen Taylor3473f882001-02-23 17:55:21 +0000194htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
195 if (ctxt->instate == XML_PARSER_EOF)
196 return(0);
197
198 if (ctxt->token != 0) {
199 *len = 0;
200 return(ctxt->token);
201 }
202 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
203 /*
204 * We are supposed to handle UTF8, check it's valid
205 * From rfc2044: encoding of the Unicode values on UTF-8:
206 *
207 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
208 * 0000 0000-0000 007F 0xxxxxxx
209 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
210 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
211 *
212 * Check for the 0x110000 limit too
213 */
214 const unsigned char *cur = ctxt->input->cur;
215 unsigned char c;
216 unsigned int val;
217
218 c = *cur;
219 if (c & 0x80) {
220 if (cur[1] == 0)
221 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
222 if ((cur[1] & 0xc0) != 0x80)
223 goto encoding_error;
224 if ((c & 0xe0) == 0xe0) {
225
226 if (cur[2] == 0)
227 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
228 if ((cur[2] & 0xc0) != 0x80)
229 goto encoding_error;
230 if ((c & 0xf0) == 0xf0) {
231 if (cur[3] == 0)
232 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
233 if (((c & 0xf8) != 0xf0) ||
234 ((cur[3] & 0xc0) != 0x80))
235 goto encoding_error;
236 /* 4-byte code */
237 *len = 4;
238 val = (cur[0] & 0x7) << 18;
239 val |= (cur[1] & 0x3f) << 12;
240 val |= (cur[2] & 0x3f) << 6;
241 val |= cur[3] & 0x3f;
242 } else {
243 /* 3-byte code */
244 *len = 3;
245 val = (cur[0] & 0xf) << 12;
246 val |= (cur[1] & 0x3f) << 6;
247 val |= cur[2] & 0x3f;
248 }
249 } else {
250 /* 2-byte code */
251 *len = 2;
252 val = (cur[0] & 0x1f) << 6;
253 val |= cur[1] & 0x3f;
254 }
255 if (!IS_CHAR(val)) {
256 ctxt->errNo = XML_ERR_INVALID_ENCODING;
257 if ((ctxt->sax != NULL) &&
258 (ctxt->sax->error != NULL))
259 ctxt->sax->error(ctxt->userData,
260 "Char 0x%X out of allowed range\n", val);
261 ctxt->wellFormed = 0;
262 ctxt->disableSAX = 1;
263 }
264 return(val);
265 } else {
266 /* 1-byte code */
267 *len = 1;
268 return((int) *ctxt->input->cur);
269 }
270 }
271 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000272 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000273 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000274 * XML constructs only use < 128 chars
275 */
276 *len = 1;
277 if ((int) *ctxt->input->cur < 0x80)
278 return((int) *ctxt->input->cur);
279
280 /*
281 * Humm this is bad, do an automatic flow conversion
282 */
283 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
284 ctxt->charset = XML_CHAR_ENCODING_UTF8;
285 return(xmlCurrentChar(ctxt, len));
286
287encoding_error:
288 /*
289 * If we detect an UTF8 error that probably mean that the
290 * input encoding didn't get properly advertized in the
291 * declaration header. Report the error and switch the encoding
292 * to ISO-Latin-1 (if you don't like this policy, just declare the
293 * encoding !)
294 */
295 ctxt->errNo = XML_ERR_INVALID_ENCODING;
296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
297 ctxt->sax->error(ctxt->userData,
298 "Input is not proper UTF-8, indicate encoding !\n");
299 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
300 ctxt->input->cur[0], ctxt->input->cur[1],
301 ctxt->input->cur[2], ctxt->input->cur[3]);
302 }
303
304 ctxt->charset = XML_CHAR_ENCODING_8859_1;
305 *len = 1;
306 return((int) *ctxt->input->cur);
307}
308
309/**
Owen Taylor3473f882001-02-23 17:55:21 +0000310 * htmlSkipBlankChars:
311 * @ctxt: the HTML parser context
312 *
313 * skip all blanks character found at that point in the input streams.
314 *
315 * Returns the number of space chars skipped
316 */
317
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000318static int
Owen Taylor3473f882001-02-23 17:55:21 +0000319htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
320 int res = 0;
321
322 while (IS_BLANK(*(ctxt->input->cur))) {
323 if ((*ctxt->input->cur == 0) &&
324 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
325 xmlPopInput(ctxt);
326 } else {
327 if (*(ctxt->input->cur) == '\n') {
328 ctxt->input->line++; ctxt->input->col = 1;
329 } else ctxt->input->col++;
330 ctxt->input->cur++;
331 ctxt->nbChars++;
332 if (*ctxt->input->cur == 0)
333 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
334 }
335 res++;
336 }
337 return(res);
338}
339
340
341
342/************************************************************************
343 * *
344 * The list of HTML elements and their properties *
345 * *
346 ************************************************************************/
347
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 */
/*
 * Table of the HTML 4.0 elements known to the parser, one row per element.
 * Columns, in order: name, start tag, end tag, save end, empty, deprecated,
 * DTD, inline, description.
 * htmlTagLookup() scans this table linearly, comparing the name column
 * case-insensitively; the auto-close code reads the "end tag" column
 * (info->endTag) to decide how an unclosed element is handled.
 */
Daniel Veillard22090732001-07-16 00:06:07 +0000359static const htmlElemDesc
360html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000361{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
362{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
363{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
364{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
365{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
366{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
367{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
368{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
369{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
370{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
371{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
372{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
373{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
374{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
375{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
376{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
377{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
378{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
379{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
380{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
381{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
382{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
383{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
384{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
385{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
386{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
387{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
388{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
389{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
390{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
391{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
392{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
393{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
394{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
395{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
400{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
401{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
402{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
403{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
404{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
405{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
406{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
407{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
408{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
409{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
410{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
411{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
412{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
413{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
414{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
415{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
416{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
417{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
418{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
419{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
420{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
421{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
422{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
423{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
424{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
425{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
426{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
427{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
428{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
429{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
430{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
431{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
432{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
433{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
434{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
435{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
436{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
437{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
438{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
439{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
440{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
441{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
442{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
443{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
444{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
445{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
446{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
447{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
448{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
449{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
450{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
451{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000452};
453
454/*
Owen Taylor3473f882001-02-23 17:55:21 +0000455 * start tags that imply the end of current element
456 */
/*
 * Auto-close rules, stored as consecutive NULL-terminated groups.
 * In each group the first string is the name of a start tag and the
 * following strings are the currently open elements that this start tag
 * implicitly closes (see htmlCheckAutoClose()).
 * A NULL where a group would begin marks the end of the whole table; this
 * is what stops the indexing loop in htmlInitAutoClose().
 */
Daniel Veillard22090732001-07-16 00:06:07 +0000457static const char *htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000458"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
459 "dl", "ul", "ol", "menu", "dir", "address", "pre",
460 "listing", "xmp", "head", NULL,
461"head", "p", NULL,
462"title", "p", NULL,
463"body", "head", "style", "link", "title", "p", NULL,
464"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
465 "pre", "listing", "xmp", "head", "li", NULL,
466"hr", "p", "head", NULL,
467"h1", "p", "head", NULL,
468"h2", "p", "head", NULL,
469"h3", "p", "head", NULL,
470"h4", "p", "head", NULL,
471"h5", "p", "head", NULL,
472"h6", "p", "head", NULL,
473"dir", "p", "head", NULL,
474"address", "p", "head", "ul", NULL,
475"pre", "p", "head", "ul", NULL,
476"listing", "p", "head", NULL,
477"xmp", "p", "head", NULL,
478"blockquote", "p", "head", NULL,
479"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
480 "xmp", "head", NULL,
481"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
482 "head", "dd", NULL,
483"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
484 "head", "dt", NULL,
485"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
486 "listing", "xmp", NULL,
487"ol", "p", "head", "ul", NULL,
488"menu", "p", "head", "ul", NULL,
489"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
490"div", "p", "head", NULL,
491"noscript", "p", "head", NULL,
492"center", "font", "b", "i", "p", "head", NULL,
493"a", "a", NULL,
494"caption", "p", NULL,
495"colgroup", "caption", "colgroup", "col", "p", NULL,
496"col", "caption", "col", "p", NULL,
497"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
498 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +0000499"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
500"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000501"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
502"thead", "caption", "col", "colgroup", NULL,
503"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
504 "tbody", "p", NULL,
505"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
506 "tfoot", "tbody", "p", NULL,
507"optgroup", "option", NULL,
508"option", "option", NULL,
509"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
510 "pre", "listing", "xmp", "a", NULL,
511NULL
512};
513
514/*
515 * The list of HTML elements which are supposed not to have
516 * CDATA content and where a p element will be implied
517 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000518 * TODO: extend that list by reading the HTML SGML DTD on
Owen Taylor3473f882001-02-23 17:55:21 +0000519 * implied paragraph
520 */
/*
 * NULL-terminated list of the elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * Both the strings and the array of pointers are read-only.
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    "body",
    NULL
};
527
528/*
529 * The list of HTML attributes which are of content %Script;
530 * NOTE: when adding ones, check htmlIsScriptAttribute() since
531 * it assumes the name starts with 'on'
532 */
/*
 * Names of the HTML 4 intrinsic event attributes, i.e. the attributes
 * whose content is of type %Script;.  The list has no terminator: its
 * length must be taken from sizeof().
 * Every name starts with "on"; keep it that way when adding entries.
 * "onreset" was previously misspelled "onrest", so that attribute was
 * never recognised.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
553
Daniel Veillarda2bc3682001-05-03 08:27:20 +0000554/*
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000555 * This table is used by the htmlparser to know what to do with
556 * broken html pages. By assigning different priorities to different
557 * elements the parser can decide how to handle extra endtags.
558 * Endtags are only allowed to close elements with lower or equal
559 * priority.
560 */
Daniel Veillarda2bc3682001-05-03 08:27:20 +0000561
/*
 * One row of the end-tag priority table: an element name and the
 * priority associated with it.
 */
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000562typedef struct {
563 const char *name;
564 int priority;
565} elementPriority;
566
/*
 * End-tag priorities, looked up by htmlGetEndPriority().
 * The table is terminated by an entry whose name is NULL; the priority
 * stored in that entry is what the lookup returns for any element that is
 * not listed explicitly.
 */
Daniel Veillard22090732001-07-16 00:06:07 +0000567static const elementPriority htmlEndPriority[] = {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000568 {"div", 150},
569 {"td", 160},
570 {"th", 160},
571 {"tr", 170},
572 {"thead", 180},
573 {"tbody", 180},
574 {"tfoot", 180},
575 {"table", 190},
576 {"head", 200},
577 {"body", 200},
578 {"html", 220},
579 {NULL, 100} /* Default priority */
580};
Owen Taylor3473f882001-02-23 17:55:21 +0000581
/*
 * Index over htmlStartClose: each used slot points at the first string of
 * one group of that table, unused slots are NULL.  Filled once by
 * htmlInitAutoClose(), which also sets the flag below to 1.
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000582static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000583static int htmlStartCloseIndexinitialized = 0;
584
585/************************************************************************
586 * *
587 * functions to handle HTML specific data *
588 * *
589 ************************************************************************/
590
591/**
592 * htmlInitAutoClose:
593 *
594 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
595 * This is not reentrant. Call xmlInitParser() once before processing in
596 * case of use in multithreaded programs.
597 */
598void
599htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000600 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000601
602 if (htmlStartCloseIndexinitialized) return;
603
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000604 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
605 indx = 0;
606 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
607 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000608 while (htmlStartClose[i] != NULL) i++;
609 i++;
610 }
611 htmlStartCloseIndexinitialized = 1;
612}
613
614/**
615 * htmlTagLookup:
616 * @tag: The tag name in lowercase
617 *
618 * Lookup the HTML tag in the ElementTable
619 *
620 * Returns the related htmlElemDescPtr or NULL if not found.
621 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000622const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000623htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000624 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000625
626 for (i = 0; i < (sizeof(html40ElementTable) /
627 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000628 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000629 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000630 }
631 return(NULL);
632}
633
634/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000635 * htmlGetEndPriority:
636 * @name: The name of the element to look up the priority for.
637 *
638 * Return value: The "endtag" priority.
639 **/
640static int
641htmlGetEndPriority (const xmlChar *name) {
642 int i = 0;
643
644 while ((htmlEndPriority[i].name != NULL) &&
645 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
646 i++;
647
648 return(htmlEndPriority[i].priority);
649}
650
651/**
Owen Taylor3473f882001-02-23 17:55:21 +0000652 * htmlCheckAutoClose:
653 * @newtag: The new tag name
654 * @oldtag: The old tag name
655 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000656 * Checks whether the new tag is one of the registered valid tags for
657 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000658 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
659 *
660 * Returns 0 if no, 1 if yes.
661 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000662static int
Owen Taylor3473f882001-02-23 17:55:21 +0000663htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000664 int i, indx;
665 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000666
667 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
668
669 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000670 for (indx = 0; indx < 100;indx++) {
671 closed = htmlStartCloseIndex[indx];
672 if (closed == NULL) return(0);
673 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000674 }
675
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000676 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000677 i++;
678 while (htmlStartClose[i] != NULL) {
679 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
680 return(1);
681 }
682 i++;
683 }
684 return(0);
685}
686
687/**
688 * htmlAutoCloseOnClose:
689 * @ctxt: an HTML parser context
690 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000691 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000692 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000693 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000694 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000695static void
Owen Taylor3473f882001-02-23 17:55:21 +0000696htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000697 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000698 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000699 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000700
701#ifdef DEBUG
702 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
703 for (i = 0;i < ctxt->nameNr;i++)
704 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
705#endif
706
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000707 priority = htmlGetEndPriority (newtag);
708
Owen Taylor3473f882001-02-23 17:55:21 +0000709 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000710
Owen Taylor3473f882001-02-23 17:55:21 +0000711 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000712 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000713 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000714 * or equal priority, so if we find an element with higher
715 * priority before we find an element with
716 * matching name, we just ignore this endtag
717 */
718 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000719 }
720 if (i < 0) return;
721
722 while (!xmlStrEqual(newtag, ctxt->name)) {
723 info = htmlTagLookup(ctxt->name);
724 if ((info == NULL) || (info->endTag == 1)) {
725#ifdef DEBUG
726 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
727#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000728 } else if (info->endTag == 3) {
729#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000730 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000731
Daniel Veillard56098d42001-04-24 12:51:09 +0000732#endif
733 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
734 ctxt->sax->error(ctxt->userData,
735 "Opening and ending tag mismatch: %s and %s\n",
736 newtag, ctxt->name);
737 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000738 }
739 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
740 ctxt->sax->endElement(ctxt->userData, ctxt->name);
741 oldname = htmlnamePop(ctxt);
742 if (oldname != NULL) {
743#ifdef DEBUG
744 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
745#endif
746 xmlFree(oldname);
747 }
748 }
749}
750
751/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000752 * htmlAutoCloseOnEnd:
753 * @ctxt: an HTML parser context
754 *
755 * Close all remaining tags at the end of the stream
756 */
757static void
758htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
759 xmlChar *oldname;
760 int i;
761
762 if (ctxt->nameNr == 0)
763 return;
764#ifdef DEBUG
765 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
766#endif
767
768 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
769#ifdef DEBUG
770 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
771#endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776#ifdef DEBUG
777 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
778#endif
779 xmlFree(oldname);
780 }
781 }
782}
783
784/**
Owen Taylor3473f882001-02-23 17:55:21 +0000785 * htmlAutoClose:
786 * @ctxt: an HTML parser context
787 * @newtag: The new tag name or NULL
788 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000789 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000790 * The list is kept in htmlStartClose array. This function is
791 * called when a new tag has been detected and generates the
792 * appropriates closes if possible/needed.
793 * If newtag is NULL this mean we are at the end of the resource
794 * and we should check
795 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000796static void
Owen Taylor3473f882001-02-23 17:55:21 +0000797htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
798 xmlChar *oldname;
799 while ((newtag != NULL) && (ctxt->name != NULL) &&
800 (htmlCheckAutoClose(newtag, ctxt->name))) {
801#ifdef DEBUG
802 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
803#endif
804 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
805 ctxt->sax->endElement(ctxt->userData, ctxt->name);
806 oldname = htmlnamePop(ctxt);
807 if (oldname != NULL) {
808#ifdef DEBUG
809 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
810#endif
811 xmlFree(oldname);
812 }
813 }
814 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000815 htmlAutoCloseOnEnd(ctxt);
816 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000817 }
818 while ((newtag == NULL) && (ctxt->name != NULL) &&
819 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
820 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
821 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
822#ifdef DEBUG
823 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
824#endif
825 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
826 ctxt->sax->endElement(ctxt->userData, ctxt->name);
827 oldname = htmlnamePop(ctxt);
828 if (oldname != NULL) {
829#ifdef DEBUG
830 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
831#endif
832 xmlFree(oldname);
833 }
834 }
835
836}
837
838/**
839 * htmlAutoCloseTag:
840 * @doc: the HTML document
841 * @name: The tag name
842 * @elem: the HTML element
843 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000844 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000845 * The list is kept in htmlStartClose array. This function checks
846 * if the element or one of it's children would autoclose the
847 * given tag.
848 *
849 * Returns 1 if autoclose, 0 otherwise
850 */
851int
852htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
853 htmlNodePtr child;
854
855 if (elem == NULL) return(1);
856 if (xmlStrEqual(name, elem->name)) return(0);
857 if (htmlCheckAutoClose(elem->name, name)) return(1);
858 child = elem->children;
859 while (child != NULL) {
860 if (htmlAutoCloseTag(doc, name, child)) return(1);
861 child = child->next;
862 }
863 return(0);
864}
865
866/**
867 * htmlIsAutoClosed:
868 * @doc: the HTML document
869 * @elem: the HTML element
870 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000871 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000872 * The list is kept in htmlStartClose array. This function checks
873 * if a tag is autoclosed by one of it's child
874 *
875 * Returns 1 if autoclosed, 0 otherwise
876 */
877int
878htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
879 htmlNodePtr child;
880
881 if (elem == NULL) return(1);
882 child = elem->children;
883 while (child != NULL) {
884 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
885 child = child->next;
886 }
887 return(0);
888}
889
890/**
891 * htmlCheckImplied:
892 * @ctxt: an HTML parser context
893 * @newtag: The new tag name
894 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000895 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000896 * called when a new tag has been detected and generates the
897 * appropriates implicit tags if missing
898 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000899static void
Owen Taylor3473f882001-02-23 17:55:21 +0000900htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
901 if (!htmlOmittedDefaultValue)
902 return;
903 if (xmlStrEqual(newtag, BAD_CAST"html"))
904 return;
905 if (ctxt->nameNr <= 0) {
906#ifdef DEBUG
907 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
908#endif
909 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
910 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
911 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
912 }
913 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
914 return;
915 if ((ctxt->nameNr <= 1) &&
916 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
917 (xmlStrEqual(newtag, BAD_CAST"style")) ||
918 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
919 (xmlStrEqual(newtag, BAD_CAST"link")) ||
920 (xmlStrEqual(newtag, BAD_CAST"title")) ||
921 (xmlStrEqual(newtag, BAD_CAST"base")))) {
922 /*
923 * dropped OBJECT ... i you put it first BODY will be
924 * assumed !
925 */
926#ifdef DEBUG
927 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
928#endif
929 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
930 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
931 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
932 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
933 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
934 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
935 int i;
936 for (i = 0;i < ctxt->nameNr;i++) {
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
938 return;
939 }
940 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
941 return;
942 }
943 }
944
945#ifdef DEBUG
946 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
947#endif
948 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
949 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
950 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
951 }
952}
953
954/**
955 * htmlCheckParagraph
956 * @ctxt: an HTML parser context
957 *
958 * Check whether a p element need to be implied before inserting
959 * characters in the current element.
960 *
961 * Returns 1 if a paragraph has been inserted, 0 if not and -1
962 * in case of error.
963 */
964
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000965static int
Owen Taylor3473f882001-02-23 17:55:21 +0000966htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
967 const xmlChar *tag;
968 int i;
969
970 if (ctxt == NULL)
971 return(-1);
972 tag = ctxt->name;
973 if (tag == NULL) {
974 htmlAutoClose(ctxt, BAD_CAST"p");
975 htmlCheckImplied(ctxt, BAD_CAST"p");
976 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
977 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
978 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
979 return(1);
980 }
981 if (!htmlOmittedDefaultValue)
982 return(0);
983 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
984 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
985#ifdef DEBUG
986 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
987#endif
988 htmlAutoClose(ctxt, BAD_CAST"p");
989 htmlCheckImplied(ctxt, BAD_CAST"p");
990 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
991 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
992 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
993 return(1);
994 }
995 }
996 return(0);
997}
998
999/**
1000 * htmlIsScriptAttribute:
1001 * @name: an attribute name
1002 *
1003 * Check if an attribute is of content type Script
1004 *
1005 * Returns 1 is the attribute is a script 0 otherwise
1006 */
1007int
1008htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001009 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001010
1011 if (name == NULL)
1012 return(0);
1013 /*
1014 * all script attributes start with 'on'
1015 */
1016 if ((name[0] != 'o') || (name[1] != 'n'))
1017 return(0);
1018 for (i = 0;
1019 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1020 i++) {
1021 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1022 return(1);
1023 }
1024 return(0);
1025}
1026
1027/************************************************************************
1028 * *
1029 * The list of HTML predefined entities *
1030 * *
1031 ************************************************************************/
1032
1033
Daniel Veillard22090732001-07-16 00:06:07 +00001034static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001035/*
1036 * the 4 absolute ones, plus apostrophe.
1037 */
1038{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1039{ 38, "amp", "ampersand, U+0026 ISOnum" },
1040{ 39, "apos", "single quote" },
1041{ 60, "lt", "less-than sign, U+003C ISOnum" },
1042{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1043
1044/*
1045 * A bunch still in the 128-255 range
1046 * Replacing them depend really on the charset used.
1047 */
1048{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1049{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1050{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1051{ 163, "pound","pound sign, U+00A3 ISOnum" },
1052{ 164, "curren","currency sign, U+00A4 ISOnum" },
1053{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1054{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1055{ 167, "sect", "section sign, U+00A7 ISOnum" },
1056{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1057{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1058{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1059{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1060{ 172, "not", "not sign, U+00AC ISOnum" },
1061{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1062{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1063{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1064{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1065{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1066{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1067{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1068{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1069{ 181, "micro","micro sign, U+00B5 ISOnum" },
1070{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1071{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1072{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1073{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1074{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1075{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1076{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1077{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1078{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1079{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1080{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1081{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1082{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1083{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1084{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1085{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1086{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1087{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1088{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1089{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1090{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1091{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1092{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1093{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1094{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1095{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1096{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1097{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1098{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1099{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1100{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1101{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1102{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1103{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1104{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1105{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1106{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1107{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1108{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1109{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1110{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1111{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1112{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1113{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1114{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1115{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1116{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1117{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1118{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1119{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1120{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1121{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1122{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1123{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1124{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1125{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1126{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1127{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1128{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1129{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1130{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1131{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1132{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1133{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1134{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1135{ 247, "divide","division sign, U+00F7 ISOnum" },
1136{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1137{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1138{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1139{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1140{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1141{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1142{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1143{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1144
1145{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1146{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1147{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1148{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1149{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1150
1151/*
1152 * Anything below should really be kept as entities references
1153 */
1154{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1155
1156{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1157{ 732, "tilde","small tilde, U+02DC ISOdia" },
1158
1159{ 913, "Alpha","greek capital letter alpha, U+0391" },
1160{ 914, "Beta", "greek capital letter beta, U+0392" },
1161{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1162{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1163{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1164{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1165{ 919, "Eta", "greek capital letter eta, U+0397" },
1166{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1167{ 921, "Iota", "greek capital letter iota, U+0399" },
1168{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001169{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001170{ 924, "Mu", "greek capital letter mu, U+039C" },
1171{ 925, "Nu", "greek capital letter nu, U+039D" },
1172{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1173{ 927, "Omicron","greek capital letter omicron, U+039F" },
1174{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1175{ 929, "Rho", "greek capital letter rho, U+03A1" },
1176{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1177{ 932, "Tau", "greek capital letter tau, U+03A4" },
1178{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1179{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1180{ 935, "Chi", "greek capital letter chi, U+03A7" },
1181{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1182{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1183
1184{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1185{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1186{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1187{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1188{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1189{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1190{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1191{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1192{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1193{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1194{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1195{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1196{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1197{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1198{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1199{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1200{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1201{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1202{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1203{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1204{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1205{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1206{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1207{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1208{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1209{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1210{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1211{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1212
1213{ 8194, "ensp", "en space, U+2002 ISOpub" },
1214{ 8195, "emsp", "em space, U+2003 ISOpub" },
1215{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1216{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1217{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1218{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1219{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1220{ 8211, "ndash","en dash, U+2013 ISOpub" },
1221{ 8212, "mdash","em dash, U+2014 ISOpub" },
1222{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1223{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1224{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1225{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1226{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1227{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1228{ 8224, "dagger","dagger, U+2020 ISOpub" },
1229{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1230
1231{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1232{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1233
1234{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1235
1236{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1237{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1238
1239{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1240{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1241
1242{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1243{ 8260, "frasl","fraction slash, U+2044 NEW" },
1244
1245{ 8364, "euro", "euro sign, U+20AC NEW" },
1246
1247{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1248{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1249{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1250{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1251{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1252{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1253{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1254{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1255{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1256{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1257{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1258{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1259{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1260{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1261{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1262{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1263
1264{ 8704, "forall","for all, U+2200 ISOtech" },
1265{ 8706, "part", "partial differential, U+2202 ISOtech" },
1266{ 8707, "exist","there exists, U+2203 ISOtech" },
1267{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1268{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1269{ 8712, "isin", "element of, U+2208 ISOtech" },
1270{ 8713, "notin","not an element of, U+2209 ISOtech" },
1271{ 8715, "ni", "contains as member, U+220B ISOtech" },
1272{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001273{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001274{ 8722, "minus","minus sign, U+2212 ISOtech" },
1275{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1276{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1277{ 8733, "prop", "proportional to, U+221D ISOtech" },
1278{ 8734, "infin","infinity, U+221E ISOtech" },
1279{ 8736, "ang", "angle, U+2220 ISOamso" },
1280{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1281{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1282{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1283{ 8746, "cup", "union = cup, U+222A ISOtech" },
1284{ 8747, "int", "integral, U+222B ISOtech" },
1285{ 8756, "there4","therefore, U+2234 ISOtech" },
1286{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1287{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1288{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1289{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1290{ 8801, "equiv","identical to, U+2261 ISOtech" },
1291{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1292{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1293{ 8834, "sub", "subset of, U+2282 ISOtech" },
1294{ 8835, "sup", "superset of, U+2283 ISOtech" },
1295{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1296{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1297{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1298{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1299{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1300{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1301{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1302{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1303{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1304{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1305{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1306{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1307{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1308{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1309
1310{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1311{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1312{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1313{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1314
1315};
1316
1317/************************************************************************
1318 * *
1319 * Commodity functions to handle entities *
1320 * *
1321 ************************************************************************/
1322
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to that many xmlChar.
 * The argument must be a plain identifier for which a companion
 * variable named <buffer>_size exists in the calling scope.
 *
 * On allocation failure the macro reports the error with perror(),
 * frees the old buffer and makes the ENCLOSING function return NULL.
 * The result of xmlRealloc() is stored in a temporary first: assigning
 * it straight to <buffer> would lose the only pointer to the old block
 * and leak it, since the early return gives the caller no chance to
 * release it.
 */
#define growBuffer(buffer) { \
    xmlChar *growBuffer_tmp; \
    buffer##_size *= 2; \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer, \
                                 buffer##_size * sizeof(xmlChar)); \
    if (growBuffer_tmp == NULL) { \
        perror("realloc failed"); \
        xmlFree(buffer); \
        return(NULL); \
    } \
    buffer = growBuffer_tmp; \
}
1334
1335/**
1336 * htmlEntityLookup:
1337 * @name: the entity name
1338 *
1339 * Lookup the given entity in EntitiesTable
1340 *
1341 * TODO: the linear scan is really ugly, an hash table is really needed.
1342 *
1343 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1344 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001345const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001346htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001347 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001348
1349 for (i = 0;i < (sizeof(html40EntitiesTable)/
1350 sizeof(html40EntitiesTable[0]));i++) {
1351 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1352#ifdef DEBUG
1353 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1354#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001355 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001356 }
1357 }
1358 return(NULL);
1359}
1360
1361/**
1362 * htmlEntityValueLookup:
1363 * @value: the entity's unicode value
1364 *
1365 * Lookup the given entity in EntitiesTable
1366 *
1367 * TODO: the linear scan is really ugly, an hash table is really needed.
1368 *
1369 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1370 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001371const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001372htmlEntityValueLookup(unsigned int value) {
1373 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001374#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001375 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001376#endif
1377
1378 for (i = 0;i < (sizeof(html40EntitiesTable)/
1379 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001380 if (html40EntitiesTable[i].value >= value) {
1381 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001382 break;
1383#ifdef DEBUG
1384 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1385#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001386 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001387 }
1388#ifdef DEBUG
1389 if (lv > html40EntitiesTable[i].value) {
1390 xmlGenericError(xmlGenericErrorContext,
1391 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1392 lv, html40EntitiesTable[i].value);
1393 }
1394 lv = html40EntitiesTable[i].value;
1395#endif
1396 }
1397 return(NULL);
1398}
1399
1400/**
1401 * UTF8ToHtml:
1402 * @out: a pointer to an array of bytes to store the result
1403 * @outlen: the length of @out
1404 * @in: a pointer to an array of UTF-8 chars
1405 * @inlen: the length of @in
1406 *
1407 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1408 * plus HTML entities block of chars out.
1409 *
1410 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1411 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001412 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001413 * The value of @outlen after return is the number of octets consumed.
1414 */
1415int
1416UTF8ToHtml(unsigned char* out, int *outlen,
1417 const unsigned char* in, int *inlen) {
1418 const unsigned char* processed = in;
1419 const unsigned char* outend;
1420 const unsigned char* outstart = out;
1421 const unsigned char* instart = in;
1422 const unsigned char* inend;
1423 unsigned int c, d;
1424 int trailing;
1425
1426 if (in == NULL) {
1427 /*
1428 * initialization nothing to do
1429 */
1430 *outlen = 0;
1431 *inlen = 0;
1432 return(0);
1433 }
1434 inend = in + (*inlen);
1435 outend = out + (*outlen);
1436 while (in < inend) {
1437 d = *in++;
1438 if (d < 0x80) { c= d; trailing= 0; }
1439 else if (d < 0xC0) {
1440 /* trailing byte in leading position */
1441 *outlen = out - outstart;
1442 *inlen = processed - instart;
1443 return(-2);
1444 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1445 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1446 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1447 else {
1448 /* no chance for this in Ascii */
1449 *outlen = out - outstart;
1450 *inlen = processed - instart;
1451 return(-2);
1452 }
1453
1454 if (inend - in < trailing) {
1455 break;
1456 }
1457
1458 for ( ; trailing; trailing--) {
1459 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1460 break;
1461 c <<= 6;
1462 c |= d & 0x3F;
1463 }
1464
1465 /* assertion: c is a single UTF-4 value */
1466 if (c < 0x80) {
1467 if (out + 1 >= outend)
1468 break;
1469 *out++ = c;
1470 } else {
1471 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001472 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001473
1474 /*
1475 * Try to lookup a predefined HTML entity for it
1476 */
1477
1478 ent = htmlEntityValueLookup(c);
1479 if (ent == NULL) {
1480 /* no chance for this in Ascii */
1481 *outlen = out - outstart;
1482 *inlen = processed - instart;
1483 return(-2);
1484 }
1485 len = strlen(ent->name);
1486 if (out + 2 + len >= outend)
1487 break;
1488 *out++ = '&';
1489 memcpy(out, ent->name, len);
1490 out += len;
1491 *out++ = ';';
1492 }
1493 processed = in;
1494 }
1495 *outlen = out - outstart;
1496 *inlen = processed - instart;
1497 return(0);
1498}
1499
1500/**
1501 * htmlEncodeEntities:
1502 * @out: a pointer to an array of bytes to store the result
1503 * @outlen: the length of @out
1504 * @in: a pointer to an array of UTF-8 chars
1505 * @inlen: the length of @in
1506 * @quoteChar: the quote character to escape (' or ") or zero.
1507 *
1508 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1509 * plus HTML entities block of chars out.
1510 *
1511 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1512 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001513 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001514 * The value of @outlen after return is the number of octets consumed.
1515 */
1516int
1517htmlEncodeEntities(unsigned char* out, int *outlen,
1518 const unsigned char* in, int *inlen, int quoteChar) {
1519 const unsigned char* processed = in;
1520 const unsigned char* outend = out + (*outlen);
1521 const unsigned char* outstart = out;
1522 const unsigned char* instart = in;
1523 const unsigned char* inend = in + (*inlen);
1524 unsigned int c, d;
1525 int trailing;
1526
1527 while (in < inend) {
1528 d = *in++;
1529 if (d < 0x80) { c= d; trailing= 0; }
1530 else if (d < 0xC0) {
1531 /* trailing byte in leading position */
1532 *outlen = out - outstart;
1533 *inlen = processed - instart;
1534 return(-2);
1535 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1536 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1537 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1538 else {
1539 /* no chance for this in Ascii */
1540 *outlen = out - outstart;
1541 *inlen = processed - instart;
1542 return(-2);
1543 }
1544
1545 if (inend - in < trailing)
1546 break;
1547
1548 while (trailing--) {
1549 if (((d= *in++) & 0xC0) != 0x80) {
1550 *outlen = out - outstart;
1551 *inlen = processed - instart;
1552 return(-2);
1553 }
1554 c <<= 6;
1555 c |= d & 0x3F;
1556 }
1557
1558 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001559 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1560 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001561 if (out >= outend)
1562 break;
1563 *out++ = c;
1564 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001565 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001566 const char *cp;
1567 char nbuf[16];
1568 int len;
1569
1570 /*
1571 * Try to lookup a predefined HTML entity for it
1572 */
1573 ent = htmlEntityValueLookup(c);
1574 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001575 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001576 cp = nbuf;
1577 }
1578 else
1579 cp = ent->name;
1580 len = strlen(cp);
1581 if (out + 2 + len > outend)
1582 break;
1583 *out++ = '&';
1584 memcpy(out, cp, len);
1585 out += len;
1586 *out++ = ';';
1587 }
1588 processed = in;
1589 }
1590 *outlen = out - outstart;
1591 *inlen = processed - instart;
1592 return(0);
1593}
1594
1595/**
1596 * htmlDecodeEntities:
1597 * @ctxt: the parser context
1598 * @len: the len to decode (in bytes !), -1 for no size limit
1599 * @end: an end marker xmlChar, 0 if none
1600 * @end2: an end marker xmlChar, 0 if none
1601 * @end3: an end marker xmlChar, 0 if none
1602 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001603 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001604 *
1605 * DEPRECATED !!!!
1606 *
1607 * Returns A newly allocated string with the substitution done. The caller
1608 * must deallocate it !
1609 */
1610xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001611htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1612 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001613 static int deprecated = 0;
1614 if (!deprecated) {
1615 xmlGenericError(xmlGenericErrorContext,
1616 "htmlDecodeEntities() deprecated function reached\n");
1617 deprecated = 1;
1618 }
1619 return(NULL);
1620#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001621 xmlChar *name = NULL;
1622 xmlChar *buffer = NULL;
1623 unsigned int buffer_size = 0;
1624 unsigned int nbchars = 0;
1625 htmlEntityDescPtr ent;
1626 unsigned int max = (unsigned int) len;
1627 int c,l;
1628
1629 if (ctxt->depth > 40) {
1630 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1631 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1632 ctxt->sax->error(ctxt->userData,
1633 "Detected entity reference loop\n");
1634 ctxt->wellFormed = 0;
1635 ctxt->disableSAX = 1;
1636 return(NULL);
1637 }
1638
1639 /*
1640 * allocate a translation buffer.
1641 */
1642 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1643 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1644 if (buffer == NULL) {
1645 perror("xmlDecodeEntities: malloc failed");
1646 return(NULL);
1647 }
1648
1649 /*
1650 * Ok loop until we reach one of the ending char or a size limit.
1651 */
1652 c = CUR_CHAR(l);
1653 while ((nbchars < max) && (c != end) &&
1654 (c != end2) && (c != end3)) {
1655
1656 if (c == 0) break;
1657 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1658 int val = htmlParseCharRef(ctxt);
1659 COPY_BUF(0,buffer,nbchars,val);
1660 NEXTL(l);
1661 } else if ((c == '&') && (ctxt->token != '&')) {
1662 ent = htmlParseEntityRef(ctxt, &name);
1663 if (name != NULL) {
1664 if (ent != NULL) {
1665 int val = ent->value;
1666 COPY_BUF(0,buffer,nbchars,val);
1667 NEXTL(l);
1668 } else {
1669 const xmlChar *cur = name;
1670
1671 buffer[nbchars++] = '&';
1672 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1673 growBuffer(buffer);
1674 }
1675 while (*cur != 0) {
1676 buffer[nbchars++] = *cur++;
1677 }
1678 buffer[nbchars++] = ';';
1679 }
1680 }
1681 } else {
1682 COPY_BUF(l,buffer,nbchars,c);
1683 NEXTL(l);
1684 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1685 growBuffer(buffer);
1686 }
1687 }
1688 c = CUR_CHAR(l);
1689 }
1690 buffer[nbchars++] = 0;
1691 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001692#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001693}
1694
1695/************************************************************************
1696 * *
1697 * Commodity functions to handle streams *
1698 * *
1699 ************************************************************************/
1700
1701/**
Owen Taylor3473f882001-02-23 17:55:21 +00001702 * htmlNewInputStream:
1703 * @ctxt: an HTML parser context
1704 *
1705 * Create a new input stream structure
1706 * Returns the new input stream or NULL
1707 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001708static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001709htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1710 htmlParserInputPtr input;
1711
1712 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1713 if (input == NULL) {
1714 ctxt->errNo = XML_ERR_NO_MEMORY;
1715 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1716 ctxt->sax->error(ctxt->userData,
1717 "malloc: couldn't allocate a new input stream\n");
1718 return(NULL);
1719 }
1720 memset(input, 0, sizeof(htmlParserInput));
1721 input->filename = NULL;
1722 input->directory = NULL;
1723 input->base = NULL;
1724 input->cur = NULL;
1725 input->buf = NULL;
1726 input->line = 1;
1727 input->col = 1;
1728 input->buf = NULL;
1729 input->free = NULL;
1730 input->version = NULL;
1731 input->consumed = 0;
1732 input->length = 0;
1733 return(input);
1734}
1735
1736
1737/************************************************************************
1738 * *
1739 * Commodity functions, cleanup needed ? *
1740 * *
1741 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility.
 * The table is read-only: both the strings and the pointers are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001756
1757/**
1758 * areBlanks:
1759 * @ctxt: an HTML parser context
1760 * @str: a xmlChar *
1761 * @len: the size of @str
1762 *
1763 * Is this a sequence of blank chars that one can ignore ?
1764 *
1765 * Returns 1 if ignorable 0 otherwise.
1766 */
1767
1768static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001769 unsigned int i;
1770 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001771 xmlNodePtr lastChild;
1772
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001773 for (j = 0;j < len;j++)
1774 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001775
1776 if (CUR == 0) return(1);
1777 if (CUR != '<') return(0);
1778 if (ctxt->name == NULL)
1779 return(1);
1780 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1781 return(1);
1782 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1783 return(1);
1784 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1785 return(1);
1786 if (ctxt->node == NULL) return(0);
1787 lastChild = xmlGetLastChild(ctxt->node);
1788 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001789 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1790 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001791 /* keep ws in constructs like ...<b> </b>...
1792 for all tags "b" allowing PCDATA */
1793 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1794 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1795 return(0);
1796 }
1797 }
Owen Taylor3473f882001-02-23 17:55:21 +00001798 } else if (xmlNodeIsText(lastChild)) {
1799 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001800 } else {
1801 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1802 for all tags "p" allowing PCDATA */
1803 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1804 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1805 return(0);
1806 }
1807 }
Owen Taylor3473f882001-02-23 17:55:21 +00001808 }
1809 return(1);
1810}
1811
1812/**
Owen Taylor3473f882001-02-23 17:55:21 +00001813 * htmlNewDocNoDtD:
1814 * @URI: URI for the dtd, or NULL
1815 * @ExternalID: the external ID of the DTD, or NULL
1816 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001817 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1818 * are NULL
1819 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001820 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001821 */
1822htmlDocPtr
1823htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1824 xmlDocPtr cur;
1825
1826 /*
1827 * Allocate a new document and fill the fields.
1828 */
1829 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1830 if (cur == NULL) {
1831 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001832 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001833 return(NULL);
1834 }
1835 memset(cur, 0, sizeof(xmlDoc));
1836
1837 cur->type = XML_HTML_DOCUMENT_NODE;
1838 cur->version = NULL;
1839 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001840 cur->doc = cur;
1841 cur->name = NULL;
1842 cur->children = NULL;
1843 cur->extSubset = NULL;
1844 cur->oldNs = NULL;
1845 cur->encoding = NULL;
1846 cur->standalone = 1;
1847 cur->compression = 0;
1848 cur->ids = NULL;
1849 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001850 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001851 if ((ExternalID != NULL) ||
1852 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001853 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001854 return(cur);
1855}
1856
1857/**
1858 * htmlNewDoc:
1859 * @URI: URI for the dtd, or NULL
1860 * @ExternalID: the external ID of the DTD, or NULL
1861 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001862 * Creates a new HTML document
1863 *
Owen Taylor3473f882001-02-23 17:55:21 +00001864 * Returns a new document
1865 */
1866htmlDocPtr
1867htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1868 if ((URI == NULL) && (ExternalID == NULL))
1869 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001870 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1871 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001872
1873 return(htmlNewDocNoDtD(URI, ExternalID));
1874}
1875
1876
1877/************************************************************************
1878 * *
1879 * The parser itself *
1880 * Relates to http://www.w3.org/TR/html40 *
1881 * *
1882 ************************************************************************/
1883
1884/************************************************************************
1885 * *
1886 * The parser itself *
1887 * *
1888 ************************************************************************/
1889
1890/**
1891 * htmlParseHTMLName:
1892 * @ctxt: an HTML parser context
1893 *
1894 * parse an HTML tag or attribute name, note that we convert it to lowercase
1895 * since HTML names are not case-sensitive.
1896 *
1897 * Returns the Tag Name parsed or NULL
1898 */
1899
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001900static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001901htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1902 xmlChar *ret = NULL;
1903 int i = 0;
1904 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1905
1906 if (!IS_LETTER(CUR) && (CUR != '_') &&
1907 (CUR != ':')) return(NULL);
1908
1909 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1910 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1911 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1912 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1913 else loc[i] = CUR;
1914 i++;
1915
1916 NEXT;
1917 }
1918
1919 ret = xmlStrndup(loc, i);
1920
1921 return(ret);
1922}
1923
1924/**
1925 * htmlParseName:
1926 * @ctxt: an HTML parser context
1927 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001928 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001929 *
1930 * Returns the Name parsed or NULL
1931 */
1932
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001933static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001934htmlParseName(htmlParserCtxtPtr ctxt) {
1935 xmlChar buf[HTML_MAX_NAMELEN];
1936 int len = 0;
1937
1938 GROW;
1939 if (!IS_LETTER(CUR) && (CUR != '_')) {
1940 return(NULL);
1941 }
1942
1943 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1944 (CUR == '.') || (CUR == '-') ||
1945 (CUR == '_') || (CUR == ':') ||
1946 (IS_COMBINING(CUR)) ||
1947 (IS_EXTENDER(CUR))) {
1948 buf[len++] = CUR;
1949 NEXT;
1950 if (len >= HTML_MAX_NAMELEN) {
1951 xmlGenericError(xmlGenericErrorContext,
1952 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1953 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1954 (CUR == '.') || (CUR == '-') ||
1955 (CUR == '_') || (CUR == ':') ||
1956 (IS_COMBINING(CUR)) ||
1957 (IS_EXTENDER(CUR)))
1958 NEXT;
1959 break;
1960 }
1961 }
1962 return(xmlStrndup(buf, len));
1963}
1964
1965/**
1966 * htmlParseHTMLAttribute:
1967 * @ctxt: an HTML parser context
1968 * @stop: a char stop value
1969 *
1970 * parse an HTML attribute value till the stop (quote), if
1971 * stop is 0 then it stops at the first space
1972 *
1973 * Returns the attribute parsed or NULL
1974 */
1975
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001976static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001977htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1978 xmlChar *buffer = NULL;
1979 int buffer_size = 0;
1980 xmlChar *out = NULL;
1981 xmlChar *name = NULL;
1982
1983 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001984 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001985
1986 /*
1987 * allocate a translation buffer.
1988 */
1989 buffer_size = HTML_PARSER_BUFFER_SIZE;
1990 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1991 if (buffer == NULL) {
1992 perror("htmlParseHTMLAttribute: malloc failed");
1993 return(NULL);
1994 }
1995 out = buffer;
1996
1997 /*
1998 * Ok loop until we reach one of the ending chars
1999 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002000 while ((CUR != 0) && (CUR != stop)) {
2001 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002002 if ((stop == 0) && (IS_BLANK(CUR))) break;
2003 if (CUR == '&') {
2004 if (NXT(1) == '#') {
2005 unsigned int c;
2006 int bits;
2007
2008 c = htmlParseCharRef(ctxt);
2009 if (c < 0x80)
2010 { *out++ = c; bits= -6; }
2011 else if (c < 0x800)
2012 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2013 else if (c < 0x10000)
2014 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2015 else
2016 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2017
2018 for ( ; bits >= 0; bits-= 6) {
2019 *out++ = ((c >> bits) & 0x3F) | 0x80;
2020 }
2021 } else {
2022 ent = htmlParseEntityRef(ctxt, &name);
2023 if (name == NULL) {
2024 *out++ = '&';
2025 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002026 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002027
2028 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002029 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002030 }
2031 } else if (ent == NULL) {
2032 *out++ = '&';
2033 cur = name;
2034 while (*cur != 0) {
2035 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002036 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002037
2038 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002039 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002040 }
2041 *out++ = *cur++;
2042 }
2043 xmlFree(name);
2044 } else {
2045 unsigned int c;
2046 int bits;
2047
2048 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002049 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002050
2051 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002052 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002053 }
2054 c = (xmlChar)ent->value;
2055 if (c < 0x80)
2056 { *out++ = c; bits= -6; }
2057 else if (c < 0x800)
2058 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2059 else if (c < 0x10000)
2060 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2061 else
2062 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2063
2064 for ( ; bits >= 0; bits-= 6) {
2065 *out++ = ((c >> bits) & 0x3F) | 0x80;
2066 }
2067 xmlFree(name);
2068 }
2069 }
2070 } else {
2071 unsigned int c;
2072 int bits, l;
2073
2074 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002075 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002076
2077 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002078 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002079 }
2080 c = CUR_CHAR(l);
2081 if (c < 0x80)
2082 { *out++ = c; bits= -6; }
2083 else if (c < 0x800)
2084 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2085 else if (c < 0x10000)
2086 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2087 else
2088 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2089
2090 for ( ; bits >= 0; bits-= 6) {
2091 *out++ = ((c >> bits) & 0x3F) | 0x80;
2092 }
2093 NEXT;
2094 }
2095 }
2096 *out++ = 0;
2097 return(buffer);
2098}
2099
2100/**
Owen Taylor3473f882001-02-23 17:55:21 +00002101 * htmlParseEntityRef:
2102 * @ctxt: an HTML parser context
2103 * @str: location to store the entity name
2104 *
2105 * parse an HTML ENTITY references
2106 *
2107 * [68] EntityRef ::= '&' Name ';'
2108 *
2109 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2110 * if non-NULL *str will have to be freed by the caller.
2111 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002112const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002113htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2114 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002115 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002116 *str = NULL;
2117
2118 if (CUR == '&') {
2119 NEXT;
2120 name = htmlParseName(ctxt);
2121 if (name == NULL) {
2122 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2123 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2124 ctxt->wellFormed = 0;
2125 } else {
2126 GROW;
2127 if (CUR == ';') {
2128 *str = name;
2129
2130 /*
2131 * Lookup the entity in the table.
2132 */
2133 ent = htmlEntityLookup(name);
2134 if (ent != NULL) /* OK that's ugly !!! */
2135 NEXT;
2136 } else {
2137 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2138 ctxt->sax->error(ctxt->userData,
2139 "htmlParseEntityRef: expecting ';'\n");
2140 *str = name;
2141 }
2142 }
2143 }
2144 return(ent);
2145}
2146
2147/**
2148 * htmlParseAttValue:
2149 * @ctxt: an HTML parser context
2150 *
2151 * parse a value for an attribute
2152 * Note: the parser won't do substitution of entities here, this
2153 * will be handled later in xmlStringGetNodeList, unless it was
2154 * asked for ctxt->replaceEntities != 0
2155 *
2156 * Returns the AttValue parsed or NULL.
2157 */
2158
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002159static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002160htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2161 xmlChar *ret = NULL;
2162
2163 if (CUR == '"') {
2164 NEXT;
2165 ret = htmlParseHTMLAttribute(ctxt, '"');
2166 if (CUR != '"') {
2167 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2168 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2169 ctxt->wellFormed = 0;
2170 } else
2171 NEXT;
2172 } else if (CUR == '\'') {
2173 NEXT;
2174 ret = htmlParseHTMLAttribute(ctxt, '\'');
2175 if (CUR != '\'') {
2176 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2177 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2178 ctxt->wellFormed = 0;
2179 } else
2180 NEXT;
2181 } else {
2182 /*
2183 * That's an HTMLism, the attribute value may not be quoted
2184 */
2185 ret = htmlParseHTMLAttribute(ctxt, 0);
2186 if (ret == NULL) {
2187 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2188 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2189 ctxt->wellFormed = 0;
2190 }
2191 }
2192 return(ret);
2193}
2194
2195/**
2196 * htmlParseSystemLiteral:
2197 * @ctxt: an HTML parser context
2198 *
2199 * parse an HTML Literal
2200 *
2201 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2202 *
2203 * Returns the SystemLiteral parsed or NULL
2204 */
2205
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002206static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002207htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2208 const xmlChar *q;
2209 xmlChar *ret = NULL;
2210
2211 if (CUR == '"') {
2212 NEXT;
2213 q = CUR_PTR;
2214 while ((IS_CHAR(CUR)) && (CUR != '"'))
2215 NEXT;
2216 if (!IS_CHAR(CUR)) {
2217 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2218 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2219 ctxt->wellFormed = 0;
2220 } else {
2221 ret = xmlStrndup(q, CUR_PTR - q);
2222 NEXT;
2223 }
2224 } else if (CUR == '\'') {
2225 NEXT;
2226 q = CUR_PTR;
2227 while ((IS_CHAR(CUR)) && (CUR != '\''))
2228 NEXT;
2229 if (!IS_CHAR(CUR)) {
2230 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2231 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2232 ctxt->wellFormed = 0;
2233 } else {
2234 ret = xmlStrndup(q, CUR_PTR - q);
2235 NEXT;
2236 }
2237 } else {
2238 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2239 ctxt->sax->error(ctxt->userData,
2240 "SystemLiteral \" or ' expected\n");
2241 ctxt->wellFormed = 0;
2242 }
2243
2244 return(ret);
2245}
2246
2247/**
2248 * htmlParsePubidLiteral:
2249 * @ctxt: an HTML parser context
2250 *
2251 * parse an HTML public literal
2252 *
2253 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2254 *
2255 * Returns the PubidLiteral parsed or NULL.
2256 */
2257
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002258static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002259htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2260 const xmlChar *q;
2261 xmlChar *ret = NULL;
2262 /*
2263 * Name ::= (Letter | '_') (NameChar)*
2264 */
2265 if (CUR == '"') {
2266 NEXT;
2267 q = CUR_PTR;
2268 while (IS_PUBIDCHAR(CUR)) NEXT;
2269 if (CUR != '"') {
2270 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2271 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2272 ctxt->wellFormed = 0;
2273 } else {
2274 ret = xmlStrndup(q, CUR_PTR - q);
2275 NEXT;
2276 }
2277 } else if (CUR == '\'') {
2278 NEXT;
2279 q = CUR_PTR;
2280 while ((IS_LETTER(CUR)) && (CUR != '\''))
2281 NEXT;
2282 if (!IS_LETTER(CUR)) {
2283 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2284 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2285 ctxt->wellFormed = 0;
2286 } else {
2287 ret = xmlStrndup(q, CUR_PTR - q);
2288 NEXT;
2289 }
2290 } else {
2291 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2292 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2293 ctxt->wellFormed = 0;
2294 }
2295
2296 return(ret);
2297}
2298
2299/**
2300 * htmlParseScript:
2301 * @ctxt: an HTML parser context
2302 *
2303 * parse the content of an HTML SCRIPT or STYLE element
2304 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2305 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2306 * http://www.w3.org/TR/html4/types.html#type-script
2307 * http://www.w3.org/TR/html4/types.html#h-6.15
2308 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2309 *
2310 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2311 * element and the value of intrinsic event attributes. User agents must
2312 * not evaluate script data as HTML markup but instead must pass it on as
2313 * data to a script engine.
2314 * NOTES:
2315 * - The content is passed like CDATA
2316 * - the attributes for style and scripting "onXXX" are also described
2317 * as CDATA but SGML allows entities references in attributes so their
2318 * processing is identical as other attributes
2319 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002320static void
Owen Taylor3473f882001-02-23 17:55:21 +00002321htmlParseScript(htmlParserCtxtPtr ctxt) {
2322 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2323 int nbchar = 0;
2324 xmlChar cur;
2325
2326 SHRINK;
2327 cur = CUR;
2328 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002329 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2330 (NXT(3) == '-')) {
2331 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2332 if (ctxt->sax->cdataBlock!= NULL) {
2333 /*
2334 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2335 */
2336 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2337 }
2338 }
2339 nbchar = 0;
2340 htmlParseComment(ctxt);
2341 cur = CUR;
2342 continue;
2343 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002344 /*
2345 * One should break here, the specification is clear:
2346 * Authors should therefore escape "</" within the content.
2347 * Escape mechanisms are specific to each scripting or
2348 * style sheet language.
2349 */
2350 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2351 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2352 break; /* while */
2353 }
2354 buf[nbchar++] = cur;
2355 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2356 if (ctxt->sax->cdataBlock!= NULL) {
2357 /*
2358 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2359 */
2360 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2361 }
2362 nbchar = 0;
2363 }
2364 NEXT;
2365 cur = CUR;
2366 }
2367 if (!(IS_CHAR(cur))) {
2368 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2369 ctxt->sax->error(ctxt->userData,
2370 "Invalid char in CDATA 0x%X\n", cur);
2371 ctxt->wellFormed = 0;
2372 NEXT;
2373 }
2374
2375 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2376 if (ctxt->sax->cdataBlock!= NULL) {
2377 /*
2378 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2379 */
2380 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2381 }
2382 }
2383}
2384
2385
2386/**
2387 * htmlParseCharData:
2388 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002389 *
2390 * parse a CharData section.
2391 * if we are within a CDATA section ']]>' marks an end of section.
2392 *
2393 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2394 */
2395
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002396static void
2397htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002398 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2399 int nbchar = 0;
2400 int cur, l;
2401
2402 SHRINK;
2403 cur = CUR_CHAR(l);
2404 while (((cur != '<') || (ctxt->token == '<')) &&
2405 ((cur != '&') || (ctxt->token == '&')) &&
2406 (IS_CHAR(cur))) {
2407 COPY_BUF(l,buf,nbchar,cur);
2408 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2409 /*
2410 * Ok the segment is to be consumed as chars.
2411 */
2412 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2413 if (areBlanks(ctxt, buf, nbchar)) {
2414 if (ctxt->sax->ignorableWhitespace != NULL)
2415 ctxt->sax->ignorableWhitespace(ctxt->userData,
2416 buf, nbchar);
2417 } else {
2418 htmlCheckParagraph(ctxt);
2419 if (ctxt->sax->characters != NULL)
2420 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2421 }
2422 }
2423 nbchar = 0;
2424 }
2425 NEXTL(l);
2426 cur = CUR_CHAR(l);
2427 }
2428 if (nbchar != 0) {
2429 /*
2430 * Ok the segment is to be consumed as chars.
2431 */
2432 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2433 if (areBlanks(ctxt, buf, nbchar)) {
2434 if (ctxt->sax->ignorableWhitespace != NULL)
2435 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2436 } else {
2437 htmlCheckParagraph(ctxt);
2438 if (ctxt->sax->characters != NULL)
2439 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2440 }
2441 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002442 } else {
2443 /*
2444 * Loop detection
2445 */
2446 if (cur == 0)
2447 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002448 }
2449}
2450
2451/**
2452 * htmlParseExternalID:
2453 * @ctxt: an HTML parser context
2454 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002455 *
2456 * Parse an External ID or a Public ID
2457 *
Owen Taylor3473f882001-02-23 17:55:21 +00002458 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2459 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2460 *
2461 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2462 *
2463 * Returns the function returns SystemLiteral and in the second
2464 * case publicID receives PubidLiteral, is strict is off
2465 * it is possible to return NULL and have publicID set.
2466 */
2467
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002468static xmlChar *
2469htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002470 xmlChar *URI = NULL;
2471
2472 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2473 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2474 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2475 SKIP(6);
2476 if (!IS_BLANK(CUR)) {
2477 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2478 ctxt->sax->error(ctxt->userData,
2479 "Space required after 'SYSTEM'\n");
2480 ctxt->wellFormed = 0;
2481 }
2482 SKIP_BLANKS;
2483 URI = htmlParseSystemLiteral(ctxt);
2484 if (URI == NULL) {
2485 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2486 ctxt->sax->error(ctxt->userData,
2487 "htmlParseExternalID: SYSTEM, no URI\n");
2488 ctxt->wellFormed = 0;
2489 }
2490 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2491 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2492 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2493 SKIP(6);
2494 if (!IS_BLANK(CUR)) {
2495 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2496 ctxt->sax->error(ctxt->userData,
2497 "Space required after 'PUBLIC'\n");
2498 ctxt->wellFormed = 0;
2499 }
2500 SKIP_BLANKS;
2501 *publicID = htmlParsePubidLiteral(ctxt);
2502 if (*publicID == NULL) {
2503 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2504 ctxt->sax->error(ctxt->userData,
2505 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2506 ctxt->wellFormed = 0;
2507 }
2508 SKIP_BLANKS;
2509 if ((CUR == '"') || (CUR == '\'')) {
2510 URI = htmlParseSystemLiteral(ctxt);
2511 }
2512 }
2513 return(URI);
2514}
2515
2516/**
2517 * htmlParseComment:
2518 * @ctxt: an HTML parser context
2519 *
2520 * Parse an XML (SGML) comment <!-- .... -->
2521 *
2522 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2523 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002524static void
Owen Taylor3473f882001-02-23 17:55:21 +00002525htmlParseComment(htmlParserCtxtPtr ctxt) {
2526 xmlChar *buf = NULL;
2527 int len;
2528 int size = HTML_PARSER_BUFFER_SIZE;
2529 int q, ql;
2530 int r, rl;
2531 int cur, l;
2532 xmlParserInputState state;
2533
2534 /*
2535 * Check that there is a comment right here.
2536 */
2537 if ((RAW != '<') || (NXT(1) != '!') ||
2538 (NXT(2) != '-') || (NXT(3) != '-')) return;
2539
2540 state = ctxt->instate;
2541 ctxt->instate = XML_PARSER_COMMENT;
2542 SHRINK;
2543 SKIP(4);
2544 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2545 if (buf == NULL) {
2546 xmlGenericError(xmlGenericErrorContext,
2547 "malloc of %d byte failed\n", size);
2548 ctxt->instate = state;
2549 return;
2550 }
2551 q = CUR_CHAR(ql);
2552 NEXTL(ql);
2553 r = CUR_CHAR(rl);
2554 NEXTL(rl);
2555 cur = CUR_CHAR(l);
2556 len = 0;
2557 while (IS_CHAR(cur) &&
2558 ((cur != '>') ||
2559 (r != '-') || (q != '-'))) {
2560 if (len + 5 >= size) {
2561 size *= 2;
2562 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2563 if (buf == NULL) {
2564 xmlGenericError(xmlGenericErrorContext,
2565 "realloc of %d byte failed\n", size);
2566 ctxt->instate = state;
2567 return;
2568 }
2569 }
2570 COPY_BUF(ql,buf,len,q);
2571 q = r;
2572 ql = rl;
2573 r = cur;
2574 rl = l;
2575 NEXTL(l);
2576 cur = CUR_CHAR(l);
2577 if (cur == 0) {
2578 SHRINK;
2579 GROW;
2580 cur = CUR_CHAR(l);
2581 }
2582 }
2583 buf[len] = 0;
2584 if (!IS_CHAR(cur)) {
2585 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2586 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2587 ctxt->sax->error(ctxt->userData,
2588 "Comment not terminated \n<!--%.50s\n", buf);
2589 ctxt->wellFormed = 0;
2590 xmlFree(buf);
2591 } else {
2592 NEXT;
2593 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2594 (!ctxt->disableSAX))
2595 ctxt->sax->comment(ctxt->userData, buf);
2596 xmlFree(buf);
2597 }
2598 ctxt->instate = state;
2599}
2600
2601/**
2602 * htmlParseCharRef:
2603 * @ctxt: an HTML parser context
2604 *
2605 * parse Reference declarations
2606 *
2607 * [66] CharRef ::= '&#' [0-9]+ ';' |
2608 * '&#x' [0-9a-fA-F]+ ';'
2609 *
2610 * Returns the value parsed (as an int)
2611 */
2612int
2613htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2614 int val = 0;
2615
2616 if ((CUR == '&') && (NXT(1) == '#') &&
2617 (NXT(2) == 'x')) {
2618 SKIP(3);
2619 while (CUR != ';') {
2620 if ((CUR >= '0') && (CUR <= '9'))
2621 val = val * 16 + (CUR - '0');
2622 else if ((CUR >= 'a') && (CUR <= 'f'))
2623 val = val * 16 + (CUR - 'a') + 10;
2624 else if ((CUR >= 'A') && (CUR <= 'F'))
2625 val = val * 16 + (CUR - 'A') + 10;
2626 else {
2627 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2628 ctxt->sax->error(ctxt->userData,
2629 "htmlParseCharRef: invalid hexadecimal value\n");
2630 ctxt->wellFormed = 0;
2631 return(0);
2632 }
2633 NEXT;
2634 }
2635 if (CUR == ';')
2636 NEXT;
2637 } else if ((CUR == '&') && (NXT(1) == '#')) {
2638 SKIP(2);
2639 while (CUR != ';') {
2640 if ((CUR >= '0') && (CUR <= '9'))
2641 val = val * 10 + (CUR - '0');
2642 else {
2643 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2644 ctxt->sax->error(ctxt->userData,
2645 "htmlParseCharRef: invalid decimal value\n");
2646 ctxt->wellFormed = 0;
2647 return(0);
2648 }
2649 NEXT;
2650 }
2651 if (CUR == ';')
2652 NEXT;
2653 } else {
2654 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2655 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2656 ctxt->wellFormed = 0;
2657 }
2658 /*
2659 * Check the value IS_CHAR ...
2660 */
2661 if (IS_CHAR(val)) {
2662 return(val);
2663 } else {
2664 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2665 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2666 val);
2667 ctxt->wellFormed = 0;
2668 }
2669 return(0);
2670}
2671
2672
2673/**
2674 * htmlParseDocTypeDecl :
2675 * @ctxt: an HTML parser context
2676 *
2677 * parse a DOCTYPE declaration
2678 *
2679 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2680 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2681 */
2682
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002683static void
Owen Taylor3473f882001-02-23 17:55:21 +00002684htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2685 xmlChar *name;
2686 xmlChar *ExternalID = NULL;
2687 xmlChar *URI = NULL;
2688
2689 /*
2690 * We know that '<!DOCTYPE' has been detected.
2691 */
2692 SKIP(9);
2693
2694 SKIP_BLANKS;
2695
2696 /*
2697 * Parse the DOCTYPE name.
2698 */
2699 name = htmlParseName(ctxt);
2700 if (name == NULL) {
2701 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2702 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2703 ctxt->wellFormed = 0;
2704 }
2705 /*
2706 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2707 */
2708
2709 SKIP_BLANKS;
2710
2711 /*
2712 * Check for SystemID and ExternalID
2713 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002714 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002715 SKIP_BLANKS;
2716
2717 /*
2718 * We should be at the end of the DOCTYPE declaration.
2719 */
2720 if (CUR != '>') {
2721 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002722 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002723 ctxt->wellFormed = 0;
2724 /* We shouldn't try to resynchronize ... */
2725 }
2726 NEXT;
2727
2728 /*
2729 * Create or update the document accordingly to the DOCTYPE
2730 */
2731 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2732 (!ctxt->disableSAX))
2733 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2734
2735 /*
2736 * Cleanup, since we don't use all those identifiers
2737 */
2738 if (URI != NULL) xmlFree(URI);
2739 if (ExternalID != NULL) xmlFree(ExternalID);
2740 if (name != NULL) xmlFree(name);
2741}
2742
2743/**
2744 * htmlParseAttribute:
2745 * @ctxt: an HTML parser context
2746 * @value: a xmlChar ** used to store the value of the attribute
2747 *
2748 * parse an attribute
2749 *
2750 * [41] Attribute ::= Name Eq AttValue
2751 *
2752 * [25] Eq ::= S? '=' S?
2753 *
2754 * With namespace:
2755 *
2756 * [NS 11] Attribute ::= QName Eq AttValue
2757 *
2758 * Also the case QName == xmlns:??? is handled independently as a namespace
2759 * definition.
2760 *
2761 * Returns the attribute name, and the value in *value.
2762 */
2763
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002764static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002765htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2766 xmlChar *name, *val = NULL;
2767
2768 *value = NULL;
2769 name = htmlParseHTMLName(ctxt);
2770 if (name == NULL) {
2771 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2772 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2773 ctxt->wellFormed = 0;
2774 return(NULL);
2775 }
2776
2777 /*
2778 * read the value
2779 */
2780 SKIP_BLANKS;
2781 if (CUR == '=') {
2782 NEXT;
2783 SKIP_BLANKS;
2784 val = htmlParseAttValue(ctxt);
2785 /******
2786 } else {
2787 * TODO : some attribute must have values, some may not
2788 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2789 ctxt->sax->warning(ctxt->userData,
2790 "No value for attribute %s\n", name); */
2791 }
2792
2793 *value = val;
2794 return(name);
2795}
2796
2797/**
2798 * htmlCheckEncoding:
2799 * @ctxt: an HTML parser context
2800 * @attvalue: the attribute value
2801 *
2802 * Checks an http-equiv attribute from a Meta tag to detect
2803 * the encoding
2804 * If a new encoding is detected the parser is switched to decode
2805 * it and pass UTF8
2806 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002807static void
Owen Taylor3473f882001-02-23 17:55:21 +00002808htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2809 const xmlChar *encoding;
2810
2811 if ((ctxt == NULL) || (attvalue == NULL))
2812 return;
2813
2814 /* do not change encoding */
2815 if (ctxt->input->encoding != NULL)
2816 return;
2817
2818 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2819 if (encoding != NULL) {
2820 encoding += 8;
2821 } else {
2822 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2823 if (encoding != NULL)
2824 encoding += 9;
2825 }
2826 if (encoding != NULL) {
2827 xmlCharEncoding enc;
2828 xmlCharEncodingHandlerPtr handler;
2829
2830 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2831
2832 if (ctxt->input->encoding != NULL)
2833 xmlFree((xmlChar *) ctxt->input->encoding);
2834 ctxt->input->encoding = xmlStrdup(encoding);
2835
2836 enc = xmlParseCharEncoding((const char *) encoding);
2837 /*
2838 * registered set of known encodings
2839 */
2840 if (enc != XML_CHAR_ENCODING_ERROR) {
2841 xmlSwitchEncoding(ctxt, enc);
2842 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2843 } else {
2844 /*
2845 * fallback for unknown encodings
2846 */
2847 handler = xmlFindCharEncodingHandler((const char *) encoding);
2848 if (handler != NULL) {
2849 xmlSwitchToEncoding(ctxt, handler);
2850 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2851 } else {
2852 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2853 }
2854 }
2855
2856 if ((ctxt->input->buf != NULL) &&
2857 (ctxt->input->buf->encoder != NULL) &&
2858 (ctxt->input->buf->raw != NULL) &&
2859 (ctxt->input->buf->buffer != NULL)) {
2860 int nbchars;
2861 int processed;
2862
2863 /*
2864 * convert as much as possible to the parser reading buffer.
2865 */
2866 processed = ctxt->input->cur - ctxt->input->base;
2867 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2868 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2869 ctxt->input->buf->buffer,
2870 ctxt->input->buf->raw);
2871 if (nbchars < 0) {
2872 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2873 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2874 ctxt->sax->error(ctxt->userData,
2875 "htmlCheckEncoding: encoder error\n");
2876 }
2877 ctxt->input->base =
2878 ctxt->input->cur = ctxt->input->buf->buffer->content;
2879 }
2880 }
2881}
2882
2883/**
2884 * htmlCheckMeta:
2885 * @ctxt: an HTML parser context
2886 * @atts: the attributes values
2887 *
2888 * Checks an attributes from a Meta tag
2889 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002890static void
Owen Taylor3473f882001-02-23 17:55:21 +00002891htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2892 int i;
2893 const xmlChar *att, *value;
2894 int http = 0;
2895 const xmlChar *content = NULL;
2896
2897 if ((ctxt == NULL) || (atts == NULL))
2898 return;
2899
2900 i = 0;
2901 att = atts[i++];
2902 while (att != NULL) {
2903 value = atts[i++];
2904 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2905 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2906 http = 1;
2907 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2908 content = value;
2909 att = atts[i++];
2910 }
2911 if ((http) && (content != NULL))
2912 htmlCheckEncoding(ctxt, content);
2913
2914}
2915
2916/**
2917 * htmlParseStartTag:
2918 * @ctxt: an HTML parser context
2919 *
2920 * parse a start of tag either for rule element or
2921 * EmptyElement. In both case we don't parse the tag closing chars.
2922 *
2923 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2924 *
2925 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2926 *
2927 * With namespace:
2928 *
2929 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2930 *
2931 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2932 *
2933 */
2934
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002935static void
Owen Taylor3473f882001-02-23 17:55:21 +00002936htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2937 xmlChar *name;
2938 xmlChar *attname;
2939 xmlChar *attvalue;
2940 const xmlChar **atts = NULL;
2941 int nbatts = 0;
2942 int maxatts = 0;
2943 int meta = 0;
2944 int i;
2945
2946 if (CUR != '<') return;
2947 NEXT;
2948
2949 GROW;
2950 name = htmlParseHTMLName(ctxt);
2951 if (name == NULL) {
2952 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2953 ctxt->sax->error(ctxt->userData,
2954 "htmlParseStartTag: invalid element name\n");
2955 ctxt->wellFormed = 0;
2956 /* Dump the bogus tag like browsers do */
2957 while ((IS_CHAR(CUR)) && (CUR != '>'))
2958 NEXT;
2959 return;
2960 }
2961 if (xmlStrEqual(name, BAD_CAST"meta"))
2962 meta = 1;
2963
2964 /*
2965 * Check for auto-closure of HTML elements.
2966 */
2967 htmlAutoClose(ctxt, name);
2968
2969 /*
2970 * Check for implied HTML elements.
2971 */
2972 htmlCheckImplied(ctxt, name);
2973
2974 /*
2975 * Avoid html at any level > 0, head at any level != 1
2976 * or any attempt to recurse body
2977 */
2978 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2979 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2980 ctxt->sax->error(ctxt->userData,
2981 "htmlParseStartTag: misplaced <html> tag\n");
2982 ctxt->wellFormed = 0;
2983 xmlFree(name);
2984 return;
2985 }
2986 if ((ctxt->nameNr != 1) &&
2987 (xmlStrEqual(name, BAD_CAST"head"))) {
2988 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2989 ctxt->sax->error(ctxt->userData,
2990 "htmlParseStartTag: misplaced <head> tag\n");
2991 ctxt->wellFormed = 0;
2992 xmlFree(name);
2993 return;
2994 }
2995 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002996 int indx;
2997 for (indx = 0;indx < ctxt->nameNr;indx++) {
2998 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002999 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3000 ctxt->sax->error(ctxt->userData,
3001 "htmlParseStartTag: misplaced <body> tag\n");
3002 ctxt->wellFormed = 0;
3003 xmlFree(name);
3004 return;
3005 }
3006 }
3007 }
3008
3009 /*
3010 * Now parse the attributes, it ends up with the ending
3011 *
3012 * (S Attribute)* S?
3013 */
3014 SKIP_BLANKS;
3015 while ((IS_CHAR(CUR)) &&
3016 (CUR != '>') &&
3017 ((CUR != '/') || (NXT(1) != '>'))) {
3018 long cons = ctxt->nbChars;
3019
3020 GROW;
3021 attname = htmlParseAttribute(ctxt, &attvalue);
3022 if (attname != NULL) {
3023
3024 /*
3025 * Well formedness requires at most one declaration of an attribute
3026 */
3027 for (i = 0; i < nbatts;i += 2) {
3028 if (xmlStrEqual(atts[i], attname)) {
3029 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3030 ctxt->sax->error(ctxt->userData,
3031 "Attribute %s redefined\n",
3032 attname);
3033 ctxt->wellFormed = 0;
3034 xmlFree(attname);
3035 if (attvalue != NULL)
3036 xmlFree(attvalue);
3037 goto failed;
3038 }
3039 }
3040
3041 /*
3042 * Add the pair to atts
3043 */
3044 if (atts == NULL) {
3045 maxatts = 10;
3046 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3047 if (atts == NULL) {
3048 xmlGenericError(xmlGenericErrorContext,
3049 "malloc of %ld byte failed\n",
3050 maxatts * (long)sizeof(xmlChar *));
3051 if (name != NULL) xmlFree(name);
3052 return;
3053 }
3054 } else if (nbatts + 4 > maxatts) {
3055 maxatts *= 2;
3056 atts = (const xmlChar **) xmlRealloc((void *) atts,
3057 maxatts * sizeof(xmlChar *));
3058 if (atts == NULL) {
3059 xmlGenericError(xmlGenericErrorContext,
3060 "realloc of %ld byte failed\n",
3061 maxatts * (long)sizeof(xmlChar *));
3062 if (name != NULL) xmlFree(name);
3063 return;
3064 }
3065 }
3066 atts[nbatts++] = attname;
3067 atts[nbatts++] = attvalue;
3068 atts[nbatts] = NULL;
3069 atts[nbatts + 1] = NULL;
3070 }
3071 else {
3072 /* Dump the bogus attribute string up to the next blank or
3073 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003074 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3075 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003076 NEXT;
3077 }
3078
3079failed:
3080 SKIP_BLANKS;
3081 if (cons == ctxt->nbChars) {
3082 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3083 ctxt->sax->error(ctxt->userData,
3084 "htmlParseStartTag: problem parsing attributes\n");
3085 ctxt->wellFormed = 0;
3086 break;
3087 }
3088 }
3089
3090 /*
3091 * Handle specific association to the META tag
3092 */
3093 if (meta)
3094 htmlCheckMeta(ctxt, atts);
3095
3096 /*
3097 * SAX: Start of Element !
3098 */
3099 htmlnamePush(ctxt, xmlStrdup(name));
3100#ifdef DEBUG
3101 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3102#endif
3103 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3104 ctxt->sax->startElement(ctxt->userData, name, atts);
3105
3106 if (atts != NULL) {
3107 for (i = 0;i < nbatts;i++) {
3108 if (atts[i] != NULL)
3109 xmlFree((xmlChar *) atts[i]);
3110 }
3111 xmlFree((void *) atts);
3112 }
3113 if (name != NULL) xmlFree(name);
3114}
3115
3116/**
3117 * htmlParseEndTag:
3118 * @ctxt: an HTML parser context
3119 *
3120 * parse an end of tag
3121 *
3122 * [42] ETag ::= '</' Name S? '>'
3123 *
3124 * With namespace
3125 *
3126 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003127 *
3128 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003129 */
3130
Daniel Veillardf420ac52001-07-04 16:04:09 +00003131static int
Owen Taylor3473f882001-02-23 17:55:21 +00003132htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3133 xmlChar *name;
3134 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003135 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003136
3137 if ((CUR != '<') || (NXT(1) != '/')) {
3138 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3139 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3140 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003141 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003142 }
3143 SKIP(2);
3144
3145 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003146 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003147
3148 /*
3149 * We should definitely be at the ending "S? '>'" part
3150 */
3151 SKIP_BLANKS;
3152 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3153 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3154 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3155 ctxt->wellFormed = 0;
3156 } else
3157 NEXT;
3158
3159 /*
3160 * If the name read is not one of the element in the parsing stack
3161 * then return, it's just an error.
3162 */
3163 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3164 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3165 }
3166 if (i < 0) {
3167 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3168 ctxt->sax->error(ctxt->userData,
3169 "Unexpected end tag : %s\n", name);
3170 xmlFree(name);
3171 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003172 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003173 }
3174
3175
3176 /*
3177 * Check for auto-closure of HTML elements.
3178 */
3179
3180 htmlAutoCloseOnClose(ctxt, name);
3181
3182 /*
3183 * Well formedness constraints, opening and closing must match.
3184 * With the exception that the autoclose may have popped stuff out
3185 * of the stack.
3186 */
3187 if (!xmlStrEqual(name, ctxt->name)) {
3188#ifdef DEBUG
3189 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3190#endif
3191 if ((ctxt->name != NULL) &&
3192 (!xmlStrEqual(ctxt->name, name))) {
3193 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3194 ctxt->sax->error(ctxt->userData,
3195 "Opening and ending tag mismatch: %s and %s\n",
3196 name, ctxt->name);
3197 ctxt->wellFormed = 0;
3198 }
3199 }
3200
3201 /*
3202 * SAX: End of Tag
3203 */
3204 oldname = ctxt->name;
3205 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3206 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3207 ctxt->sax->endElement(ctxt->userData, name);
3208 oldname = htmlnamePop(ctxt);
3209 if (oldname != NULL) {
3210#ifdef DEBUG
3211 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3212#endif
3213 xmlFree(oldname);
3214#ifdef DEBUG
3215 } else {
3216 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3217#endif
3218 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003219 ret = 1;
3220 } else {
3221 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003222 }
3223
3224 if (name != NULL)
3225 xmlFree(name);
3226
Daniel Veillardf420ac52001-07-04 16:04:09 +00003227 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003228}
3229
3230
3231/**
3232 * htmlParseReference:
3233 * @ctxt: an HTML parser context
3234 *
3235 * parse and handle entity references in content,
3236 * this will end-up in a call to character() since this is either a
3237 * CharRef, or a predefined entity.
3238 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003239static void
Owen Taylor3473f882001-02-23 17:55:21 +00003240htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003241 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003242 xmlChar out[6];
3243 xmlChar *name;
3244 if (CUR != '&') return;
3245
3246 if (NXT(1) == '#') {
3247 unsigned int c;
3248 int bits, i = 0;
3249
3250 c = htmlParseCharRef(ctxt);
3251 if (c == 0)
3252 return;
3253
3254 if (c < 0x80) { out[i++]= c; bits= -6; }
3255 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3256 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3257 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3258
3259 for ( ; bits >= 0; bits-= 6) {
3260 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3261 }
3262 out[i] = 0;
3263
3264 htmlCheckParagraph(ctxt);
3265 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3266 ctxt->sax->characters(ctxt->userData, out, i);
3267 } else {
3268 ent = htmlParseEntityRef(ctxt, &name);
3269 if (name == NULL) {
3270 htmlCheckParagraph(ctxt);
3271 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3272 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3273 return;
3274 }
3275 if ((ent == NULL) || (ent->value <= 0)) {
3276 htmlCheckParagraph(ctxt);
3277 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3278 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3279 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3280 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3281 }
3282 } else {
3283 unsigned int c;
3284 int bits, i = 0;
3285
3286 c = ent->value;
3287 if (c < 0x80)
3288 { out[i++]= c; bits= -6; }
3289 else if (c < 0x800)
3290 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3291 else if (c < 0x10000)
3292 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3293 else
3294 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3295
3296 for ( ; bits >= 0; bits-= 6) {
3297 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3298 }
3299 out[i] = 0;
3300
3301 htmlCheckParagraph(ctxt);
3302 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3303 ctxt->sax->characters(ctxt->userData, out, i);
3304 }
3305 xmlFree(name);
3306 }
3307}
3308
3309/**
3310 * htmlParseContent:
3311 * @ctxt: an HTML parser context
3312 * @name: the node name
3313 *
3314 * Parse a content: comment, sub-element, reference or text.
3315 *
3316 */
3317
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003318static void
Owen Taylor3473f882001-02-23 17:55:21 +00003319htmlParseContent(htmlParserCtxtPtr ctxt) {
3320 xmlChar *currentNode;
3321 int depth;
3322
3323 currentNode = xmlStrdup(ctxt->name);
3324 depth = ctxt->nameNr;
3325 while (1) {
3326 long cons = ctxt->nbChars;
3327
3328 GROW;
3329 /*
3330 * Our tag or one of it's parent or children is ending.
3331 */
3332 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003333 if (htmlParseEndTag(ctxt) &&
3334 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3335 if (currentNode != NULL)
3336 xmlFree(currentNode);
3337 return;
3338 }
3339 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003340 }
3341
3342 /*
3343 * Has this node been popped out during parsing of
3344 * the next element
3345 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003346 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3347 (!xmlStrEqual(currentNode, ctxt->name)))
3348 {
Owen Taylor3473f882001-02-23 17:55:21 +00003349 if (currentNode != NULL) xmlFree(currentNode);
3350 return;
3351 }
3352
Daniel Veillardf9533d12001-03-03 10:04:57 +00003353 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3354 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003355 /*
3356 * Handle SCRIPT/STYLE separately
3357 */
3358 htmlParseScript(ctxt);
3359 } else {
3360 /*
3361 * Sometimes DOCTYPE arrives in the middle of the document
3362 */
3363 if ((CUR == '<') && (NXT(1) == '!') &&
3364 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3365 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3366 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3367 (UPP(8) == 'E')) {
3368 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3369 ctxt->sax->error(ctxt->userData,
3370 "Misplaced DOCTYPE declaration\n");
3371 ctxt->wellFormed = 0;
3372 htmlParseDocTypeDecl(ctxt);
3373 }
3374
3375 /*
3376 * First case : a comment
3377 */
3378 if ((CUR == '<') && (NXT(1) == '!') &&
3379 (NXT(2) == '-') && (NXT(3) == '-')) {
3380 htmlParseComment(ctxt);
3381 }
3382
3383 /*
3384 * Second case : a sub-element.
3385 */
3386 else if (CUR == '<') {
3387 htmlParseElement(ctxt);
3388 }
3389
3390 /*
3391 * Third case : a reference. If if has not been resolved,
3392 * parsing returns it's Name, create the node
3393 */
3394 else if (CUR == '&') {
3395 htmlParseReference(ctxt);
3396 }
3397
3398 /*
3399 * Fourth : end of the resource
3400 */
3401 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003402 htmlAutoCloseOnEnd(ctxt);
3403 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003404 }
3405
3406 /*
3407 * Last case, text. Note that References are handled directly.
3408 */
3409 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003410 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003411 }
3412
3413 if (cons == ctxt->nbChars) {
3414 if (ctxt->node != NULL) {
3415 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3416 ctxt->sax->error(ctxt->userData,
3417 "detected an error in element content\n");
3418 ctxt->wellFormed = 0;
3419 }
3420 break;
3421 }
3422 }
3423 GROW;
3424 }
3425 if (currentNode != NULL) xmlFree(currentNode);
3426}
3427
3428/**
3429 * htmlParseElement:
3430 * @ctxt: an HTML parser context
3431 *
3432 * parse an HTML element, this is highly recursive
3433 *
3434 * [39] element ::= EmptyElemTag | STag content ETag
3435 *
3436 * [41] Attribute ::= Name Eq AttValue
3437 */
3438
3439void
3440htmlParseElement(htmlParserCtxtPtr ctxt) {
3441 xmlChar *name;
3442 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003443 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003444 htmlParserNodeInfo node_info;
3445 xmlChar *oldname;
3446 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003447 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003448
3449 /* Capture start position */
3450 if (ctxt->record_info) {
3451 node_info.begin_pos = ctxt->input->consumed +
3452 (CUR_PTR - ctxt->input->base);
3453 node_info.begin_line = ctxt->input->line;
3454 }
3455
3456 oldname = xmlStrdup(ctxt->name);
3457 htmlParseStartTag(ctxt);
3458 name = ctxt->name;
3459#ifdef DEBUG
3460 if (oldname == NULL)
3461 xmlGenericError(xmlGenericErrorContext,
3462 "Start of element %s\n", name);
3463 else if (name == NULL)
3464 xmlGenericError(xmlGenericErrorContext,
3465 "Start of element failed, was %s\n", oldname);
3466 else
3467 xmlGenericError(xmlGenericErrorContext,
3468 "Start of element %s, was %s\n", name, oldname);
3469#endif
3470 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3471 (name == NULL)) {
3472 if (CUR == '>')
3473 NEXT;
3474 if (oldname != NULL)
3475 xmlFree(oldname);
3476 return;
3477 }
3478 if (oldname != NULL)
3479 xmlFree(oldname);
3480
3481 /*
3482 * Lookup the info for that element.
3483 */
3484 info = htmlTagLookup(name);
3485 if (info == NULL) {
3486 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3487 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3488 name);
3489 ctxt->wellFormed = 0;
3490 } else if (info->depr) {
3491/***************************
3492 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3493 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3494 name);
3495 ***************************/
3496 }
3497
3498 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003499 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003500 */
3501 if ((CUR == '/') && (NXT(1) == '>')) {
3502 SKIP(2);
3503 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3504 ctxt->sax->endElement(ctxt->userData, name);
3505 oldname = htmlnamePop(ctxt);
3506#ifdef DEBUG
3507 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3508#endif
3509 if (oldname != NULL)
3510 xmlFree(oldname);
3511 return;
3512 }
3513
3514 if (CUR == '>') {
3515 NEXT;
3516 } else {
3517 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3518 ctxt->sax->error(ctxt->userData,
3519 "Couldn't find end of Start Tag %s\n",
3520 name);
3521 ctxt->wellFormed = 0;
3522
3523 /*
3524 * end of parsing of this node.
3525 */
3526 if (xmlStrEqual(name, ctxt->name)) {
3527 nodePop(ctxt);
3528 oldname = htmlnamePop(ctxt);
3529#ifdef DEBUG
3530 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3531#endif
3532 if (oldname != NULL)
3533 xmlFree(oldname);
3534 }
3535
3536 /*
3537 * Capture end position and add node
3538 */
3539 if ( currentNode != NULL && ctxt->record_info ) {
3540 node_info.end_pos = ctxt->input->consumed +
3541 (CUR_PTR - ctxt->input->base);
3542 node_info.end_line = ctxt->input->line;
3543 node_info.node = ctxt->node;
3544 xmlParserAddNodeInfo(ctxt, &node_info);
3545 }
3546 return;
3547 }
3548
3549 /*
3550 * Check for an Empty Element from DTD definition
3551 */
3552 if ((info != NULL) && (info->empty)) {
3553 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3554 ctxt->sax->endElement(ctxt->userData, name);
3555 oldname = htmlnamePop(ctxt);
3556#ifdef DEBUG
3557 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3558#endif
3559 if (oldname != NULL)
3560 xmlFree(oldname);
3561 return;
3562 }
3563
3564 /*
3565 * Parse the content of the element:
3566 */
3567 currentNode = xmlStrdup(ctxt->name);
3568 depth = ctxt->nameNr;
3569 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003570 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003571 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003572 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003573 if (ctxt->nameNr < depth) break;
3574 }
3575
Owen Taylor3473f882001-02-23 17:55:21 +00003576 /*
3577 * Capture end position and add node
3578 */
3579 if ( currentNode != NULL && ctxt->record_info ) {
3580 node_info.end_pos = ctxt->input->consumed +
3581 (CUR_PTR - ctxt->input->base);
3582 node_info.end_line = ctxt->input->line;
3583 node_info.node = ctxt->node;
3584 xmlParserAddNodeInfo(ctxt, &node_info);
3585 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003586 if (!IS_CHAR(CUR)) {
3587 htmlAutoCloseOnEnd(ctxt);
3588 }
3589
Owen Taylor3473f882001-02-23 17:55:21 +00003590 if (currentNode != NULL)
3591 xmlFree(currentNode);
3592}
3593
3594/**
3595 * htmlParseDocument :
3596 * @ctxt: an HTML parser context
3597 *
3598 * parse an HTML document (and build a tree if using the standard SAX
3599 * interface).
3600 *
3601 * Returns 0, -1 in case of error. the parser context is augmented
3602 * as a result of the parsing.
3603 */
3604
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003605int
Owen Taylor3473f882001-02-23 17:55:21 +00003606htmlParseDocument(htmlParserCtxtPtr ctxt) {
3607 xmlDtdPtr dtd;
3608
Daniel Veillardd0463562001-10-13 09:15:48 +00003609 xmlInitParser();
3610
Owen Taylor3473f882001-02-23 17:55:21 +00003611 htmlDefaultSAXHandlerInit();
3612 ctxt->html = 1;
3613
3614 GROW;
3615 /*
3616 * SAX: beginning of the document processing.
3617 */
3618 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3619 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3620
3621 /*
3622 * Wipe out everything which is before the first '<'
3623 */
3624 SKIP_BLANKS;
3625 if (CUR == 0) {
3626 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3627 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3628 ctxt->wellFormed = 0;
3629 }
3630
3631 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3632 ctxt->sax->startDocument(ctxt->userData);
3633
3634
3635 /*
3636 * Parse possible comments before any content
3637 */
3638 while ((CUR == '<') && (NXT(1) == '!') &&
3639 (NXT(2) == '-') && (NXT(3) == '-')) {
3640 htmlParseComment(ctxt);
3641 SKIP_BLANKS;
3642 }
3643
3644
3645 /*
3646 * Then possibly doc type declaration(s) and more Misc
3647 * (doctypedecl Misc*)?
3648 */
3649 if ((CUR == '<') && (NXT(1) == '!') &&
3650 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3651 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3652 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3653 (UPP(8) == 'E')) {
3654 htmlParseDocTypeDecl(ctxt);
3655 }
3656 SKIP_BLANKS;
3657
3658 /*
3659 * Parse possible comments before any content
3660 */
3661 while ((CUR == '<') && (NXT(1) == '!') &&
3662 (NXT(2) == '-') && (NXT(3) == '-')) {
3663 htmlParseComment(ctxt);
3664 SKIP_BLANKS;
3665 }
3666
3667 /*
3668 * Time to start parsing the tree itself
3669 */
3670 htmlParseContent(ctxt);
3671
3672 /*
3673 * autoclose
3674 */
3675 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003676 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003677
3678
3679 /*
3680 * SAX: end of the document processing.
3681 */
3682 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3683 ctxt->sax->endDocument(ctxt->userData);
3684
3685 if (ctxt->myDoc != NULL) {
3686 dtd = xmlGetIntSubset(ctxt->myDoc);
3687 if (dtd == NULL)
3688 ctxt->myDoc->intSubset =
3689 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3690 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3691 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3692 }
3693 if (! ctxt->wellFormed) return(-1);
3694 return(0);
3695}
3696
3697
3698/************************************************************************
3699 * *
3700 * Parser contexts handling *
3701 * *
3702 ************************************************************************/
3703
3704/**
3705 * xmlInitParserCtxt:
3706 * @ctxt: an HTML parser context
3707 *
3708 * Initialize a parser context
3709 */
3710
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003711static void
Owen Taylor3473f882001-02-23 17:55:21 +00003712htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3713{
3714 htmlSAXHandler *sax;
3715
3716 if (ctxt == NULL) return;
3717 memset(ctxt, 0, sizeof(htmlParserCtxt));
3718
3719 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3720 if (sax == NULL) {
3721 xmlGenericError(xmlGenericErrorContext,
3722 "htmlInitParserCtxt: out of memory\n");
3723 }
3724 else
3725 memset(sax, 0, sizeof(htmlSAXHandler));
3726
3727 /* Allocate the Input stack */
3728 ctxt->inputTab = (htmlParserInputPtr *)
3729 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3730 if (ctxt->inputTab == NULL) {
3731 xmlGenericError(xmlGenericErrorContext,
3732 "htmlInitParserCtxt: out of memory\n");
3733 ctxt->inputNr = 0;
3734 ctxt->inputMax = 0;
3735 ctxt->input = NULL;
3736 return;
3737 }
3738 ctxt->inputNr = 0;
3739 ctxt->inputMax = 5;
3740 ctxt->input = NULL;
3741 ctxt->version = NULL;
3742 ctxt->encoding = NULL;
3743 ctxt->standalone = -1;
3744 ctxt->instate = XML_PARSER_START;
3745
3746 /* Allocate the Node stack */
3747 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3748 if (ctxt->nodeTab == NULL) {
3749 xmlGenericError(xmlGenericErrorContext,
3750 "htmlInitParserCtxt: out of memory\n");
3751 ctxt->nodeNr = 0;
3752 ctxt->nodeMax = 0;
3753 ctxt->node = NULL;
3754 ctxt->inputNr = 0;
3755 ctxt->inputMax = 0;
3756 ctxt->input = NULL;
3757 return;
3758 }
3759 ctxt->nodeNr = 0;
3760 ctxt->nodeMax = 10;
3761 ctxt->node = NULL;
3762
3763 /* Allocate the Name stack */
3764 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3765 if (ctxt->nameTab == NULL) {
3766 xmlGenericError(xmlGenericErrorContext,
3767 "htmlInitParserCtxt: out of memory\n");
3768 ctxt->nameNr = 0;
3769 ctxt->nameMax = 10;
3770 ctxt->name = NULL;
3771 ctxt->nodeNr = 0;
3772 ctxt->nodeMax = 0;
3773 ctxt->node = NULL;
3774 ctxt->inputNr = 0;
3775 ctxt->inputMax = 0;
3776 ctxt->input = NULL;
3777 return;
3778 }
3779 ctxt->nameNr = 0;
3780 ctxt->nameMax = 10;
3781 ctxt->name = NULL;
3782
3783 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3784 else {
3785 ctxt->sax = sax;
3786 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3787 }
3788 ctxt->userData = ctxt;
3789 ctxt->myDoc = NULL;
3790 ctxt->wellFormed = 1;
3791 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003792 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003793 ctxt->html = 1;
3794 ctxt->record_info = 0;
3795 ctxt->validate = 0;
3796 ctxt->nbChars = 0;
3797 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003798 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003799 xmlInitNodeInfoSeq(&ctxt->node_seq);
3800}
3801
3802/**
3803 * htmlFreeParserCtxt:
3804 * @ctxt: an HTML parser context
3805 *
3806 * Free all the memory used by a parser context. However the parsed
3807 * document in ctxt->myDoc is not freed.
3808 */
3809
3810void
3811htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3812{
3813 xmlFreeParserCtxt(ctxt);
3814}
3815
3816/**
3817 * htmlCreateDocParserCtxt :
3818 * @cur: a pointer to an array of xmlChar
3819 * @encoding: a free form C string describing the HTML document encoding, or NULL
3820 *
3821 * Create a parser context for an HTML document.
3822 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003823 * TODO: check the need to add encoding handling there
3824 *
Owen Taylor3473f882001-02-23 17:55:21 +00003825 * Returns the new parser context or NULL
3826 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003827static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003828htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003829 htmlParserCtxtPtr ctxt;
3830 htmlParserInputPtr input;
3831 /* htmlCharEncoding enc; */
3832
3833 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3834 if (ctxt == NULL) {
3835 perror("malloc");
3836 return(NULL);
3837 }
3838 htmlInitParserCtxt(ctxt);
3839 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3840 if (input == NULL) {
3841 perror("malloc");
3842 xmlFree(ctxt);
3843 return(NULL);
3844 }
3845 memset(input, 0, sizeof(htmlParserInput));
3846
3847 input->line = 1;
3848 input->col = 1;
3849 input->base = cur;
3850 input->cur = cur;
3851
3852 inputPush(ctxt, input);
3853 return(ctxt);
3854}
3855
3856/************************************************************************
3857 * *
3858 * Progressive parsing interfaces *
3859 * *
3860 ************************************************************************/
3861
3862/**
3863 * htmlParseLookupSequence:
3864 * @ctxt: an HTML parser context
3865 * @first: the first char to lookup
3866 * @next: the next char to lookup or zero
3867 * @third: the next char to lookup or zero
3868 *
3869 * Try to find if a sequence (first, next, third) or just (first next) or
3870 * (first) is available in the input stream.
3871 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3872 * to avoid rescanning sequences of bytes, it DOES change the state of the
3873 * parser, do not use liberally.
3874 * This is basically similar to xmlParseLookupSequence()
3875 *
3876 * Returns the index to the current parsing point if the full sequence
3877 * is available, -1 otherwise.
3878 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003879static int
Owen Taylor3473f882001-02-23 17:55:21 +00003880htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3881 xmlChar next, xmlChar third) {
3882 int base, len;
3883 htmlParserInputPtr in;
3884 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003885 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003886
3887 in = ctxt->input;
3888 if (in == NULL) return(-1);
3889 base = in->cur - in->base;
3890 if (base < 0) return(-1);
3891 if (ctxt->checkIndex > base)
3892 base = ctxt->checkIndex;
3893 if (in->buf == NULL) {
3894 buf = in->base;
3895 len = in->length;
3896 } else {
3897 buf = in->buf->buffer->content;
3898 len = in->buf->buffer->use;
3899 }
3900 /* take into account the sequence length */
3901 if (third) len -= 2;
3902 else if (next) len --;
3903 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003904 if (!incomment && (base + 4 < len)) {
3905 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3906 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3907 incomment = 1;
3908 }
3909 /* do not increment base, some people use <!--> */
3910 }
3911 if (incomment) {
3912 if (base + 3 < len)
3913 return(-1);
3914 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3915 (buf[base + 2] == '>')) {
3916 incomment = 0;
3917 base += 2;
3918 }
3919 continue;
3920 }
Owen Taylor3473f882001-02-23 17:55:21 +00003921 if (buf[base] == first) {
3922 if (third != 0) {
3923 if ((buf[base + 1] != next) ||
3924 (buf[base + 2] != third)) continue;
3925 } else if (next != 0) {
3926 if (buf[base + 1] != next) continue;
3927 }
3928 ctxt->checkIndex = 0;
3929#ifdef DEBUG_PUSH
3930 if (next == 0)
3931 xmlGenericError(xmlGenericErrorContext,
3932 "HPP: lookup '%c' found at %d\n",
3933 first, base);
3934 else if (third == 0)
3935 xmlGenericError(xmlGenericErrorContext,
3936 "HPP: lookup '%c%c' found at %d\n",
3937 first, next, base);
3938 else
3939 xmlGenericError(xmlGenericErrorContext,
3940 "HPP: lookup '%c%c%c' found at %d\n",
3941 first, next, third, base);
3942#endif
3943 return(base - (in->cur - in->base));
3944 }
3945 }
3946 ctxt->checkIndex = base;
3947#ifdef DEBUG_PUSH
3948 if (next == 0)
3949 xmlGenericError(xmlGenericErrorContext,
3950 "HPP: lookup '%c' failed\n", first);
3951 else if (third == 0)
3952 xmlGenericError(xmlGenericErrorContext,
3953 "HPP: lookup '%c%c' failed\n", first, next);
3954 else
3955 xmlGenericError(xmlGenericErrorContext,
3956 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3957#endif
3958 return(-1);
3959}
3960
3961/**
3962 * htmlParseTryOrFinish:
3963 * @ctxt: an HTML parser context
3964 * @terminate: last chunk indicator
3965 *
3966 * Try to progress on parsing
3967 *
3968 * Returns zero if no parsing was possible
3969 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003970static int
Owen Taylor3473f882001-02-23 17:55:21 +00003971htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3972 int ret = 0;
3973 htmlParserInputPtr in;
3974 int avail = 0;
3975 xmlChar cur, next;
3976
3977#ifdef DEBUG_PUSH
3978 switch (ctxt->instate) {
3979 case XML_PARSER_EOF:
3980 xmlGenericError(xmlGenericErrorContext,
3981 "HPP: try EOF\n"); break;
3982 case XML_PARSER_START:
3983 xmlGenericError(xmlGenericErrorContext,
3984 "HPP: try START\n"); break;
3985 case XML_PARSER_MISC:
3986 xmlGenericError(xmlGenericErrorContext,
3987 "HPP: try MISC\n");break;
3988 case XML_PARSER_COMMENT:
3989 xmlGenericError(xmlGenericErrorContext,
3990 "HPP: try COMMENT\n");break;
3991 case XML_PARSER_PROLOG:
3992 xmlGenericError(xmlGenericErrorContext,
3993 "HPP: try PROLOG\n");break;
3994 case XML_PARSER_START_TAG:
3995 xmlGenericError(xmlGenericErrorContext,
3996 "HPP: try START_TAG\n");break;
3997 case XML_PARSER_CONTENT:
3998 xmlGenericError(xmlGenericErrorContext,
3999 "HPP: try CONTENT\n");break;
4000 case XML_PARSER_CDATA_SECTION:
4001 xmlGenericError(xmlGenericErrorContext,
4002 "HPP: try CDATA_SECTION\n");break;
4003 case XML_PARSER_END_TAG:
4004 xmlGenericError(xmlGenericErrorContext,
4005 "HPP: try END_TAG\n");break;
4006 case XML_PARSER_ENTITY_DECL:
4007 xmlGenericError(xmlGenericErrorContext,
4008 "HPP: try ENTITY_DECL\n");break;
4009 case XML_PARSER_ENTITY_VALUE:
4010 xmlGenericError(xmlGenericErrorContext,
4011 "HPP: try ENTITY_VALUE\n");break;
4012 case XML_PARSER_ATTRIBUTE_VALUE:
4013 xmlGenericError(xmlGenericErrorContext,
4014 "HPP: try ATTRIBUTE_VALUE\n");break;
4015 case XML_PARSER_DTD:
4016 xmlGenericError(xmlGenericErrorContext,
4017 "HPP: try DTD\n");break;
4018 case XML_PARSER_EPILOG:
4019 xmlGenericError(xmlGenericErrorContext,
4020 "HPP: try EPILOG\n");break;
4021 case XML_PARSER_PI:
4022 xmlGenericError(xmlGenericErrorContext,
4023 "HPP: try PI\n");break;
4024 case XML_PARSER_SYSTEM_LITERAL:
4025 xmlGenericError(xmlGenericErrorContext,
4026 "HPP: try SYSTEM_LITERAL\n");break;
4027 }
4028#endif
4029
4030 while (1) {
4031
4032 in = ctxt->input;
4033 if (in == NULL) break;
4034 if (in->buf == NULL)
4035 avail = in->length - (in->cur - in->base);
4036 else
4037 avail = in->buf->buffer->use - (in->cur - in->base);
4038 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004039 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004040 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4041 /*
4042 * SAX: end of the document processing.
4043 */
4044 ctxt->instate = XML_PARSER_EOF;
4045 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4046 ctxt->sax->endDocument(ctxt->userData);
4047 }
4048 }
4049 if (avail < 1)
4050 goto done;
4051 switch (ctxt->instate) {
4052 case XML_PARSER_EOF:
4053 /*
4054 * Document parsing is done !
4055 */
4056 goto done;
4057 case XML_PARSER_START:
4058 /*
4059 * Very first chars read from the document flow.
4060 */
4061 cur = in->cur[0];
4062 if (IS_BLANK(cur)) {
4063 SKIP_BLANKS;
4064 if (in->buf == NULL)
4065 avail = in->length - (in->cur - in->base);
4066 else
4067 avail = in->buf->buffer->use - (in->cur - in->base);
4068 }
4069 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4070 ctxt->sax->setDocumentLocator(ctxt->userData,
4071 &xmlDefaultSAXLocator);
4072 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4073 (!ctxt->disableSAX))
4074 ctxt->sax->startDocument(ctxt->userData);
4075
4076 cur = in->cur[0];
4077 next = in->cur[1];
4078 if ((cur == '<') && (next == '!') &&
4079 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4080 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4081 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4082 (UPP(8) == 'E')) {
4083 if ((!terminate) &&
4084 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4085 goto done;
4086#ifdef DEBUG_PUSH
4087 xmlGenericError(xmlGenericErrorContext,
4088 "HPP: Parsing internal subset\n");
4089#endif
4090 htmlParseDocTypeDecl(ctxt);
4091 ctxt->instate = XML_PARSER_PROLOG;
4092#ifdef DEBUG_PUSH
4093 xmlGenericError(xmlGenericErrorContext,
4094 "HPP: entering PROLOG\n");
4095#endif
4096 } else {
4097 ctxt->instate = XML_PARSER_MISC;
4098 }
4099#ifdef DEBUG_PUSH
4100 xmlGenericError(xmlGenericErrorContext,
4101 "HPP: entering MISC\n");
4102#endif
4103 break;
4104 case XML_PARSER_MISC:
4105 SKIP_BLANKS;
4106 if (in->buf == NULL)
4107 avail = in->length - (in->cur - in->base);
4108 else
4109 avail = in->buf->buffer->use - (in->cur - in->base);
4110 if (avail < 2)
4111 goto done;
4112 cur = in->cur[0];
4113 next = in->cur[1];
4114 if ((cur == '<') && (next == '!') &&
4115 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4116 if ((!terminate) &&
4117 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4118 goto done;
4119#ifdef DEBUG_PUSH
4120 xmlGenericError(xmlGenericErrorContext,
4121 "HPP: Parsing Comment\n");
4122#endif
4123 htmlParseComment(ctxt);
4124 ctxt->instate = XML_PARSER_MISC;
4125 } else if ((cur == '<') && (next == '!') &&
4126 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4127 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4128 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4129 (UPP(8) == 'E')) {
4130 if ((!terminate) &&
4131 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4132 goto done;
4133#ifdef DEBUG_PUSH
4134 xmlGenericError(xmlGenericErrorContext,
4135 "HPP: Parsing internal subset\n");
4136#endif
4137 htmlParseDocTypeDecl(ctxt);
4138 ctxt->instate = XML_PARSER_PROLOG;
4139#ifdef DEBUG_PUSH
4140 xmlGenericError(xmlGenericErrorContext,
4141 "HPP: entering PROLOG\n");
4142#endif
4143 } else if ((cur == '<') && (next == '!') &&
4144 (avail < 9)) {
4145 goto done;
4146 } else {
4147 ctxt->instate = XML_PARSER_START_TAG;
4148#ifdef DEBUG_PUSH
4149 xmlGenericError(xmlGenericErrorContext,
4150 "HPP: entering START_TAG\n");
4151#endif
4152 }
4153 break;
4154 case XML_PARSER_PROLOG:
4155 SKIP_BLANKS;
4156 if (in->buf == NULL)
4157 avail = in->length - (in->cur - in->base);
4158 else
4159 avail = in->buf->buffer->use - (in->cur - in->base);
4160 if (avail < 2)
4161 goto done;
4162 cur = in->cur[0];
4163 next = in->cur[1];
4164 if ((cur == '<') && (next == '!') &&
4165 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4166 if ((!terminate) &&
4167 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4168 goto done;
4169#ifdef DEBUG_PUSH
4170 xmlGenericError(xmlGenericErrorContext,
4171 "HPP: Parsing Comment\n");
4172#endif
4173 htmlParseComment(ctxt);
4174 ctxt->instate = XML_PARSER_PROLOG;
4175 } else if ((cur == '<') && (next == '!') &&
4176 (avail < 4)) {
4177 goto done;
4178 } else {
4179 ctxt->instate = XML_PARSER_START_TAG;
4180#ifdef DEBUG_PUSH
4181 xmlGenericError(xmlGenericErrorContext,
4182 "HPP: entering START_TAG\n");
4183#endif
4184 }
4185 break;
4186 case XML_PARSER_EPILOG:
4187 if (in->buf == NULL)
4188 avail = in->length - (in->cur - in->base);
4189 else
4190 avail = in->buf->buffer->use - (in->cur - in->base);
4191 if (avail < 1)
4192 goto done;
4193 cur = in->cur[0];
4194 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004195 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004196 goto done;
4197 }
4198 if (avail < 2)
4199 goto done;
4200 next = in->cur[1];
4201 if ((cur == '<') && (next == '!') &&
4202 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4203 if ((!terminate) &&
4204 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4205 goto done;
4206#ifdef DEBUG_PUSH
4207 xmlGenericError(xmlGenericErrorContext,
4208 "HPP: Parsing Comment\n");
4209#endif
4210 htmlParseComment(ctxt);
4211 ctxt->instate = XML_PARSER_EPILOG;
4212 } else if ((cur == '<') && (next == '!') &&
4213 (avail < 4)) {
4214 goto done;
4215 } else {
4216 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004217 ctxt->wellFormed = 0;
4218 ctxt->instate = XML_PARSER_EOF;
4219#ifdef DEBUG_PUSH
4220 xmlGenericError(xmlGenericErrorContext,
4221 "HPP: entering EOF\n");
4222#endif
4223 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4224 ctxt->sax->endDocument(ctxt->userData);
4225 goto done;
4226 }
4227 break;
4228 case XML_PARSER_START_TAG: {
4229 xmlChar *name, *oldname;
4230 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004231 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004232
4233 if (avail < 2)
4234 goto done;
4235 cur = in->cur[0];
4236 if (cur != '<') {
4237 ctxt->instate = XML_PARSER_CONTENT;
4238#ifdef DEBUG_PUSH
4239 xmlGenericError(xmlGenericErrorContext,
4240 "HPP: entering CONTENT\n");
4241#endif
4242 break;
4243 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004244 if (in->cur[1] == '/') {
4245 ctxt->instate = XML_PARSER_END_TAG;
4246 ctxt->checkIndex = 0;
4247#ifdef DEBUG_PUSH
4248 xmlGenericError(xmlGenericErrorContext,
4249 "HPP: entering END_TAG\n");
4250#endif
4251 break;
4252 }
Owen Taylor3473f882001-02-23 17:55:21 +00004253 if ((!terminate) &&
4254 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4255 goto done;
4256
4257 oldname = xmlStrdup(ctxt->name);
4258 htmlParseStartTag(ctxt);
4259 name = ctxt->name;
4260#ifdef DEBUG
4261 if (oldname == NULL)
4262 xmlGenericError(xmlGenericErrorContext,
4263 "Start of element %s\n", name);
4264 else if (name == NULL)
4265 xmlGenericError(xmlGenericErrorContext,
4266 "Start of element failed, was %s\n",
4267 oldname);
4268 else
4269 xmlGenericError(xmlGenericErrorContext,
4270 "Start of element %s, was %s\n",
4271 name, oldname);
4272#endif
4273 if (((depth == ctxt->nameNr) &&
4274 (xmlStrEqual(oldname, ctxt->name))) ||
4275 (name == NULL)) {
4276 if (CUR == '>')
4277 NEXT;
4278 if (oldname != NULL)
4279 xmlFree(oldname);
4280 break;
4281 }
4282 if (oldname != NULL)
4283 xmlFree(oldname);
4284
4285 /*
4286 * Lookup the info for that element.
4287 */
4288 info = htmlTagLookup(name);
4289 if (info == NULL) {
4290 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4291 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4292 name);
4293 ctxt->wellFormed = 0;
4294 } else if (info->depr) {
4295 /***************************
4296 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4297 ctxt->sax->warning(ctxt->userData,
4298 "Tag %s is deprecated\n",
4299 name);
4300 ***************************/
4301 }
4302
4303 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004304 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004305 */
4306 if ((CUR == '/') && (NXT(1) == '>')) {
4307 SKIP(2);
4308 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4309 ctxt->sax->endElement(ctxt->userData, name);
4310 oldname = htmlnamePop(ctxt);
4311#ifdef DEBUG
4312 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4313 oldname);
4314#endif
4315 if (oldname != NULL)
4316 xmlFree(oldname);
4317 ctxt->instate = XML_PARSER_CONTENT;
4318#ifdef DEBUG_PUSH
4319 xmlGenericError(xmlGenericErrorContext,
4320 "HPP: entering CONTENT\n");
4321#endif
4322 break;
4323 }
4324
4325 if (CUR == '>') {
4326 NEXT;
4327 } else {
4328 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4329 ctxt->sax->error(ctxt->userData,
4330 "Couldn't find end of Start Tag %s\n",
4331 name);
4332 ctxt->wellFormed = 0;
4333
4334 /*
4335 * end of parsing of this node.
4336 */
4337 if (xmlStrEqual(name, ctxt->name)) {
4338 nodePop(ctxt);
4339 oldname = htmlnamePop(ctxt);
4340#ifdef DEBUG
4341 xmlGenericError(xmlGenericErrorContext,
4342 "End of start tag problem: popping out %s\n", oldname);
4343#endif
4344 if (oldname != NULL)
4345 xmlFree(oldname);
4346 }
4347
4348 ctxt->instate = XML_PARSER_CONTENT;
4349#ifdef DEBUG_PUSH
4350 xmlGenericError(xmlGenericErrorContext,
4351 "HPP: entering CONTENT\n");
4352#endif
4353 break;
4354 }
4355
4356 /*
4357 * Check for an Empty Element from DTD definition
4358 */
4359 if ((info != NULL) && (info->empty)) {
4360 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4361 ctxt->sax->endElement(ctxt->userData, name);
4362 oldname = htmlnamePop(ctxt);
4363#ifdef DEBUG
4364 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4365#endif
4366 if (oldname != NULL)
4367 xmlFree(oldname);
4368 }
4369 ctxt->instate = XML_PARSER_CONTENT;
4370#ifdef DEBUG_PUSH
4371 xmlGenericError(xmlGenericErrorContext,
4372 "HPP: entering CONTENT\n");
4373#endif
4374 break;
4375 }
4376 case XML_PARSER_CONTENT: {
4377 long cons;
4378 /*
4379 * Handle preparsed entities and charRef
4380 */
4381 if (ctxt->token != 0) {
4382 xmlChar chr[2] = { 0 , 0 } ;
4383
4384 chr[0] = (xmlChar) ctxt->token;
4385 htmlCheckParagraph(ctxt);
4386 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4387 ctxt->sax->characters(ctxt->userData, chr, 1);
4388 ctxt->token = 0;
4389 ctxt->checkIndex = 0;
4390 }
4391 if ((avail == 1) && (terminate)) {
4392 cur = in->cur[0];
4393 if ((cur != '<') && (cur != '&')) {
4394 if (ctxt->sax != NULL) {
4395 if (IS_BLANK(cur)) {
4396 if (ctxt->sax->ignorableWhitespace != NULL)
4397 ctxt->sax->ignorableWhitespace(
4398 ctxt->userData, &cur, 1);
4399 } else {
4400 htmlCheckParagraph(ctxt);
4401 if (ctxt->sax->characters != NULL)
4402 ctxt->sax->characters(
4403 ctxt->userData, &cur, 1);
4404 }
4405 }
4406 ctxt->token = 0;
4407 ctxt->checkIndex = 0;
4408 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004409 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004410 }
Owen Taylor3473f882001-02-23 17:55:21 +00004411 }
4412 if (avail < 2)
4413 goto done;
4414 cur = in->cur[0];
4415 next = in->cur[1];
4416 cons = ctxt->nbChars;
4417 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4418 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4419 /*
4420 * Handle SCRIPT/STYLE separately
4421 */
4422 if ((!terminate) &&
4423 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4424 goto done;
4425 htmlParseScript(ctxt);
4426 if ((cur == '<') && (next == '/')) {
4427 ctxt->instate = XML_PARSER_END_TAG;
4428 ctxt->checkIndex = 0;
4429#ifdef DEBUG_PUSH
4430 xmlGenericError(xmlGenericErrorContext,
4431 "HPP: entering END_TAG\n");
4432#endif
4433 break;
4434 }
4435 } else {
4436 /*
4437 * Sometimes DOCTYPE arrives in the middle of the document
4438 */
4439 if ((cur == '<') && (next == '!') &&
4440 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4441 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4442 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4443 (UPP(8) == 'E')) {
4444 if ((!terminate) &&
4445 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4446 goto done;
4447 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4448 ctxt->sax->error(ctxt->userData,
4449 "Misplaced DOCTYPE declaration\n");
4450 ctxt->wellFormed = 0;
4451 htmlParseDocTypeDecl(ctxt);
4452 } else if ((cur == '<') && (next == '!') &&
4453 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4454 if ((!terminate) &&
4455 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4456 goto done;
4457#ifdef DEBUG_PUSH
4458 xmlGenericError(xmlGenericErrorContext,
4459 "HPP: Parsing Comment\n");
4460#endif
4461 htmlParseComment(ctxt);
4462 ctxt->instate = XML_PARSER_CONTENT;
4463 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4464 goto done;
4465 } else if ((cur == '<') && (next == '/')) {
4466 ctxt->instate = XML_PARSER_END_TAG;
4467 ctxt->checkIndex = 0;
4468#ifdef DEBUG_PUSH
4469 xmlGenericError(xmlGenericErrorContext,
4470 "HPP: entering END_TAG\n");
4471#endif
4472 break;
4473 } else if (cur == '<') {
4474 ctxt->instate = XML_PARSER_START_TAG;
4475 ctxt->checkIndex = 0;
4476#ifdef DEBUG_PUSH
4477 xmlGenericError(xmlGenericErrorContext,
4478 "HPP: entering START_TAG\n");
4479#endif
4480 break;
4481 } else if (cur == '&') {
4482 if ((!terminate) &&
4483 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4484 goto done;
4485#ifdef DEBUG_PUSH
4486 xmlGenericError(xmlGenericErrorContext,
4487 "HPP: Parsing Reference\n");
4488#endif
4489 /* TODO: check generation of subtrees if noent !!! */
4490 htmlParseReference(ctxt);
4491 } else {
4492 /* TODO Avoid the extra copy, handle directly !!!!!! */
4493 /*
4494 * Goal of the following test is :
4495 * - minimize calls to the SAX 'character' callback
4496 * when they are mergeable
4497 */
4498 if ((ctxt->inputNr == 1) &&
4499 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4500 if ((!terminate) &&
4501 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4502 goto done;
4503 }
4504 ctxt->checkIndex = 0;
4505#ifdef DEBUG_PUSH
4506 xmlGenericError(xmlGenericErrorContext,
4507 "HPP: Parsing char data\n");
4508#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004509 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004510 }
4511 }
4512 if (cons == ctxt->nbChars) {
4513 if (ctxt->node != NULL) {
4514 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4515 ctxt->sax->error(ctxt->userData,
4516 "detected an error in element content\n");
4517 ctxt->wellFormed = 0;
4518 }
4519 NEXT;
4520 break;
4521 }
4522
4523 break;
4524 }
4525 case XML_PARSER_END_TAG:
4526 if (avail < 2)
4527 goto done;
4528 if ((!terminate) &&
4529 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4530 goto done;
4531 htmlParseEndTag(ctxt);
4532 if (ctxt->nameNr == 0) {
4533 ctxt->instate = XML_PARSER_EPILOG;
4534 } else {
4535 ctxt->instate = XML_PARSER_CONTENT;
4536 }
4537 ctxt->checkIndex = 0;
4538#ifdef DEBUG_PUSH
4539 xmlGenericError(xmlGenericErrorContext,
4540 "HPP: entering CONTENT\n");
4541#endif
4542 break;
4543 case XML_PARSER_CDATA_SECTION:
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: internal error, state == CDATA\n");
4546 ctxt->instate = XML_PARSER_CONTENT;
4547 ctxt->checkIndex = 0;
4548#ifdef DEBUG_PUSH
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: entering CONTENT\n");
4551#endif
4552 break;
4553 case XML_PARSER_DTD:
4554 xmlGenericError(xmlGenericErrorContext,
4555 "HPP: internal error, state == DTD\n");
4556 ctxt->instate = XML_PARSER_CONTENT;
4557 ctxt->checkIndex = 0;
4558#ifdef DEBUG_PUSH
4559 xmlGenericError(xmlGenericErrorContext,
4560 "HPP: entering CONTENT\n");
4561#endif
4562 break;
4563 case XML_PARSER_COMMENT:
4564 xmlGenericError(xmlGenericErrorContext,
4565 "HPP: internal error, state == COMMENT\n");
4566 ctxt->instate = XML_PARSER_CONTENT;
4567 ctxt->checkIndex = 0;
4568#ifdef DEBUG_PUSH
4569 xmlGenericError(xmlGenericErrorContext,
4570 "HPP: entering CONTENT\n");
4571#endif
4572 break;
4573 case XML_PARSER_PI:
4574 xmlGenericError(xmlGenericErrorContext,
4575 "HPP: internal error, state == PI\n");
4576 ctxt->instate = XML_PARSER_CONTENT;
4577 ctxt->checkIndex = 0;
4578#ifdef DEBUG_PUSH
4579 xmlGenericError(xmlGenericErrorContext,
4580 "HPP: entering CONTENT\n");
4581#endif
4582 break;
4583 case XML_PARSER_ENTITY_DECL:
4584 xmlGenericError(xmlGenericErrorContext,
4585 "HPP: internal error, state == ENTITY_DECL\n");
4586 ctxt->instate = XML_PARSER_CONTENT;
4587 ctxt->checkIndex = 0;
4588#ifdef DEBUG_PUSH
4589 xmlGenericError(xmlGenericErrorContext,
4590 "HPP: entering CONTENT\n");
4591#endif
4592 break;
4593 case XML_PARSER_ENTITY_VALUE:
4594 xmlGenericError(xmlGenericErrorContext,
4595 "HPP: internal error, state == ENTITY_VALUE\n");
4596 ctxt->instate = XML_PARSER_CONTENT;
4597 ctxt->checkIndex = 0;
4598#ifdef DEBUG_PUSH
4599 xmlGenericError(xmlGenericErrorContext,
4600 "HPP: entering DTD\n");
4601#endif
4602 break;
4603 case XML_PARSER_ATTRIBUTE_VALUE:
4604 xmlGenericError(xmlGenericErrorContext,
4605 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4606 ctxt->instate = XML_PARSER_START_TAG;
4607 ctxt->checkIndex = 0;
4608#ifdef DEBUG_PUSH
4609 xmlGenericError(xmlGenericErrorContext,
4610 "HPP: entering START_TAG\n");
4611#endif
4612 break;
4613 case XML_PARSER_SYSTEM_LITERAL:
4614 xmlGenericError(xmlGenericErrorContext,
4615 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4616 ctxt->instate = XML_PARSER_CONTENT;
4617 ctxt->checkIndex = 0;
4618#ifdef DEBUG_PUSH
4619 xmlGenericError(xmlGenericErrorContext,
4620 "HPP: entering CONTENT\n");
4621#endif
4622 break;
4623 case XML_PARSER_IGNORE:
4624 xmlGenericError(xmlGenericErrorContext,
4625 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4626 ctxt->instate = XML_PARSER_CONTENT;
4627 ctxt->checkIndex = 0;
4628#ifdef DEBUG_PUSH
4629 xmlGenericError(xmlGenericErrorContext,
4630 "HPP: entering CONTENT\n");
4631#endif
4632 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004633 case XML_PARSER_PUBLIC_LITERAL:
4634 xmlGenericError(xmlGenericErrorContext,
4635 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4636 ctxt->instate = XML_PARSER_CONTENT;
4637 ctxt->checkIndex = 0;
4638#ifdef DEBUG_PUSH
4639 xmlGenericError(xmlGenericErrorContext,
4640 "HPP: entering CONTENT\n");
4641#endif
4642 break;
4643
Owen Taylor3473f882001-02-23 17:55:21 +00004644 }
4645 }
4646done:
4647 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004648 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004649 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4650 /*
4651 * SAX: end of the document processing.
4652 */
4653 ctxt->instate = XML_PARSER_EOF;
4654 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4655 ctxt->sax->endDocument(ctxt->userData);
4656 }
4657 }
4658 if ((ctxt->myDoc != NULL) &&
4659 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4660 (ctxt->instate == XML_PARSER_EPILOG))) {
4661 xmlDtdPtr dtd;
4662 dtd = xmlGetIntSubset(ctxt->myDoc);
4663 if (dtd == NULL)
4664 ctxt->myDoc->intSubset =
4665 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4666 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4667 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4668 }
4669#ifdef DEBUG_PUSH
4670 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4671#endif
4672 return(ret);
4673}
4674
4675/**
Owen Taylor3473f882001-02-23 17:55:21 +00004676 * htmlParseChunk:
4677 * @ctxt: an XML parser context
4678 * @chunk: an char array
4679 * @size: the size in byte of the chunk
4680 * @terminate: last chunk indicator
4681 *
4682 * Parse a Chunk of memory
4683 *
4684 * Returns zero if no error, the xmlParserErrors otherwise.
4685 */
4686int
4687htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4688 int terminate) {
4689 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4690 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4691 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4692 int cur = ctxt->input->cur - ctxt->input->base;
4693
4694 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4695 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4696 ctxt->input->cur = ctxt->input->base + cur;
4697#ifdef DEBUG_PUSH
4698 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4699#endif
4700
4701 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4702 htmlParseTryOrFinish(ctxt, terminate);
4703 } else if (ctxt->instate != XML_PARSER_EOF) {
4704 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4705 htmlParseTryOrFinish(ctxt, terminate);
4706 }
4707 if (terminate) {
4708 if ((ctxt->instate != XML_PARSER_EOF) &&
4709 (ctxt->instate != XML_PARSER_EPILOG) &&
4710 (ctxt->instate != XML_PARSER_MISC)) {
4711 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004712 ctxt->wellFormed = 0;
4713 }
4714 if (ctxt->instate != XML_PARSER_EOF) {
4715 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4716 ctxt->sax->endDocument(ctxt->userData);
4717 }
4718 ctxt->instate = XML_PARSER_EOF;
4719 }
4720 return((xmlParserErrors) ctxt->errNo);
4721}
4722
4723/************************************************************************
4724 * *
4725 * User entry points *
4726 * *
4727 ************************************************************************/
4728
4729/**
4730 * htmlCreatePushParserCtxt :
4731 * @sax: a SAX handler
4732 * @user_data: The user data returned on SAX callbacks
4733 * @chunk: a pointer to an array of chars
4734 * @size: number of chars in the array
4735 * @filename: an optional file name or URI
4736 * @enc: an optional encoding
4737 *
4738 * Create a parser context for using the HTML parser in push mode
4739 * To allow content encoding detection, @size should be >= 4
4740 * The value of @filename is used for fetching external entities
4741 * and error/warning reports.
4742 *
4743 * Returns the new parser context or NULL
4744 */
4745htmlParserCtxtPtr
4746htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4747 const char *chunk, int size, const char *filename,
4748 xmlCharEncoding enc) {
4749 htmlParserCtxtPtr ctxt;
4750 htmlParserInputPtr inputStream;
4751 xmlParserInputBufferPtr buf;
4752
Daniel Veillardd0463562001-10-13 09:15:48 +00004753 xmlInitParser();
4754
Owen Taylor3473f882001-02-23 17:55:21 +00004755 buf = xmlAllocParserInputBuffer(enc);
4756 if (buf == NULL) return(NULL);
4757
4758 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4759 if (ctxt == NULL) {
4760 xmlFree(buf);
4761 return(NULL);
4762 }
4763 memset(ctxt, 0, sizeof(htmlParserCtxt));
4764 htmlInitParserCtxt(ctxt);
4765 if (sax != NULL) {
4766 if (ctxt->sax != &htmlDefaultSAXHandler)
4767 xmlFree(ctxt->sax);
4768 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4769 if (ctxt->sax == NULL) {
4770 xmlFree(buf);
4771 xmlFree(ctxt);
4772 return(NULL);
4773 }
4774 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4775 if (user_data != NULL)
4776 ctxt->userData = user_data;
4777 }
4778 if (filename == NULL) {
4779 ctxt->directory = NULL;
4780 } else {
4781 ctxt->directory = xmlParserGetDirectory(filename);
4782 }
4783
4784 inputStream = htmlNewInputStream(ctxt);
4785 if (inputStream == NULL) {
4786 xmlFreeParserCtxt(ctxt);
4787 return(NULL);
4788 }
4789
4790 if (filename == NULL)
4791 inputStream->filename = NULL;
4792 else
4793 inputStream->filename = xmlMemStrdup(filename);
4794 inputStream->buf = buf;
4795 inputStream->base = inputStream->buf->buffer->content;
4796 inputStream->cur = inputStream->buf->buffer->content;
4797
4798 inputPush(ctxt, inputStream);
4799
4800 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4801 (ctxt->input->buf != NULL)) {
4802 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4803#ifdef DEBUG_PUSH
4804 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4805#endif
4806 }
4807
4808 return(ctxt);
4809}
4810
4811/**
4812 * htmlSAXParseDoc :
4813 * @cur: a pointer to an array of xmlChar
4814 * @encoding: a free form C string describing the HTML document encoding, or NULL
4815 * @sax: the SAX handler block
4816 * @userData: if using SAX, this pointer will be provided on callbacks.
4817 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004818 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4819 * to handle parse events. If sax is NULL, fallback to the default DOM
4820 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004821 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004822 * Returns the resulting document tree unless SAX is NULL or the document is
4823 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004824 */
4825
4826htmlDocPtr
4827htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4828 htmlDocPtr ret;
4829 htmlParserCtxtPtr ctxt;
4830
Daniel Veillardd0463562001-10-13 09:15:48 +00004831 xmlInitParser();
4832
Owen Taylor3473f882001-02-23 17:55:21 +00004833 if (cur == NULL) return(NULL);
4834
4835
4836 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4837 if (ctxt == NULL) return(NULL);
4838 if (sax != NULL) {
4839 ctxt->sax = sax;
4840 ctxt->userData = userData;
4841 }
4842
4843 htmlParseDocument(ctxt);
4844 ret = ctxt->myDoc;
4845 if (sax != NULL) {
4846 ctxt->sax = NULL;
4847 ctxt->userData = NULL;
4848 }
4849 htmlFreeParserCtxt(ctxt);
4850
4851 return(ret);
4852}
4853
4854/**
4855 * htmlParseDoc :
4856 * @cur: a pointer to an array of xmlChar
4857 * @encoding: a free form C string describing the HTML document encoding, or NULL
4858 *
4859 * parse an HTML in-memory document and build a tree.
4860 *
4861 * Returns the resulting document tree
4862 */
4863
4864htmlDocPtr
4865htmlParseDoc(xmlChar *cur, const char *encoding) {
4866 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4867}
4868
4869
4870/**
4871 * htmlCreateFileParserCtxt :
4872 * @filename: the filename
4873 * @encoding: a free form C string describing the HTML document encoding, or NULL
4874 *
4875 * Create a parser context for a file content.
4876 * Automatic support for ZLIB/Compress compressed document is provided
4877 * by default if found at compile-time.
4878 *
4879 * Returns the new parser context or NULL
4880 */
4881htmlParserCtxtPtr
4882htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4883{
4884 htmlParserCtxtPtr ctxt;
4885 htmlParserInputPtr inputStream;
4886 xmlParserInputBufferPtr buf;
4887 /* htmlCharEncoding enc; */
4888 xmlChar *content, *content_line = (xmlChar *) "charset=";
4889
4890 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4891 if (buf == NULL) return(NULL);
4892
4893 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4894 if (ctxt == NULL) {
4895 perror("malloc");
4896 return(NULL);
4897 }
4898 memset(ctxt, 0, sizeof(htmlParserCtxt));
4899 htmlInitParserCtxt(ctxt);
4900 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4901 if (inputStream == NULL) {
4902 perror("malloc");
4903 xmlFree(ctxt);
4904 return(NULL);
4905 }
4906 memset(inputStream, 0, sizeof(htmlParserInput));
4907
4908 inputStream->filename = xmlMemStrdup(filename);
4909 inputStream->line = 1;
4910 inputStream->col = 1;
4911 inputStream->buf = buf;
4912 inputStream->directory = NULL;
4913
4914 inputStream->base = inputStream->buf->buffer->content;
4915 inputStream->cur = inputStream->buf->buffer->content;
4916 inputStream->free = NULL;
4917
4918 inputPush(ctxt, inputStream);
4919
4920 /* set encoding */
4921 if (encoding) {
4922 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4923 if (content) {
4924 strcpy ((char *)content, (char *)content_line);
4925 strcat ((char *)content, (char *)encoding);
4926 htmlCheckEncoding (ctxt, content);
4927 xmlFree (content);
4928 }
4929 }
4930
4931 return(ctxt);
4932}
4933
4934/**
4935 * htmlSAXParseFile :
4936 * @filename: the filename
4937 * @encoding: a free form C string describing the HTML document encoding, or NULL
4938 * @sax: the SAX handler block
4939 * @userData: if using SAX, this pointer will be provided on callbacks.
4940 *
4941 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4942 * compressed document is provided by default if found at compile-time.
4943 * It use the given SAX function block to handle the parsing callback.
4944 * If sax is NULL, fallback to the default DOM tree building routines.
4945 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004946 * Returns the resulting document tree unless SAX is NULL or the document is
4947 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004948 */
4949
4950htmlDocPtr
4951htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4952 void *userData) {
4953 htmlDocPtr ret;
4954 htmlParserCtxtPtr ctxt;
4955 htmlSAXHandlerPtr oldsax = NULL;
4956
Daniel Veillardd0463562001-10-13 09:15:48 +00004957 xmlInitParser();
4958
Owen Taylor3473f882001-02-23 17:55:21 +00004959 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4960 if (ctxt == NULL) return(NULL);
4961 if (sax != NULL) {
4962 oldsax = ctxt->sax;
4963 ctxt->sax = sax;
4964 ctxt->userData = userData;
4965 }
4966
4967 htmlParseDocument(ctxt);
4968
4969 ret = ctxt->myDoc;
4970 if (sax != NULL) {
4971 ctxt->sax = oldsax;
4972 ctxt->userData = NULL;
4973 }
4974 htmlFreeParserCtxt(ctxt);
4975
4976 return(ret);
4977}
4978
4979/**
4980 * htmlParseFile :
4981 * @filename: the filename
4982 * @encoding: a free form C string describing the HTML document encoding, or NULL
4983 *
4984 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4985 * compressed document is provided by default if found at compile-time.
4986 *
4987 * Returns the resulting document tree
4988 */
4989
4990htmlDocPtr
4991htmlParseFile(const char *filename, const char *encoding) {
4992 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4993}
4994
4995/**
4996 * htmlHandleOmittedElem:
4997 * @val: int 0 or 1
4998 *
4999 * Set and return the previous value for handling HTML omitted tags.
5000 *
5001 * Returns the last value for 0 for no handling, 1 for auto insertion.
5002 */
5003
5004int
5005htmlHandleOmittedElem(int val) {
5006 int old = htmlOmittedDefaultValue;
5007
5008 htmlOmittedDefaultValue = val;
5009 return(old);
5010}
5011
5012#endif /* LIBXML_HTML_ENABLED */