/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
46#define HTML_MAX_NAMELEN 1000
47#define HTML_PARSER_BIG_BUFFER_SIZE 1000
48#define HTML_PARSER_BUFFER_SIZE 100
49
50/* #define DEBUG */
51/* #define DEBUG_PUSH */
52
Daniel Veillard22090732001-07-16 00:06:07 +000053static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000054
Daniel Veillard56a4cb82001-03-24 17:00:36 +000055xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000057static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
59/************************************************************************
60 * *
Owen Taylor3473f882001-02-23 17:55:21 +000061 * Parser stacks related functions and macros *
62 * *
63 ************************************************************************/
64
65/*
66 * Generic function for accessing stacks in the Parser Context
67 */
68
69#define PUSH_AND_POP(scope, type, name) \
70scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
71 if (ctxt->name##Nr >= ctxt->name##Max) { \
72 ctxt->name##Max *= 2; \
73 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75 if (ctxt->name##Tab == NULL) { \
76 xmlGenericError(xmlGenericErrorContext, \
77 "realloc failed !\n"); \
78 return(0); \
79 } \
80 } \
81 ctxt->name##Tab[ctxt->name##Nr] = value; \
82 ctxt->name = value; \
83 return(ctxt->name##Nr++); \
84} \
85scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
86 type ret; \
87 if (ctxt->name##Nr < 0) return(0); \
88 ctxt->name##Nr--; \
89 if (ctxt->name##Nr < 0) return(0); \
90 if (ctxt->name##Nr > 0) \
91 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
92 else \
93 ctxt->name = NULL; \
94 ret = ctxt->name##Tab[ctxt->name##Nr]; \
95 ctxt->name##Tab[ctxt->name##Nr] = 0; \
96 return(ret); \
97} \
98
Daniel Veillard56a4cb82001-03-24 17:00:36 +000099/* PUSH_AND_POP(static, xmlNodePtr, node) */
100PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000101
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Every macro expects a variable named `ctxt' (the parser context) to be
 * in scope where it is expanded.
 *
 * Byte-level macros.  They look at raw xmlChar values, so they should only
 * be used to compare against, or skip over, ASCII text:
 *
 *   CUR_PTR    the current input pointer
 *   CUR        the xmlChar at the current position, as an int
 *   CURRENT    same expansion as CUR
 *   RAW        -1 while ctxt->token is set, the current xmlChar otherwise
 *   NXT(n)     the xmlChar n positions after the current one
 *   UPPER      the current xmlChar passed through toupper()
 *   UPP(n)     NXT(n) passed through toupper()
 *   SKIP(n)    moves the input pointer and ctxt->nbChars forward by n
 *
 * Character-level macros:
 *
 *   NEXT           xmlNextChar(ctxt), then one more in ctxt->nbChars
 *   NEXTL(l)       moves l bytes forward, clears ctxt->token and keeps the
 *                  line/column counters of the input up to date
 *   CUR_CHAR(l)    htmlCurrentChar(ctxt, &l)
 *   CUR_SCHAR(s,l) xmlStringCurrentChar(ctxt, s, &l)
 *   COPY_BUF(l,b,i,v)
 *                  appends the char v of length l to b at index i and
 *                  advances i
 *
 * Input management:
 *
 *   SHRINK / GROW  xmlParserInputShrink() / xmlParserInputGrow() on the
 *                  current input
 *   SKIP_BLANKS    htmlSkipBlankChars(ctxt)
 *
 * SKIP and NEXT expand to bare comma expressions and COPY_BUF to a bare
 * if/else: use them as full statements only.
 */

/* ---- position and raw access ---- */
#define CUR_PTR ctxt->input->cur

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

/* ---- case-folded access ---- */
#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* ---- moving forward ---- */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

#define NEXT xmlNextChar(ctxt),ctxt->nbChars++

#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;            \
  } while (0)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* ---- input buffer management ---- */
#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* ---- decoded characters ---- */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)                                               \
    if (l == 1) b[i++] = (xmlChar) v;                                   \
    else i += xmlCopyChar(l,&b[i],v)
178
179/**
180 * htmlCurrentChar:
181 * @ctxt: the HTML parser context
182 * @len: pointer to the length of the char read
183 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000184 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000185 * bytes in the input buffer. Implement the end of line normalization:
186 * 2.11 End-of-Line Handling
187 * If the encoding is unspecified, in the case we find an ISO-Latin-1
188 * char, then the encoding converter is plugged in automatically.
189 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000190 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000191 */
192
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000193static int
Owen Taylor3473f882001-02-23 17:55:21 +0000194htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
195 if (ctxt->instate == XML_PARSER_EOF)
196 return(0);
197
198 if (ctxt->token != 0) {
199 *len = 0;
200 return(ctxt->token);
201 }
202 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
203 /*
204 * We are supposed to handle UTF8, check it's valid
205 * From rfc2044: encoding of the Unicode values on UTF-8:
206 *
207 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
208 * 0000 0000-0000 007F 0xxxxxxx
209 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
210 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
211 *
212 * Check for the 0x110000 limit too
213 */
214 const unsigned char *cur = ctxt->input->cur;
215 unsigned char c;
216 unsigned int val;
217
218 c = *cur;
219 if (c & 0x80) {
220 if (cur[1] == 0)
221 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
222 if ((cur[1] & 0xc0) != 0x80)
223 goto encoding_error;
224 if ((c & 0xe0) == 0xe0) {
225
226 if (cur[2] == 0)
227 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
228 if ((cur[2] & 0xc0) != 0x80)
229 goto encoding_error;
230 if ((c & 0xf0) == 0xf0) {
231 if (cur[3] == 0)
232 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
233 if (((c & 0xf8) != 0xf0) ||
234 ((cur[3] & 0xc0) != 0x80))
235 goto encoding_error;
236 /* 4-byte code */
237 *len = 4;
238 val = (cur[0] & 0x7) << 18;
239 val |= (cur[1] & 0x3f) << 12;
240 val |= (cur[2] & 0x3f) << 6;
241 val |= cur[3] & 0x3f;
242 } else {
243 /* 3-byte code */
244 *len = 3;
245 val = (cur[0] & 0xf) << 12;
246 val |= (cur[1] & 0x3f) << 6;
247 val |= cur[2] & 0x3f;
248 }
249 } else {
250 /* 2-byte code */
251 *len = 2;
252 val = (cur[0] & 0x1f) << 6;
253 val |= cur[1] & 0x3f;
254 }
255 if (!IS_CHAR(val)) {
256 ctxt->errNo = XML_ERR_INVALID_ENCODING;
257 if ((ctxt->sax != NULL) &&
258 (ctxt->sax->error != NULL))
259 ctxt->sax->error(ctxt->userData,
260 "Char 0x%X out of allowed range\n", val);
261 ctxt->wellFormed = 0;
262 ctxt->disableSAX = 1;
263 }
264 return(val);
265 } else {
266 /* 1-byte code */
267 *len = 1;
268 return((int) *ctxt->input->cur);
269 }
270 }
271 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000272 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000273 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000274 * XML constructs only use < 128 chars
275 */
276 *len = 1;
277 if ((int) *ctxt->input->cur < 0x80)
278 return((int) *ctxt->input->cur);
279
280 /*
281 * Humm this is bad, do an automatic flow conversion
282 */
283 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
284 ctxt->charset = XML_CHAR_ENCODING_UTF8;
285 return(xmlCurrentChar(ctxt, len));
286
287encoding_error:
288 /*
289 * If we detect an UTF8 error that probably mean that the
290 * input encoding didn't get properly advertized in the
291 * declaration header. Report the error and switch the encoding
292 * to ISO-Latin-1 (if you don't like this policy, just declare the
293 * encoding !)
294 */
295 ctxt->errNo = XML_ERR_INVALID_ENCODING;
296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
297 ctxt->sax->error(ctxt->userData,
298 "Input is not proper UTF-8, indicate encoding !\n");
299 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
300 ctxt->input->cur[0], ctxt->input->cur[1],
301 ctxt->input->cur[2], ctxt->input->cur[3]);
302 }
303
304 ctxt->charset = XML_CHAR_ENCODING_8859_1;
305 *len = 1;
306 return((int) *ctxt->input->cur);
307}
308
309/**
Owen Taylor3473f882001-02-23 17:55:21 +0000310 * htmlSkipBlankChars:
311 * @ctxt: the HTML parser context
312 *
313 * skip all blanks character found at that point in the input streams.
314 *
315 * Returns the number of space chars skipped
316 */
317
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000318static int
Owen Taylor3473f882001-02-23 17:55:21 +0000319htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
320 int res = 0;
321
322 while (IS_BLANK(*(ctxt->input->cur))) {
323 if ((*ctxt->input->cur == 0) &&
324 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
325 xmlPopInput(ctxt);
326 } else {
327 if (*(ctxt->input->cur) == '\n') {
328 ctxt->input->line++; ctxt->input->col = 1;
329 } else ctxt->input->col++;
330 ctxt->input->cur++;
331 ctxt->nbChars++;
332 if (*ctxt->input->cur == 0)
333 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
334 }
335 res++;
336 }
337 return(res);
338}
339
340
341
342/************************************************************************
343 * *
344 * The list of HTML elements and their properties *
345 * *
346 ************************************************************************/
347
348/*
349 * Start Tag: 1 means the start tag can be ommited
350 * End Tag: 1 means the end tag can be ommited
351 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000352 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000353 * Depr: this element is deprecated
354 * DTD: 1 means that this element is valid only in the Loose DTD
355 * 2 means that this element is valid only in the Frameset DTD
356 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000357 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
Daniel Veillard22090732001-07-16 00:06:07 +0000359static const htmlElemDesc
360html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000361{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
362{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
363{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
364{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
365{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
366{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
367{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
368{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
369{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
370{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
371{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
372{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
373{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
374{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
375{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
376{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
377{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
378{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
379{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
380{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
381{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
382{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
383{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
384{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
385{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
386{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
387{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
388{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
389{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
390{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
391{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
392{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
393{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
394{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
395{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
400{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
401{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
402{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
403{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
404{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
405{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
406{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
407{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
408{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
409{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
410{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
411{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
412{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
413{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
414{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
415{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
416{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
417{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
418{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
419{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
420{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
421{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
422{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
423{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
424{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
425{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
426{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
427{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
428{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
429{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
430{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
431{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
432{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
433{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
434{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
435{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
436{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
437{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
438{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
439{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
440{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
441{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
442{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
443{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
444{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
445{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
446{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
447{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
448{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
449{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
450{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
451{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000452};
453
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups followed by one final
 * NULL.  The first string of a group is a start tag; the remaining strings
 * are the open elements that this start tag closes implicitly (see
 * htmlCheckAutoClose()).  htmlInitAutoClose() records where each group
 * begins.
 */
static const char *htmlStartClose[] = {
"form",
    "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
    "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
    NULL,
"head",
    "p", NULL,
"title",
    "p", NULL,
"body",
    "head", "style", "link", "title", "p", NULL,
"li",
    "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
    "listing", "xmp", "head", "li", NULL,
"hr",
    "p", "head", NULL,
"h1",
    "p", "head", NULL,
"h2",
    "p", "head", NULL,
"h3",
    "p", "head", NULL,
"h4",
    "p", "head", NULL,
"h5",
    "p", "head", NULL,
"h6",
    "p", "head", NULL,
"dir",
    "p", "head", NULL,
"address",
    "p", "head", "ul", NULL,
"pre",
    "p", "head", "ul", NULL,
"listing",
    "p", "head", NULL,
"xmp",
    "p", "head", NULL,
"blockquote",
    "p", "head", NULL,
"dl",
    "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp", "head",
    NULL,
"dt",
    "p", "menu", "dir", "address", "pre", "listing", "xmp", "head", "dd",
    NULL,
"dd",
    "p", "menu", "dir", "address", "pre", "listing", "xmp", "head", "dt",
    NULL,
"ul",
    "p", "head", "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
"ol",
    "p", "head", "ul", NULL,
"menu",
    "p", "head", "ul", NULL,
"p",
    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",
    "p", "head", NULL,
"noscript",
    "p", "head", NULL,
"center",
    "font", "b", "i", "p", "head", NULL,
"a",
    "a", NULL,
"caption",
    "p", NULL,
"colgroup",
    "caption", "colgroup", "col", "p", NULL,
"col",
    "caption", "col", "p", NULL,
"table",
    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
    "xmp", "a", NULL,
"th",
    "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",
    "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",
    "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",
    "caption", "col", "colgroup", NULL,
"tfoot",
    "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody", "p",
    NULL,
"tbody",
    "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
    "tbody", "p", NULL,
"optgroup",
    "option", NULL,
"option",
    "option", NULL,
"fieldset",
    "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
    "listing", "xmp", "a", NULL,
NULL
};
513
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
527
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event attributes of HTML 4).
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The form reset event attribute is spelled "onreset"; the entry used to
 * read "onrest", which no document attribute can match.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
553
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Elements with a non-default priority.  The entry with a NULL name ends
 * the table and carries the priority used for every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000581
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group.  Filled by htmlInitAutoClose(); the flag records
 * that this has been done.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
584
585/************************************************************************
586 * *
587 * functions to handle HTML specific data *
588 * *
589 ************************************************************************/
590
591/**
592 * htmlInitAutoClose:
593 *
594 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
595 * This is not reentrant. Call xmlInitParser() once before processing in
596 * case of use in multithreaded programs.
597 */
598void
599htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000600 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000601
602 if (htmlStartCloseIndexinitialized) return;
603
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000604 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
605 indx = 0;
606 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
607 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000608 while (htmlStartClose[i] != NULL) i++;
609 i++;
610 }
611 htmlStartCloseIndexinitialized = 1;
612}
613
614/**
615 * htmlTagLookup:
616 * @tag: The tag name in lowercase
617 *
618 * Lookup the HTML tag in the ElementTable
619 *
620 * Returns the related htmlElemDescPtr or NULL if not found.
621 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000622const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000623htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000624 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000625
626 for (i = 0; i < (sizeof(html40ElementTable) /
627 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000628 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000629 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000630 }
631 return(NULL);
632}
633
634/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000635 * htmlGetEndPriority:
636 * @name: The name of the element to look up the priority for.
637 *
638 * Return value: The "endtag" priority.
639 **/
640static int
641htmlGetEndPriority (const xmlChar *name) {
642 int i = 0;
643
644 while ((htmlEndPriority[i].name != NULL) &&
645 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
646 i++;
647
648 return(htmlEndPriority[i].priority);
649}
650
651/**
Owen Taylor3473f882001-02-23 17:55:21 +0000652 * htmlCheckAutoClose:
653 * @newtag: The new tag name
654 * @oldtag: The old tag name
655 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000656 * Checks whether the new tag is one of the registered valid tags for
657 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000658 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
659 *
660 * Returns 0 if no, 1 if yes.
661 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000662static int
Owen Taylor3473f882001-02-23 17:55:21 +0000663htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000664 int i, indx;
665 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000666
667 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
668
669 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000670 for (indx = 0; indx < 100;indx++) {
671 closed = htmlStartCloseIndex[indx];
672 if (closed == NULL) return(0);
673 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000674 }
675
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000676 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000677 i++;
678 while (htmlStartClose[i] != NULL) {
679 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
680 return(1);
681 }
682 i++;
683 }
684 return(0);
685}
686
687/**
688 * htmlAutoCloseOnClose:
689 * @ctxt: an HTML parser context
690 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000691 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000692 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000693 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000694 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000695static void
Owen Taylor3473f882001-02-23 17:55:21 +0000696htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000697 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000698 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000699 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000700
701#ifdef DEBUG
702 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
703 for (i = 0;i < ctxt->nameNr;i++)
704 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
705#endif
706
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000707 priority = htmlGetEndPriority (newtag);
708
Owen Taylor3473f882001-02-23 17:55:21 +0000709 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000710
Owen Taylor3473f882001-02-23 17:55:21 +0000711 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000712 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000713 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000714 * or equal priority, so if we find an element with higher
715 * priority before we find an element with
716 * matching name, we just ignore this endtag
717 */
718 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000719 }
720 if (i < 0) return;
721
722 while (!xmlStrEqual(newtag, ctxt->name)) {
723 info = htmlTagLookup(ctxt->name);
724 if ((info == NULL) || (info->endTag == 1)) {
725#ifdef DEBUG
726 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
727#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000728 } else if (info->endTag == 3) {
729#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000730 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000731
Daniel Veillard56098d42001-04-24 12:51:09 +0000732#endif
733 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
734 ctxt->sax->error(ctxt->userData,
735 "Opening and ending tag mismatch: %s and %s\n",
736 newtag, ctxt->name);
737 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000738 }
739 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
740 ctxt->sax->endElement(ctxt->userData, ctxt->name);
741 oldname = htmlnamePop(ctxt);
742 if (oldname != NULL) {
743#ifdef DEBUG
744 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
745#endif
746 xmlFree(oldname);
747 }
748 }
749}
750
751/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000752 * htmlAutoCloseOnEnd:
753 * @ctxt: an HTML parser context
754 *
755 * Close all remaining tags at the end of the stream
756 */
757static void
758htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
759 xmlChar *oldname;
760 int i;
761
762 if (ctxt->nameNr == 0)
763 return;
764#ifdef DEBUG
765 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
766#endif
767
768 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
769#ifdef DEBUG
770 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
771#endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776#ifdef DEBUG
777 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
778#endif
779 xmlFree(oldname);
780 }
781 }
782}
783
784/**
Owen Taylor3473f882001-02-23 17:55:21 +0000785 * htmlAutoClose:
786 * @ctxt: an HTML parser context
787 * @newtag: The new tag name or NULL
788 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000789 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000790 * The list is kept in htmlStartClose array. This function is
791 * called when a new tag has been detected and generates the
792 * appropriates closes if possible/needed.
793 * If newtag is NULL this mean we are at the end of the resource
794 * and we should check
795 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000796static void
Owen Taylor3473f882001-02-23 17:55:21 +0000797htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
798 xmlChar *oldname;
799 while ((newtag != NULL) && (ctxt->name != NULL) &&
800 (htmlCheckAutoClose(newtag, ctxt->name))) {
801#ifdef DEBUG
802 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
803#endif
804 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
805 ctxt->sax->endElement(ctxt->userData, ctxt->name);
806 oldname = htmlnamePop(ctxt);
807 if (oldname != NULL) {
808#ifdef DEBUG
809 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
810#endif
811 xmlFree(oldname);
812 }
813 }
814 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000815 htmlAutoCloseOnEnd(ctxt);
816 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000817 }
818 while ((newtag == NULL) && (ctxt->name != NULL) &&
819 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
820 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
821 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
822#ifdef DEBUG
823 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
824#endif
825 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
826 ctxt->sax->endElement(ctxt->userData, ctxt->name);
827 oldname = htmlnamePop(ctxt);
828 if (oldname != NULL) {
829#ifdef DEBUG
830 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
831#endif
832 xmlFree(oldname);
833 }
834 }
835
836}
837
838/**
839 * htmlAutoCloseTag:
840 * @doc: the HTML document
841 * @name: The tag name
842 * @elem: the HTML element
843 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000844 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000845 * The list is kept in htmlStartClose array. This function checks
846 * if the element or one of it's children would autoclose the
847 * given tag.
848 *
849 * Returns 1 if autoclose, 0 otherwise
850 */
851int
852htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
853 htmlNodePtr child;
854
855 if (elem == NULL) return(1);
856 if (xmlStrEqual(name, elem->name)) return(0);
857 if (htmlCheckAutoClose(elem->name, name)) return(1);
858 child = elem->children;
859 while (child != NULL) {
860 if (htmlAutoCloseTag(doc, name, child)) return(1);
861 child = child->next;
862 }
863 return(0);
864}
865
866/**
867 * htmlIsAutoClosed:
868 * @doc: the HTML document
869 * @elem: the HTML element
870 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000871 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000872 * The list is kept in htmlStartClose array. This function checks
873 * if a tag is autoclosed by one of it's child
874 *
875 * Returns 1 if autoclosed, 0 otherwise
876 */
877int
878htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
879 htmlNodePtr child;
880
881 if (elem == NULL) return(1);
882 child = elem->children;
883 while (child != NULL) {
884 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
885 child = child->next;
886 }
887 return(0);
888}
889
890/**
891 * htmlCheckImplied:
892 * @ctxt: an HTML parser context
893 * @newtag: The new tag name
894 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000895 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000896 * called when a new tag has been detected and generates the
897 * appropriates implicit tags if missing
898 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000899static void
Owen Taylor3473f882001-02-23 17:55:21 +0000900htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
901 if (!htmlOmittedDefaultValue)
902 return;
903 if (xmlStrEqual(newtag, BAD_CAST"html"))
904 return;
905 if (ctxt->nameNr <= 0) {
906#ifdef DEBUG
907 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
908#endif
909 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
910 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
911 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
912 }
913 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
914 return;
915 if ((ctxt->nameNr <= 1) &&
916 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
917 (xmlStrEqual(newtag, BAD_CAST"style")) ||
918 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
919 (xmlStrEqual(newtag, BAD_CAST"link")) ||
920 (xmlStrEqual(newtag, BAD_CAST"title")) ||
921 (xmlStrEqual(newtag, BAD_CAST"base")))) {
922 /*
923 * dropped OBJECT ... i you put it first BODY will be
924 * assumed !
925 */
926#ifdef DEBUG
927 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
928#endif
929 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
930 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
931 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
932 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
933 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
934 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
935 int i;
936 for (i = 0;i < ctxt->nameNr;i++) {
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
938 return;
939 }
940 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
941 return;
942 }
943 }
944
945#ifdef DEBUG
946 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
947#endif
948 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
949 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
950 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
951 }
952}
953
954/**
955 * htmlCheckParagraph
956 * @ctxt: an HTML parser context
957 *
958 * Check whether a p element need to be implied before inserting
959 * characters in the current element.
960 *
961 * Returns 1 if a paragraph has been inserted, 0 if not and -1
962 * in case of error.
963 */
964
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000965static int
Owen Taylor3473f882001-02-23 17:55:21 +0000966htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
967 const xmlChar *tag;
968 int i;
969
970 if (ctxt == NULL)
971 return(-1);
972 tag = ctxt->name;
973 if (tag == NULL) {
974 htmlAutoClose(ctxt, BAD_CAST"p");
975 htmlCheckImplied(ctxt, BAD_CAST"p");
976 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
977 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
978 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
979 return(1);
980 }
981 if (!htmlOmittedDefaultValue)
982 return(0);
983 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
984 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
985#ifdef DEBUG
986 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
987#endif
988 htmlAutoClose(ctxt, BAD_CAST"p");
989 htmlCheckImplied(ctxt, BAD_CAST"p");
990 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
991 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
992 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
993 return(1);
994 }
995 }
996 return(0);
997}
998
999/**
1000 * htmlIsScriptAttribute:
1001 * @name: an attribute name
1002 *
1003 * Check if an attribute is of content type Script
1004 *
1005 * Returns 1 is the attribute is a script 0 otherwise
1006 */
1007int
1008htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001009 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001010
1011 if (name == NULL)
1012 return(0);
1013 /*
1014 * all script attributes start with 'on'
1015 */
1016 if ((name[0] != 'o') || (name[1] != 'n'))
1017 return(0);
1018 for (i = 0;
1019 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1020 i++) {
1021 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1022 return(1);
1023 }
1024 return(0);
1025}
1026
1027/************************************************************************
1028 * *
1029 * The list of HTML predefined entities *
1030 * *
1031 ************************************************************************/
1032
1033
Daniel Veillard22090732001-07-16 00:06:07 +00001034static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001035/*
1036 * the 4 absolute ones, plus apostrophe.
1037 */
1038{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1039{ 38, "amp", "ampersand, U+0026 ISOnum" },
1040{ 39, "apos", "single quote" },
1041{ 60, "lt", "less-than sign, U+003C ISOnum" },
1042{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1043
1044/*
1045 * A bunch still in the 128-255 range
1046 * Replacing them depend really on the charset used.
1047 */
1048{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1049{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1050{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1051{ 163, "pound","pound sign, U+00A3 ISOnum" },
1052{ 164, "curren","currency sign, U+00A4 ISOnum" },
1053{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1054{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1055{ 167, "sect", "section sign, U+00A7 ISOnum" },
1056{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1057{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1058{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1059{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1060{ 172, "not", "not sign, U+00AC ISOnum" },
1061{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1062{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1063{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1064{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1065{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1066{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1067{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1068{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1069{ 181, "micro","micro sign, U+00B5 ISOnum" },
1070{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1071{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1072{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1073{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1074{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1075{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1076{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1077{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1078{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1079{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1080{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1081{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1082{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1083{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1084{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1085{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1086{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1087{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1088{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1089{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1090{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1091{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1092{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1093{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1094{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1095{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1096{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1097{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1098{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1099{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1100{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1101{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1102{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1103{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1104{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1105{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1106{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1107{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1108{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1109{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1110{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1111{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1112{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1113{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1114{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1115{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1116{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1117{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1118{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1119{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1120{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1121{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1122{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1123{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1124{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1125{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1126{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1127{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1128{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1129{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1130{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1131{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1132{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1133{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1134{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1135{ 247, "divide","division sign, U+00F7 ISOnum" },
1136{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1137{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1138{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1139{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1140{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1141{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1142{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1143{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1144
1145{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1146{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1147{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1148{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1149{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1150
1151/*
1152 * Anything below should really be kept as entities references
1153 */
1154{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1155
1156{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1157{ 732, "tilde","small tilde, U+02DC ISOdia" },
1158
1159{ 913, "Alpha","greek capital letter alpha, U+0391" },
1160{ 914, "Beta", "greek capital letter beta, U+0392" },
1161{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1162{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1163{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1164{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1165{ 919, "Eta", "greek capital letter eta, U+0397" },
1166{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1167{ 921, "Iota", "greek capital letter iota, U+0399" },
1168{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001169{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001170{ 924, "Mu", "greek capital letter mu, U+039C" },
1171{ 925, "Nu", "greek capital letter nu, U+039D" },
1172{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1173{ 927, "Omicron","greek capital letter omicron, U+039F" },
1174{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1175{ 929, "Rho", "greek capital letter rho, U+03A1" },
1176{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1177{ 932, "Tau", "greek capital letter tau, U+03A4" },
1178{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1179{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1180{ 935, "Chi", "greek capital letter chi, U+03A7" },
1181{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1182{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1183
1184{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1185{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1186{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1187{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1188{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1189{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1190{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1191{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1192{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1193{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1194{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1195{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1196{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1197{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1198{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1199{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1200{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1201{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1202{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1203{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1204{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1205{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1206{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1207{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1208{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1209{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1210{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1211{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1212
1213{ 8194, "ensp", "en space, U+2002 ISOpub" },
1214{ 8195, "emsp", "em space, U+2003 ISOpub" },
1215{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1216{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1217{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1218{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1219{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1220{ 8211, "ndash","en dash, U+2013 ISOpub" },
1221{ 8212, "mdash","em dash, U+2014 ISOpub" },
1222{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1223{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1224{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1225{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1226{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1227{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1228{ 8224, "dagger","dagger, U+2020 ISOpub" },
1229{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1230
1231{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1232{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1233
1234{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1235
1236{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1237{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1238
1239{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1240{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1241
1242{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1243{ 8260, "frasl","fraction slash, U+2044 NEW" },
1244
1245{ 8364, "euro", "euro sign, U+20AC NEW" },
1246
1247{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1248{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1249{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1250{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1251{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1252{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1253{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1254{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1255{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1256{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1257{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1258{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1259{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1260{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1261{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1262{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1263
1264{ 8704, "forall","for all, U+2200 ISOtech" },
1265{ 8706, "part", "partial differential, U+2202 ISOtech" },
1266{ 8707, "exist","there exists, U+2203 ISOtech" },
1267{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1268{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1269{ 8712, "isin", "element of, U+2208 ISOtech" },
1270{ 8713, "notin","not an element of, U+2209 ISOtech" },
1271{ 8715, "ni", "contains as member, U+220B ISOtech" },
1272{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001273{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001274{ 8722, "minus","minus sign, U+2212 ISOtech" },
1275{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1276{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1277{ 8733, "prop", "proportional to, U+221D ISOtech" },
1278{ 8734, "infin","infinity, U+221E ISOtech" },
1279{ 8736, "ang", "angle, U+2220 ISOamso" },
1280{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1281{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1282{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1283{ 8746, "cup", "union = cup, U+222A ISOtech" },
1284{ 8747, "int", "integral, U+222B ISOtech" },
1285{ 8756, "there4","therefore, U+2234 ISOtech" },
1286{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1287{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1288{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1289{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1290{ 8801, "equiv","identical to, U+2261 ISOtech" },
1291{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1292{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1293{ 8834, "sub", "subset of, U+2282 ISOtech" },
1294{ 8835, "sup", "superset of, U+2283 ISOtech" },
1295{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1296{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1297{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1298{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1299{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1300{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1301{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1302{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1303{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1304{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1305{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1306{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1307{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1308{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1309
1310{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1311{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1312{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1313{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1314
1315};
1316
1317/************************************************************************
1318 * *
1319 * Commodity functions to handle entities *
1320 * *
1321 ************************************************************************/
1322
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> accordingly.  The result
 * of xmlRealloc() goes through a temporary so that, on failure, the
 * original allocation is still reachable and can be released instead of
 * being leaked.  On failure the enclosing function returns NULL, so the
 * macro can only be used in functions returning a pointer.
 * NOTE(review): assumes <buffer> is owned by the enclosing function and
 * is not referenced elsewhere once that function returns NULL - confirm
 * against the call sites.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,                     \
                             buffer##_size * sizeof(xmlChar));          \
    if (growBuffer_tmp == NULL) {                                       \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
1334
1335/**
1336 * htmlEntityLookup:
1337 * @name: the entity name
1338 *
1339 * Lookup the given entity in EntitiesTable
1340 *
1341 * TODO: the linear scan is really ugly, an hash table is really needed.
1342 *
1343 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1344 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001345const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001346htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001347 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001348
1349 for (i = 0;i < (sizeof(html40EntitiesTable)/
1350 sizeof(html40EntitiesTable[0]));i++) {
1351 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1352#ifdef DEBUG
1353 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1354#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001355 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001356 }
1357 }
1358 return(NULL);
1359}
1360
1361/**
1362 * htmlEntityValueLookup:
1363 * @value: the entity's unicode value
1364 *
1365 * Lookup the given entity in EntitiesTable
1366 *
1367 * TODO: the linear scan is really ugly, an hash table is really needed.
1368 *
1369 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1370 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001371const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001372htmlEntityValueLookup(unsigned int value) {
1373 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001374#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001375 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001376#endif
1377
1378 for (i = 0;i < (sizeof(html40EntitiesTable)/
1379 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001380 if (html40EntitiesTable[i].value >= value) {
1381 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001382 break;
1383#ifdef DEBUG
1384 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1385#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001386 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001387 }
1388#ifdef DEBUG
1389 if (lv > html40EntitiesTable[i].value) {
1390 xmlGenericError(xmlGenericErrorContext,
1391 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1392 lv, html40EntitiesTable[i].value);
1393 }
1394 lv = html40EntitiesTable[i].value;
1395#endif
1396 }
1397 return(NULL);
1398}
1399
1400/**
1401 * UTF8ToHtml:
1402 * @out: a pointer to an array of bytes to store the result
1403 * @outlen: the length of @out
1404 * @in: a pointer to an array of UTF-8 chars
1405 * @inlen: the length of @in
1406 *
1407 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1408 * plus HTML entities block of chars out.
1409 *
1410 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1411 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001412 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001413 * The value of @outlen after return is the number of octets consumed.
1414 */
1415int
1416UTF8ToHtml(unsigned char* out, int *outlen,
1417 const unsigned char* in, int *inlen) {
1418 const unsigned char* processed = in;
1419 const unsigned char* outend;
1420 const unsigned char* outstart = out;
1421 const unsigned char* instart = in;
1422 const unsigned char* inend;
1423 unsigned int c, d;
1424 int trailing;
1425
1426 if (in == NULL) {
1427 /*
1428 * initialization nothing to do
1429 */
1430 *outlen = 0;
1431 *inlen = 0;
1432 return(0);
1433 }
1434 inend = in + (*inlen);
1435 outend = out + (*outlen);
1436 while (in < inend) {
1437 d = *in++;
1438 if (d < 0x80) { c= d; trailing= 0; }
1439 else if (d < 0xC0) {
1440 /* trailing byte in leading position */
1441 *outlen = out - outstart;
1442 *inlen = processed - instart;
1443 return(-2);
1444 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1445 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1446 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1447 else {
1448 /* no chance for this in Ascii */
1449 *outlen = out - outstart;
1450 *inlen = processed - instart;
1451 return(-2);
1452 }
1453
1454 if (inend - in < trailing) {
1455 break;
1456 }
1457
1458 for ( ; trailing; trailing--) {
1459 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1460 break;
1461 c <<= 6;
1462 c |= d & 0x3F;
1463 }
1464
1465 /* assertion: c is a single UTF-4 value */
1466 if (c < 0x80) {
1467 if (out + 1 >= outend)
1468 break;
1469 *out++ = c;
1470 } else {
1471 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001472 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001473
1474 /*
1475 * Try to lookup a predefined HTML entity for it
1476 */
1477
1478 ent = htmlEntityValueLookup(c);
1479 if (ent == NULL) {
1480 /* no chance for this in Ascii */
1481 *outlen = out - outstart;
1482 *inlen = processed - instart;
1483 return(-2);
1484 }
1485 len = strlen(ent->name);
1486 if (out + 2 + len >= outend)
1487 break;
1488 *out++ = '&';
1489 memcpy(out, ent->name, len);
1490 out += len;
1491 *out++ = ';';
1492 }
1493 processed = in;
1494 }
1495 *outlen = out - outstart;
1496 *inlen = processed - instart;
1497 return(0);
1498}
1499
1500/**
1501 * htmlEncodeEntities:
1502 * @out: a pointer to an array of bytes to store the result
1503 * @outlen: the length of @out
1504 * @in: a pointer to an array of UTF-8 chars
1505 * @inlen: the length of @in
1506 * @quoteChar: the quote character to escape (' or ") or zero.
1507 *
1508 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1509 * plus HTML entities block of chars out.
1510 *
1511 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1512 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001513 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001514 * The value of @outlen after return is the number of octets consumed.
1515 */
1516int
1517htmlEncodeEntities(unsigned char* out, int *outlen,
1518 const unsigned char* in, int *inlen, int quoteChar) {
1519 const unsigned char* processed = in;
1520 const unsigned char* outend = out + (*outlen);
1521 const unsigned char* outstart = out;
1522 const unsigned char* instart = in;
1523 const unsigned char* inend = in + (*inlen);
1524 unsigned int c, d;
1525 int trailing;
1526
1527 while (in < inend) {
1528 d = *in++;
1529 if (d < 0x80) { c= d; trailing= 0; }
1530 else if (d < 0xC0) {
1531 /* trailing byte in leading position */
1532 *outlen = out - outstart;
1533 *inlen = processed - instart;
1534 return(-2);
1535 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1536 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1537 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1538 else {
1539 /* no chance for this in Ascii */
1540 *outlen = out - outstart;
1541 *inlen = processed - instart;
1542 return(-2);
1543 }
1544
1545 if (inend - in < trailing)
1546 break;
1547
1548 while (trailing--) {
1549 if (((d= *in++) & 0xC0) != 0x80) {
1550 *outlen = out - outstart;
1551 *inlen = processed - instart;
1552 return(-2);
1553 }
1554 c <<= 6;
1555 c |= d & 0x3F;
1556 }
1557
1558 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001559 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1560 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001561 if (out >= outend)
1562 break;
1563 *out++ = c;
1564 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001565 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001566 const char *cp;
1567 char nbuf[16];
1568 int len;
1569
1570 /*
1571 * Try to lookup a predefined HTML entity for it
1572 */
1573 ent = htmlEntityValueLookup(c);
1574 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001575 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001576 cp = nbuf;
1577 }
1578 else
1579 cp = ent->name;
1580 len = strlen(cp);
1581 if (out + 2 + len > outend)
1582 break;
1583 *out++ = '&';
1584 memcpy(out, cp, len);
1585 out += len;
1586 *out++ = ';';
1587 }
1588 processed = in;
1589 }
1590 *outlen = out - outstart;
1591 *inlen = processed - instart;
1592 return(0);
1593}
1594
1595/**
1596 * htmlDecodeEntities:
1597 * @ctxt: the parser context
1598 * @len: the len to decode (in bytes !), -1 for no size limit
1599 * @end: an end marker xmlChar, 0 if none
1600 * @end2: an end marker xmlChar, 0 if none
1601 * @end3: an end marker xmlChar, 0 if none
1602 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001603 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001604 *
1605 * DEPRECATED !!!!
1606 *
1607 * Returns A newly allocated string with the substitution done. The caller
1608 * must deallocate it !
1609 */
1610xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001611htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1612 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001613 static int deprecated = 0;
1614 if (!deprecated) {
1615 xmlGenericError(xmlGenericErrorContext,
1616 "htmlDecodeEntities() deprecated function reached\n");
1617 deprecated = 1;
1618 }
1619 return(NULL);
1620#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001621 xmlChar *name = NULL;
1622 xmlChar *buffer = NULL;
1623 unsigned int buffer_size = 0;
1624 unsigned int nbchars = 0;
1625 htmlEntityDescPtr ent;
1626 unsigned int max = (unsigned int) len;
1627 int c,l;
1628
1629 if (ctxt->depth > 40) {
1630 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1631 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1632 ctxt->sax->error(ctxt->userData,
1633 "Detected entity reference loop\n");
1634 ctxt->wellFormed = 0;
1635 ctxt->disableSAX = 1;
1636 return(NULL);
1637 }
1638
1639 /*
1640 * allocate a translation buffer.
1641 */
1642 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1643 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1644 if (buffer == NULL) {
1645 perror("xmlDecodeEntities: malloc failed");
1646 return(NULL);
1647 }
1648
1649 /*
1650 * Ok loop until we reach one of the ending char or a size limit.
1651 */
1652 c = CUR_CHAR(l);
1653 while ((nbchars < max) && (c != end) &&
1654 (c != end2) && (c != end3)) {
1655
1656 if (c == 0) break;
1657 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1658 int val = htmlParseCharRef(ctxt);
1659 COPY_BUF(0,buffer,nbchars,val);
1660 NEXTL(l);
1661 } else if ((c == '&') && (ctxt->token != '&')) {
1662 ent = htmlParseEntityRef(ctxt, &name);
1663 if (name != NULL) {
1664 if (ent != NULL) {
1665 int val = ent->value;
1666 COPY_BUF(0,buffer,nbchars,val);
1667 NEXTL(l);
1668 } else {
1669 const xmlChar *cur = name;
1670
1671 buffer[nbchars++] = '&';
1672 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1673 growBuffer(buffer);
1674 }
1675 while (*cur != 0) {
1676 buffer[nbchars++] = *cur++;
1677 }
1678 buffer[nbchars++] = ';';
1679 }
1680 }
1681 } else {
1682 COPY_BUF(l,buffer,nbchars,c);
1683 NEXTL(l);
1684 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1685 growBuffer(buffer);
1686 }
1687 }
1688 c = CUR_CHAR(l);
1689 }
1690 buffer[nbchars++] = 0;
1691 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001692#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001693}
1694
1695/************************************************************************
1696 * *
1697 * Commodity functions to handle streams *
1698 * *
1699 ************************************************************************/
1700
1701/**
Owen Taylor3473f882001-02-23 17:55:21 +00001702 * htmlNewInputStream:
1703 * @ctxt: an HTML parser context
1704 *
1705 * Create a new input stream structure
1706 * Returns the new input stream or NULL
1707 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001708static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001709htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1710 htmlParserInputPtr input;
1711
1712 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1713 if (input == NULL) {
1714 ctxt->errNo = XML_ERR_NO_MEMORY;
1715 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1716 ctxt->sax->error(ctxt->userData,
1717 "malloc: couldn't allocate a new input stream\n");
1718 return(NULL);
1719 }
1720 memset(input, 0, sizeof(htmlParserInput));
1721 input->filename = NULL;
1722 input->directory = NULL;
1723 input->base = NULL;
1724 input->cur = NULL;
1725 input->buf = NULL;
1726 input->line = 1;
1727 input->col = 1;
1728 input->buf = NULL;
1729 input->free = NULL;
1730 input->version = NULL;
1731 input->consumed = 0;
1732 input->length = 0;
1733 return(input);
1734}
1735
1736
1737/************************************************************************
1738 * *
1739 * Commodity functions, cleanup needed ? *
1740 * *
1741 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but I don't want to risk any
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001756
1757/**
1758 * areBlanks:
1759 * @ctxt: an HTML parser context
1760 * @str: a xmlChar *
1761 * @len: the size of @str
1762 *
1763 * Is this a sequence of blank chars that one can ignore ?
1764 *
1765 * Returns 1 if ignorable 0 otherwise.
1766 */
1767
1768static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001769 unsigned int i;
1770 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001771 xmlNodePtr lastChild;
1772
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001773 for (j = 0;j < len;j++)
1774 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001775
1776 if (CUR == 0) return(1);
1777 if (CUR != '<') return(0);
1778 if (ctxt->name == NULL)
1779 return(1);
1780 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1781 return(1);
1782 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1783 return(1);
1784 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1785 return(1);
1786 if (ctxt->node == NULL) return(0);
1787 lastChild = xmlGetLastChild(ctxt->node);
1788 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001789 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1790 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001791 /* keep ws in constructs like ...<b> </b>...
1792 for all tags "b" allowing PCDATA */
1793 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1794 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1795 return(0);
1796 }
1797 }
Owen Taylor3473f882001-02-23 17:55:21 +00001798 } else if (xmlNodeIsText(lastChild)) {
1799 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001800 } else {
1801 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1802 for all tags "p" allowing PCDATA */
1803 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1804 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1805 return(0);
1806 }
1807 }
Owen Taylor3473f882001-02-23 17:55:21 +00001808 }
1809 return(1);
1810}
1811
1812/**
Owen Taylor3473f882001-02-23 17:55:21 +00001813 * htmlNewDocNoDtD:
1814 * @URI: URI for the dtd, or NULL
1815 * @ExternalID: the external ID of the DTD, or NULL
1816 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001817 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1818 * are NULL
1819 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001820 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001821 */
1822htmlDocPtr
1823htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1824 xmlDocPtr cur;
1825
1826 /*
1827 * Allocate a new document and fill the fields.
1828 */
1829 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1830 if (cur == NULL) {
1831 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001832 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001833 return(NULL);
1834 }
1835 memset(cur, 0, sizeof(xmlDoc));
1836
1837 cur->type = XML_HTML_DOCUMENT_NODE;
1838 cur->version = NULL;
1839 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001840 cur->doc = cur;
1841 cur->name = NULL;
1842 cur->children = NULL;
1843 cur->extSubset = NULL;
1844 cur->oldNs = NULL;
1845 cur->encoding = NULL;
1846 cur->standalone = 1;
1847 cur->compression = 0;
1848 cur->ids = NULL;
1849 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001850 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001851 if ((ExternalID != NULL) ||
1852 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001853 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001854 return(cur);
1855}
1856
1857/**
1858 * htmlNewDoc:
1859 * @URI: URI for the dtd, or NULL
1860 * @ExternalID: the external ID of the DTD, or NULL
1861 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001862 * Creates a new HTML document
1863 *
Owen Taylor3473f882001-02-23 17:55:21 +00001864 * Returns a new document
1865 */
1866htmlDocPtr
1867htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1868 if ((URI == NULL) && (ExternalID == NULL))
1869 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001870 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1871 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001872
1873 return(htmlNewDocNoDtD(URI, ExternalID));
1874}
1875
1876
1877/************************************************************************
1878 * *
1879 * The parser itself *
1880 * Relates to http://www.w3.org/TR/html40 *
1881 * *
1882 ************************************************************************/
1883
1884/************************************************************************
1885 * *
1886 * The parser itself *
1887 * *
1888 ************************************************************************/
1889
1890/**
1891 * htmlParseHTMLName:
1892 * @ctxt: an HTML parser context
1893 *
1894 * parse an HTML tag or attribute name, note that we convert it to lowercase
1895 * since HTML names are not case-sensitive.
1896 *
1897 * Returns the Tag Name parsed or NULL
1898 */
1899
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001900static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001901htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1902 xmlChar *ret = NULL;
1903 int i = 0;
1904 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1905
1906 if (!IS_LETTER(CUR) && (CUR != '_') &&
1907 (CUR != ':')) return(NULL);
1908
1909 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1910 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1911 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1912 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1913 else loc[i] = CUR;
1914 i++;
1915
1916 NEXT;
1917 }
1918
1919 ret = xmlStrndup(loc, i);
1920
1921 return(ret);
1922}
1923
1924/**
1925 * htmlParseName:
1926 * @ctxt: an HTML parser context
1927 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001928 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001929 *
1930 * Returns the Name parsed or NULL
1931 */
1932
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001933static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001934htmlParseName(htmlParserCtxtPtr ctxt) {
1935 xmlChar buf[HTML_MAX_NAMELEN];
1936 int len = 0;
1937
1938 GROW;
1939 if (!IS_LETTER(CUR) && (CUR != '_')) {
1940 return(NULL);
1941 }
1942
1943 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1944 (CUR == '.') || (CUR == '-') ||
1945 (CUR == '_') || (CUR == ':') ||
1946 (IS_COMBINING(CUR)) ||
1947 (IS_EXTENDER(CUR))) {
1948 buf[len++] = CUR;
1949 NEXT;
1950 if (len >= HTML_MAX_NAMELEN) {
1951 xmlGenericError(xmlGenericErrorContext,
1952 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1953 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1954 (CUR == '.') || (CUR == '-') ||
1955 (CUR == '_') || (CUR == ':') ||
1956 (IS_COMBINING(CUR)) ||
1957 (IS_EXTENDER(CUR)))
1958 NEXT;
1959 break;
1960 }
1961 }
1962 return(xmlStrndup(buf, len));
1963}
1964
1965/**
1966 * htmlParseHTMLAttribute:
1967 * @ctxt: an HTML parser context
1968 * @stop: a char stop value
1969 *
1970 * parse an HTML attribute value till the stop (quote), if
1971 * stop is 0 then it stops at the first space
1972 *
1973 * Returns the attribute parsed or NULL
1974 */
1975
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001976static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001977htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1978 xmlChar *buffer = NULL;
1979 int buffer_size = 0;
1980 xmlChar *out = NULL;
1981 xmlChar *name = NULL;
1982
1983 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001984 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001985
1986 /*
1987 * allocate a translation buffer.
1988 */
1989 buffer_size = HTML_PARSER_BUFFER_SIZE;
1990 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1991 if (buffer == NULL) {
1992 perror("htmlParseHTMLAttribute: malloc failed");
1993 return(NULL);
1994 }
1995 out = buffer;
1996
1997 /*
1998 * Ok loop until we reach one of the ending chars
1999 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002000 while ((CUR != 0) && (CUR != stop)) {
2001 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002002 if ((stop == 0) && (IS_BLANK(CUR))) break;
2003 if (CUR == '&') {
2004 if (NXT(1) == '#') {
2005 unsigned int c;
2006 int bits;
2007
2008 c = htmlParseCharRef(ctxt);
2009 if (c < 0x80)
2010 { *out++ = c; bits= -6; }
2011 else if (c < 0x800)
2012 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2013 else if (c < 0x10000)
2014 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2015 else
2016 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2017
2018 for ( ; bits >= 0; bits-= 6) {
2019 *out++ = ((c >> bits) & 0x3F) | 0x80;
2020 }
2021 } else {
2022 ent = htmlParseEntityRef(ctxt, &name);
2023 if (name == NULL) {
2024 *out++ = '&';
2025 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002026 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002027
2028 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002029 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002030 }
2031 } else if (ent == NULL) {
2032 *out++ = '&';
2033 cur = name;
2034 while (*cur != 0) {
2035 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002036 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002037
2038 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002039 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002040 }
2041 *out++ = *cur++;
2042 }
2043 xmlFree(name);
2044 } else {
2045 unsigned int c;
2046 int bits;
2047
2048 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002049 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002050
2051 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002052 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002053 }
2054 c = (xmlChar)ent->value;
2055 if (c < 0x80)
2056 { *out++ = c; bits= -6; }
2057 else if (c < 0x800)
2058 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2059 else if (c < 0x10000)
2060 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2061 else
2062 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2063
2064 for ( ; bits >= 0; bits-= 6) {
2065 *out++ = ((c >> bits) & 0x3F) | 0x80;
2066 }
2067 xmlFree(name);
2068 }
2069 }
2070 } else {
2071 unsigned int c;
2072 int bits, l;
2073
2074 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002075 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002076
2077 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002078 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002079 }
2080 c = CUR_CHAR(l);
2081 if (c < 0x80)
2082 { *out++ = c; bits= -6; }
2083 else if (c < 0x800)
2084 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2085 else if (c < 0x10000)
2086 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2087 else
2088 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2089
2090 for ( ; bits >= 0; bits-= 6) {
2091 *out++ = ((c >> bits) & 0x3F) | 0x80;
2092 }
2093 NEXT;
2094 }
2095 }
2096 *out++ = 0;
2097 return(buffer);
2098}
2099
2100/**
Owen Taylor3473f882001-02-23 17:55:21 +00002101 * htmlParseEntityRef:
2102 * @ctxt: an HTML parser context
2103 * @str: location to store the entity name
2104 *
2105 * parse an HTML ENTITY references
2106 *
2107 * [68] EntityRef ::= '&' Name ';'
2108 *
2109 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2110 * if non-NULL *str will have to be freed by the caller.
2111 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002112const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002113htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2114 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002115 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002116 *str = NULL;
2117
2118 if (CUR == '&') {
2119 NEXT;
2120 name = htmlParseName(ctxt);
2121 if (name == NULL) {
2122 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2123 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2124 ctxt->wellFormed = 0;
2125 } else {
2126 GROW;
2127 if (CUR == ';') {
2128 *str = name;
2129
2130 /*
2131 * Lookup the entity in the table.
2132 */
2133 ent = htmlEntityLookup(name);
2134 if (ent != NULL) /* OK that's ugly !!! */
2135 NEXT;
2136 } else {
2137 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2138 ctxt->sax->error(ctxt->userData,
2139 "htmlParseEntityRef: expecting ';'\n");
2140 *str = name;
2141 }
2142 }
2143 }
2144 return(ent);
2145}
2146
2147/**
2148 * htmlParseAttValue:
2149 * @ctxt: an HTML parser context
2150 *
2151 * parse a value for an attribute
2152 * Note: the parser won't do substitution of entities here, this
2153 * will be handled later in xmlStringGetNodeList, unless it was
2154 * asked for ctxt->replaceEntities != 0
2155 *
2156 * Returns the AttValue parsed or NULL.
2157 */
2158
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002159static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002160htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2161 xmlChar *ret = NULL;
2162
2163 if (CUR == '"') {
2164 NEXT;
2165 ret = htmlParseHTMLAttribute(ctxt, '"');
2166 if (CUR != '"') {
2167 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2168 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2169 ctxt->wellFormed = 0;
2170 } else
2171 NEXT;
2172 } else if (CUR == '\'') {
2173 NEXT;
2174 ret = htmlParseHTMLAttribute(ctxt, '\'');
2175 if (CUR != '\'') {
2176 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2177 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2178 ctxt->wellFormed = 0;
2179 } else
2180 NEXT;
2181 } else {
2182 /*
2183 * That's an HTMLism, the attribute value may not be quoted
2184 */
2185 ret = htmlParseHTMLAttribute(ctxt, 0);
2186 if (ret == NULL) {
2187 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2188 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2189 ctxt->wellFormed = 0;
2190 }
2191 }
2192 return(ret);
2193}
2194
2195/**
2196 * htmlParseSystemLiteral:
2197 * @ctxt: an HTML parser context
2198 *
2199 * parse an HTML Literal
2200 *
2201 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2202 *
2203 * Returns the SystemLiteral parsed or NULL
2204 */
2205
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002206static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002207htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2208 const xmlChar *q;
2209 xmlChar *ret = NULL;
2210
2211 if (CUR == '"') {
2212 NEXT;
2213 q = CUR_PTR;
2214 while ((IS_CHAR(CUR)) && (CUR != '"'))
2215 NEXT;
2216 if (!IS_CHAR(CUR)) {
2217 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2218 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2219 ctxt->wellFormed = 0;
2220 } else {
2221 ret = xmlStrndup(q, CUR_PTR - q);
2222 NEXT;
2223 }
2224 } else if (CUR == '\'') {
2225 NEXT;
2226 q = CUR_PTR;
2227 while ((IS_CHAR(CUR)) && (CUR != '\''))
2228 NEXT;
2229 if (!IS_CHAR(CUR)) {
2230 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2231 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2232 ctxt->wellFormed = 0;
2233 } else {
2234 ret = xmlStrndup(q, CUR_PTR - q);
2235 NEXT;
2236 }
2237 } else {
2238 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2239 ctxt->sax->error(ctxt->userData,
2240 "SystemLiteral \" or ' expected\n");
2241 ctxt->wellFormed = 0;
2242 }
2243
2244 return(ret);
2245}
2246
2247/**
2248 * htmlParsePubidLiteral:
2249 * @ctxt: an HTML parser context
2250 *
2251 * parse an HTML public literal
2252 *
2253 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2254 *
2255 * Returns the PubidLiteral parsed or NULL.
2256 */
2257
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002258static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002259htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2260 const xmlChar *q;
2261 xmlChar *ret = NULL;
2262 /*
2263 * Name ::= (Letter | '_') (NameChar)*
2264 */
2265 if (CUR == '"') {
2266 NEXT;
2267 q = CUR_PTR;
2268 while (IS_PUBIDCHAR(CUR)) NEXT;
2269 if (CUR != '"') {
2270 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2271 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2272 ctxt->wellFormed = 0;
2273 } else {
2274 ret = xmlStrndup(q, CUR_PTR - q);
2275 NEXT;
2276 }
2277 } else if (CUR == '\'') {
2278 NEXT;
2279 q = CUR_PTR;
2280 while ((IS_LETTER(CUR)) && (CUR != '\''))
2281 NEXT;
2282 if (!IS_LETTER(CUR)) {
2283 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2284 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2285 ctxt->wellFormed = 0;
2286 } else {
2287 ret = xmlStrndup(q, CUR_PTR - q);
2288 NEXT;
2289 }
2290 } else {
2291 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2292 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2293 ctxt->wellFormed = 0;
2294 }
2295
2296 return(ret);
2297}
2298
2299/**
2300 * htmlParseScript:
2301 * @ctxt: an HTML parser context
2302 *
2303 * parse the content of an HTML SCRIPT or STYLE element
2304 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2305 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2306 * http://www.w3.org/TR/html4/types.html#type-script
2307 * http://www.w3.org/TR/html4/types.html#h-6.15
2308 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2309 *
2310 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2311 * element and the value of intrinsic event attributes. User agents must
2312 * not evaluate script data as HTML markup but instead must pass it on as
2313 * data to a script engine.
2314 * NOTES:
2315 * - The content is passed like CDATA
2316 * - the attributes for style and scripting "onXXX" are also described
2317 * as CDATA but SGML allows entities references in attributes so their
2318 * processing is identical as other attributes
2319 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002320static void
Owen Taylor3473f882001-02-23 17:55:21 +00002321htmlParseScript(htmlParserCtxtPtr ctxt) {
2322 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2323 int nbchar = 0;
2324 xmlChar cur;
2325
2326 SHRINK;
2327 cur = CUR;
2328 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002329 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2330 (NXT(3) == '-')) {
2331 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2332 if (ctxt->sax->cdataBlock!= NULL) {
2333 /*
2334 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2335 */
2336 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2337 }
2338 }
2339 nbchar = 0;
2340 htmlParseComment(ctxt);
2341 cur = CUR;
2342 continue;
2343 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002344 /*
2345 * One should break here, the specification is clear:
2346 * Authors should therefore escape "</" within the content.
2347 * Escape mechanisms are specific to each scripting or
2348 * style sheet language.
2349 */
2350 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2351 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2352 break; /* while */
2353 }
2354 buf[nbchar++] = cur;
2355 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2356 if (ctxt->sax->cdataBlock!= NULL) {
2357 /*
2358 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2359 */
2360 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2361 }
2362 nbchar = 0;
2363 }
2364 NEXT;
2365 cur = CUR;
2366 }
2367 if (!(IS_CHAR(cur))) {
2368 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2369 ctxt->sax->error(ctxt->userData,
2370 "Invalid char in CDATA 0x%X\n", cur);
2371 ctxt->wellFormed = 0;
2372 NEXT;
2373 }
2374
2375 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2376 if (ctxt->sax->cdataBlock!= NULL) {
2377 /*
2378 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2379 */
2380 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2381 }
2382 }
2383}
2384
2385
2386/**
2387 * htmlParseCharData:
2388 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002389 *
2390 * parse a CharData section.
2391 * if we are within a CDATA section ']]>' marks an end of section.
2392 *
2393 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2394 */
2395
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002396static void
2397htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002398 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2399 int nbchar = 0;
2400 int cur, l;
2401
2402 SHRINK;
2403 cur = CUR_CHAR(l);
2404 while (((cur != '<') || (ctxt->token == '<')) &&
2405 ((cur != '&') || (ctxt->token == '&')) &&
2406 (IS_CHAR(cur))) {
2407 COPY_BUF(l,buf,nbchar,cur);
2408 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2409 /*
2410 * Ok the segment is to be consumed as chars.
2411 */
2412 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2413 if (areBlanks(ctxt, buf, nbchar)) {
2414 if (ctxt->sax->ignorableWhitespace != NULL)
2415 ctxt->sax->ignorableWhitespace(ctxt->userData,
2416 buf, nbchar);
2417 } else {
2418 htmlCheckParagraph(ctxt);
2419 if (ctxt->sax->characters != NULL)
2420 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2421 }
2422 }
2423 nbchar = 0;
2424 }
2425 NEXTL(l);
2426 cur = CUR_CHAR(l);
2427 }
2428 if (nbchar != 0) {
2429 /*
2430 * Ok the segment is to be consumed as chars.
2431 */
2432 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2433 if (areBlanks(ctxt, buf, nbchar)) {
2434 if (ctxt->sax->ignorableWhitespace != NULL)
2435 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2436 } else {
2437 htmlCheckParagraph(ctxt);
2438 if (ctxt->sax->characters != NULL)
2439 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2440 }
2441 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002442 } else {
2443 /*
2444 * Loop detection
2445 */
2446 if (cur == 0)
2447 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002448 }
2449}
2450
2451/**
2452 * htmlParseExternalID:
2453 * @ctxt: an HTML parser context
2454 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002455 *
2456 * Parse an External ID or a Public ID
2457 *
Owen Taylor3473f882001-02-23 17:55:21 +00002458 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2459 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2460 *
2461 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2462 *
2463 * Returns the function returns SystemLiteral and in the second
2464 * case publicID receives PubidLiteral, is strict is off
2465 * it is possible to return NULL and have publicID set.
2466 */
2467
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002468static xmlChar *
2469htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002470 xmlChar *URI = NULL;
2471
2472 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2473 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2474 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2475 SKIP(6);
2476 if (!IS_BLANK(CUR)) {
2477 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2478 ctxt->sax->error(ctxt->userData,
2479 "Space required after 'SYSTEM'\n");
2480 ctxt->wellFormed = 0;
2481 }
2482 SKIP_BLANKS;
2483 URI = htmlParseSystemLiteral(ctxt);
2484 if (URI == NULL) {
2485 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2486 ctxt->sax->error(ctxt->userData,
2487 "htmlParseExternalID: SYSTEM, no URI\n");
2488 ctxt->wellFormed = 0;
2489 }
2490 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2491 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2492 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2493 SKIP(6);
2494 if (!IS_BLANK(CUR)) {
2495 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2496 ctxt->sax->error(ctxt->userData,
2497 "Space required after 'PUBLIC'\n");
2498 ctxt->wellFormed = 0;
2499 }
2500 SKIP_BLANKS;
2501 *publicID = htmlParsePubidLiteral(ctxt);
2502 if (*publicID == NULL) {
2503 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2504 ctxt->sax->error(ctxt->userData,
2505 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2506 ctxt->wellFormed = 0;
2507 }
2508 SKIP_BLANKS;
2509 if ((CUR == '"') || (CUR == '\'')) {
2510 URI = htmlParseSystemLiteral(ctxt);
2511 }
2512 }
2513 return(URI);
2514}
2515
2516/**
2517 * htmlParseComment:
2518 * @ctxt: an HTML parser context
2519 *
2520 * Parse an XML (SGML) comment <!-- .... -->
2521 *
2522 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2523 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002524static void
Owen Taylor3473f882001-02-23 17:55:21 +00002525htmlParseComment(htmlParserCtxtPtr ctxt) {
2526 xmlChar *buf = NULL;
2527 int len;
2528 int size = HTML_PARSER_BUFFER_SIZE;
2529 int q, ql;
2530 int r, rl;
2531 int cur, l;
2532 xmlParserInputState state;
2533
2534 /*
2535 * Check that there is a comment right here.
2536 */
2537 if ((RAW != '<') || (NXT(1) != '!') ||
2538 (NXT(2) != '-') || (NXT(3) != '-')) return;
2539
2540 state = ctxt->instate;
2541 ctxt->instate = XML_PARSER_COMMENT;
2542 SHRINK;
2543 SKIP(4);
2544 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2545 if (buf == NULL) {
2546 xmlGenericError(xmlGenericErrorContext,
2547 "malloc of %d byte failed\n", size);
2548 ctxt->instate = state;
2549 return;
2550 }
2551 q = CUR_CHAR(ql);
2552 NEXTL(ql);
2553 r = CUR_CHAR(rl);
2554 NEXTL(rl);
2555 cur = CUR_CHAR(l);
2556 len = 0;
2557 while (IS_CHAR(cur) &&
2558 ((cur != '>') ||
2559 (r != '-') || (q != '-'))) {
2560 if (len + 5 >= size) {
2561 size *= 2;
2562 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2563 if (buf == NULL) {
2564 xmlGenericError(xmlGenericErrorContext,
2565 "realloc of %d byte failed\n", size);
2566 ctxt->instate = state;
2567 return;
2568 }
2569 }
2570 COPY_BUF(ql,buf,len,q);
2571 q = r;
2572 ql = rl;
2573 r = cur;
2574 rl = l;
2575 NEXTL(l);
2576 cur = CUR_CHAR(l);
2577 if (cur == 0) {
2578 SHRINK;
2579 GROW;
2580 cur = CUR_CHAR(l);
2581 }
2582 }
2583 buf[len] = 0;
2584 if (!IS_CHAR(cur)) {
2585 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2586 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2587 ctxt->sax->error(ctxt->userData,
2588 "Comment not terminated \n<!--%.50s\n", buf);
2589 ctxt->wellFormed = 0;
2590 xmlFree(buf);
2591 } else {
2592 NEXT;
2593 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2594 (!ctxt->disableSAX))
2595 ctxt->sax->comment(ctxt->userData, buf);
2596 xmlFree(buf);
2597 }
2598 ctxt->instate = state;
2599}
2600
2601/**
2602 * htmlParseCharRef:
2603 * @ctxt: an HTML parser context
2604 *
2605 * parse Reference declarations
2606 *
2607 * [66] CharRef ::= '&#' [0-9]+ ';' |
2608 * '&#x' [0-9a-fA-F]+ ';'
2609 *
2610 * Returns the value parsed (as an int)
2611 */
2612int
2613htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2614 int val = 0;
2615
2616 if ((CUR == '&') && (NXT(1) == '#') &&
2617 (NXT(2) == 'x')) {
2618 SKIP(3);
2619 while (CUR != ';') {
2620 if ((CUR >= '0') && (CUR <= '9'))
2621 val = val * 16 + (CUR - '0');
2622 else if ((CUR >= 'a') && (CUR <= 'f'))
2623 val = val * 16 + (CUR - 'a') + 10;
2624 else if ((CUR >= 'A') && (CUR <= 'F'))
2625 val = val * 16 + (CUR - 'A') + 10;
2626 else {
2627 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2628 ctxt->sax->error(ctxt->userData,
2629 "htmlParseCharRef: invalid hexadecimal value\n");
2630 ctxt->wellFormed = 0;
2631 return(0);
2632 }
2633 NEXT;
2634 }
2635 if (CUR == ';')
2636 NEXT;
2637 } else if ((CUR == '&') && (NXT(1) == '#')) {
2638 SKIP(2);
2639 while (CUR != ';') {
2640 if ((CUR >= '0') && (CUR <= '9'))
2641 val = val * 10 + (CUR - '0');
2642 else {
2643 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2644 ctxt->sax->error(ctxt->userData,
2645 "htmlParseCharRef: invalid decimal value\n");
2646 ctxt->wellFormed = 0;
2647 return(0);
2648 }
2649 NEXT;
2650 }
2651 if (CUR == ';')
2652 NEXT;
2653 } else {
2654 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2655 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2656 ctxt->wellFormed = 0;
2657 }
2658 /*
2659 * Check the value IS_CHAR ...
2660 */
2661 if (IS_CHAR(val)) {
2662 return(val);
2663 } else {
2664 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2665 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2666 val);
2667 ctxt->wellFormed = 0;
2668 }
2669 return(0);
2670}
2671
2672
2673/**
2674 * htmlParseDocTypeDecl :
2675 * @ctxt: an HTML parser context
2676 *
2677 * parse a DOCTYPE declaration
2678 *
2679 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2680 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2681 */
2682
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002683static void
Owen Taylor3473f882001-02-23 17:55:21 +00002684htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2685 xmlChar *name;
2686 xmlChar *ExternalID = NULL;
2687 xmlChar *URI = NULL;
2688
2689 /*
2690 * We know that '<!DOCTYPE' has been detected.
2691 */
2692 SKIP(9);
2693
2694 SKIP_BLANKS;
2695
2696 /*
2697 * Parse the DOCTYPE name.
2698 */
2699 name = htmlParseName(ctxt);
2700 if (name == NULL) {
2701 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2702 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2703 ctxt->wellFormed = 0;
2704 }
2705 /*
2706 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2707 */
2708
2709 SKIP_BLANKS;
2710
2711 /*
2712 * Check for SystemID and ExternalID
2713 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002714 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002715 SKIP_BLANKS;
2716
2717 /*
2718 * We should be at the end of the DOCTYPE declaration.
2719 */
2720 if (CUR != '>') {
2721 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002722 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002723 ctxt->wellFormed = 0;
2724 /* We shouldn't try to resynchronize ... */
2725 }
2726 NEXT;
2727
2728 /*
2729 * Create or update the document accordingly to the DOCTYPE
2730 */
2731 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2732 (!ctxt->disableSAX))
2733 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2734
2735 /*
2736 * Cleanup, since we don't use all those identifiers
2737 */
2738 if (URI != NULL) xmlFree(URI);
2739 if (ExternalID != NULL) xmlFree(ExternalID);
2740 if (name != NULL) xmlFree(name);
2741}
2742
2743/**
2744 * htmlParseAttribute:
2745 * @ctxt: an HTML parser context
2746 * @value: a xmlChar ** used to store the value of the attribute
2747 *
2748 * parse an attribute
2749 *
2750 * [41] Attribute ::= Name Eq AttValue
2751 *
2752 * [25] Eq ::= S? '=' S?
2753 *
2754 * With namespace:
2755 *
2756 * [NS 11] Attribute ::= QName Eq AttValue
2757 *
2758 * Also the case QName == xmlns:??? is handled independently as a namespace
2759 * definition.
2760 *
2761 * Returns the attribute name, and the value in *value.
2762 */
2763
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002764static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002765htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2766 xmlChar *name, *val = NULL;
2767
2768 *value = NULL;
2769 name = htmlParseHTMLName(ctxt);
2770 if (name == NULL) {
2771 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2772 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2773 ctxt->wellFormed = 0;
2774 return(NULL);
2775 }
2776
2777 /*
2778 * read the value
2779 */
2780 SKIP_BLANKS;
2781 if (CUR == '=') {
2782 NEXT;
2783 SKIP_BLANKS;
2784 val = htmlParseAttValue(ctxt);
2785 /******
2786 } else {
2787 * TODO : some attribute must have values, some may not
2788 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2789 ctxt->sax->warning(ctxt->userData,
2790 "No value for attribute %s\n", name); */
2791 }
2792
2793 *value = val;
2794 return(name);
2795}
2796
2797/**
2798 * htmlCheckEncoding:
2799 * @ctxt: an HTML parser context
2800 * @attvalue: the attribute value
2801 *
2802 * Checks an http-equiv attribute from a Meta tag to detect
2803 * the encoding
2804 * If a new encoding is detected the parser is switched to decode
2805 * it and pass UTF8
2806 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002807static void
Owen Taylor3473f882001-02-23 17:55:21 +00002808htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2809 const xmlChar *encoding;
2810
2811 if ((ctxt == NULL) || (attvalue == NULL))
2812 return;
2813
2814 /* do not change encoding */
2815 if (ctxt->input->encoding != NULL)
2816 return;
2817
2818 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2819 if (encoding != NULL) {
2820 encoding += 8;
2821 } else {
2822 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2823 if (encoding != NULL)
2824 encoding += 9;
2825 }
2826 if (encoding != NULL) {
2827 xmlCharEncoding enc;
2828 xmlCharEncodingHandlerPtr handler;
2829
2830 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2831
2832 if (ctxt->input->encoding != NULL)
2833 xmlFree((xmlChar *) ctxt->input->encoding);
2834 ctxt->input->encoding = xmlStrdup(encoding);
2835
2836 enc = xmlParseCharEncoding((const char *) encoding);
2837 /*
2838 * registered set of known encodings
2839 */
2840 if (enc != XML_CHAR_ENCODING_ERROR) {
2841 xmlSwitchEncoding(ctxt, enc);
2842 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2843 } else {
2844 /*
2845 * fallback for unknown encodings
2846 */
2847 handler = xmlFindCharEncodingHandler((const char *) encoding);
2848 if (handler != NULL) {
2849 xmlSwitchToEncoding(ctxt, handler);
2850 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2851 } else {
2852 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2853 }
2854 }
2855
2856 if ((ctxt->input->buf != NULL) &&
2857 (ctxt->input->buf->encoder != NULL) &&
2858 (ctxt->input->buf->raw != NULL) &&
2859 (ctxt->input->buf->buffer != NULL)) {
2860 int nbchars;
2861 int processed;
2862
2863 /*
2864 * convert as much as possible to the parser reading buffer.
2865 */
2866 processed = ctxt->input->cur - ctxt->input->base;
2867 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2868 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2869 ctxt->input->buf->buffer,
2870 ctxt->input->buf->raw);
2871 if (nbchars < 0) {
2872 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2873 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2874 ctxt->sax->error(ctxt->userData,
2875 "htmlCheckEncoding: encoder error\n");
2876 }
2877 ctxt->input->base =
2878 ctxt->input->cur = ctxt->input->buf->buffer->content;
2879 }
2880 }
2881}
2882
2883/**
2884 * htmlCheckMeta:
2885 * @ctxt: an HTML parser context
2886 * @atts: the attributes values
2887 *
2888 * Checks an attributes from a Meta tag
2889 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002890static void
Owen Taylor3473f882001-02-23 17:55:21 +00002891htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2892 int i;
2893 const xmlChar *att, *value;
2894 int http = 0;
2895 const xmlChar *content = NULL;
2896
2897 if ((ctxt == NULL) || (atts == NULL))
2898 return;
2899
2900 i = 0;
2901 att = atts[i++];
2902 while (att != NULL) {
2903 value = atts[i++];
2904 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2905 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2906 http = 1;
2907 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2908 content = value;
2909 att = atts[i++];
2910 }
2911 if ((http) && (content != NULL))
2912 htmlCheckEncoding(ctxt, content);
2913
2914}
2915
2916/**
2917 * htmlParseStartTag:
2918 * @ctxt: an HTML parser context
2919 *
2920 * parse a start of tag either for rule element or
2921 * EmptyElement. In both case we don't parse the tag closing chars.
2922 *
2923 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2924 *
2925 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2926 *
2927 * With namespace:
2928 *
2929 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2930 *
2931 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2932 *
2933 */
2934
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002935static void
Owen Taylor3473f882001-02-23 17:55:21 +00002936htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2937 xmlChar *name;
2938 xmlChar *attname;
2939 xmlChar *attvalue;
2940 const xmlChar **atts = NULL;
2941 int nbatts = 0;
2942 int maxatts = 0;
2943 int meta = 0;
2944 int i;
2945
2946 if (CUR != '<') return;
2947 NEXT;
2948
2949 GROW;
2950 name = htmlParseHTMLName(ctxt);
2951 if (name == NULL) {
2952 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2953 ctxt->sax->error(ctxt->userData,
2954 "htmlParseStartTag: invalid element name\n");
2955 ctxt->wellFormed = 0;
2956 /* Dump the bogus tag like browsers do */
2957 while ((IS_CHAR(CUR)) && (CUR != '>'))
2958 NEXT;
2959 return;
2960 }
2961 if (xmlStrEqual(name, BAD_CAST"meta"))
2962 meta = 1;
2963
2964 /*
2965 * Check for auto-closure of HTML elements.
2966 */
2967 htmlAutoClose(ctxt, name);
2968
2969 /*
2970 * Check for implied HTML elements.
2971 */
2972 htmlCheckImplied(ctxt, name);
2973
2974 /*
2975 * Avoid html at any level > 0, head at any level != 1
2976 * or any attempt to recurse body
2977 */
2978 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2979 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2980 ctxt->sax->error(ctxt->userData,
2981 "htmlParseStartTag: misplaced <html> tag\n");
2982 ctxt->wellFormed = 0;
2983 xmlFree(name);
2984 return;
2985 }
2986 if ((ctxt->nameNr != 1) &&
2987 (xmlStrEqual(name, BAD_CAST"head"))) {
2988 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2989 ctxt->sax->error(ctxt->userData,
2990 "htmlParseStartTag: misplaced <head> tag\n");
2991 ctxt->wellFormed = 0;
2992 xmlFree(name);
2993 return;
2994 }
2995 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002996 int indx;
2997 for (indx = 0;indx < ctxt->nameNr;indx++) {
2998 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002999 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3000 ctxt->sax->error(ctxt->userData,
3001 "htmlParseStartTag: misplaced <body> tag\n");
3002 ctxt->wellFormed = 0;
3003 xmlFree(name);
3004 return;
3005 }
3006 }
3007 }
3008
3009 /*
3010 * Now parse the attributes, it ends up with the ending
3011 *
3012 * (S Attribute)* S?
3013 */
3014 SKIP_BLANKS;
3015 while ((IS_CHAR(CUR)) &&
3016 (CUR != '>') &&
3017 ((CUR != '/') || (NXT(1) != '>'))) {
3018 long cons = ctxt->nbChars;
3019
3020 GROW;
3021 attname = htmlParseAttribute(ctxt, &attvalue);
3022 if (attname != NULL) {
3023
3024 /*
3025 * Well formedness requires at most one declaration of an attribute
3026 */
3027 for (i = 0; i < nbatts;i += 2) {
3028 if (xmlStrEqual(atts[i], attname)) {
3029 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3030 ctxt->sax->error(ctxt->userData,
3031 "Attribute %s redefined\n",
3032 attname);
3033 ctxt->wellFormed = 0;
3034 xmlFree(attname);
3035 if (attvalue != NULL)
3036 xmlFree(attvalue);
3037 goto failed;
3038 }
3039 }
3040
3041 /*
3042 * Add the pair to atts
3043 */
3044 if (atts == NULL) {
3045 maxatts = 10;
3046 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3047 if (atts == NULL) {
3048 xmlGenericError(xmlGenericErrorContext,
3049 "malloc of %ld byte failed\n",
3050 maxatts * (long)sizeof(xmlChar *));
3051 if (name != NULL) xmlFree(name);
3052 return;
3053 }
3054 } else if (nbatts + 4 > maxatts) {
3055 maxatts *= 2;
3056 atts = (const xmlChar **) xmlRealloc((void *) atts,
3057 maxatts * sizeof(xmlChar *));
3058 if (atts == NULL) {
3059 xmlGenericError(xmlGenericErrorContext,
3060 "realloc of %ld byte failed\n",
3061 maxatts * (long)sizeof(xmlChar *));
3062 if (name != NULL) xmlFree(name);
3063 return;
3064 }
3065 }
3066 atts[nbatts++] = attname;
3067 atts[nbatts++] = attvalue;
3068 atts[nbatts] = NULL;
3069 atts[nbatts + 1] = NULL;
3070 }
3071 else {
3072 /* Dump the bogus attribute string up to the next blank or
3073 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003074 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3075 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003076 NEXT;
3077 }
3078
3079failed:
3080 SKIP_BLANKS;
3081 if (cons == ctxt->nbChars) {
3082 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3083 ctxt->sax->error(ctxt->userData,
3084 "htmlParseStartTag: problem parsing attributes\n");
3085 ctxt->wellFormed = 0;
3086 break;
3087 }
3088 }
3089
3090 /*
3091 * Handle specific association to the META tag
3092 */
3093 if (meta)
3094 htmlCheckMeta(ctxt, atts);
3095
3096 /*
3097 * SAX: Start of Element !
3098 */
3099 htmlnamePush(ctxt, xmlStrdup(name));
3100#ifdef DEBUG
3101 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3102#endif
3103 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3104 ctxt->sax->startElement(ctxt->userData, name, atts);
3105
3106 if (atts != NULL) {
3107 for (i = 0;i < nbatts;i++) {
3108 if (atts[i] != NULL)
3109 xmlFree((xmlChar *) atts[i]);
3110 }
3111 xmlFree((void *) atts);
3112 }
3113 if (name != NULL) xmlFree(name);
3114}
3115
3116/**
3117 * htmlParseEndTag:
3118 * @ctxt: an HTML parser context
3119 *
3120 * parse an end of tag
3121 *
3122 * [42] ETag ::= '</' Name S? '>'
3123 *
3124 * With namespace
3125 *
3126 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003127 *
3128 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003129 */
3130
Daniel Veillardf420ac52001-07-04 16:04:09 +00003131static int
Owen Taylor3473f882001-02-23 17:55:21 +00003132htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3133 xmlChar *name;
3134 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003135 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003136
3137 if ((CUR != '<') || (NXT(1) != '/')) {
3138 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3139 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3140 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003141 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003142 }
3143 SKIP(2);
3144
3145 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003146 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003147
3148 /*
3149 * We should definitely be at the ending "S? '>'" part
3150 */
3151 SKIP_BLANKS;
3152 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3153 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3154 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3155 ctxt->wellFormed = 0;
3156 } else
3157 NEXT;
3158
3159 /*
3160 * If the name read is not one of the element in the parsing stack
3161 * then return, it's just an error.
3162 */
3163 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3164 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3165 }
3166 if (i < 0) {
3167 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3168 ctxt->sax->error(ctxt->userData,
3169 "Unexpected end tag : %s\n", name);
3170 xmlFree(name);
3171 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003172 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003173 }
3174
3175
3176 /*
3177 * Check for auto-closure of HTML elements.
3178 */
3179
3180 htmlAutoCloseOnClose(ctxt, name);
3181
3182 /*
3183 * Well formedness constraints, opening and closing must match.
3184 * With the exception that the autoclose may have popped stuff out
3185 * of the stack.
3186 */
3187 if (!xmlStrEqual(name, ctxt->name)) {
3188#ifdef DEBUG
3189 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3190#endif
3191 if ((ctxt->name != NULL) &&
3192 (!xmlStrEqual(ctxt->name, name))) {
3193 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3194 ctxt->sax->error(ctxt->userData,
3195 "Opening and ending tag mismatch: %s and %s\n",
3196 name, ctxt->name);
3197 ctxt->wellFormed = 0;
3198 }
3199 }
3200
3201 /*
3202 * SAX: End of Tag
3203 */
3204 oldname = ctxt->name;
3205 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3206 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3207 ctxt->sax->endElement(ctxt->userData, name);
3208 oldname = htmlnamePop(ctxt);
3209 if (oldname != NULL) {
3210#ifdef DEBUG
3211 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3212#endif
3213 xmlFree(oldname);
3214#ifdef DEBUG
3215 } else {
3216 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3217#endif
3218 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003219 ret = 1;
3220 } else {
3221 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003222 }
3223
3224 if (name != NULL)
3225 xmlFree(name);
3226
Daniel Veillardf420ac52001-07-04 16:04:09 +00003227 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003228}
3229
3230
3231/**
3232 * htmlParseReference:
3233 * @ctxt: an HTML parser context
3234 *
3235 * parse and handle entity references in content,
3236 * this will end-up in a call to character() since this is either a
3237 * CharRef, or a predefined entity.
3238 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003239static void
Owen Taylor3473f882001-02-23 17:55:21 +00003240htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003241 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003242 xmlChar out[6];
3243 xmlChar *name;
3244 if (CUR != '&') return;
3245
3246 if (NXT(1) == '#') {
3247 unsigned int c;
3248 int bits, i = 0;
3249
3250 c = htmlParseCharRef(ctxt);
3251 if (c == 0)
3252 return;
3253
3254 if (c < 0x80) { out[i++]= c; bits= -6; }
3255 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3256 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3257 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3258
3259 for ( ; bits >= 0; bits-= 6) {
3260 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3261 }
3262 out[i] = 0;
3263
3264 htmlCheckParagraph(ctxt);
3265 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3266 ctxt->sax->characters(ctxt->userData, out, i);
3267 } else {
3268 ent = htmlParseEntityRef(ctxt, &name);
3269 if (name == NULL) {
3270 htmlCheckParagraph(ctxt);
3271 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3272 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3273 return;
3274 }
3275 if ((ent == NULL) || (ent->value <= 0)) {
3276 htmlCheckParagraph(ctxt);
3277 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3278 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3279 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3280 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3281 }
3282 } else {
3283 unsigned int c;
3284 int bits, i = 0;
3285
3286 c = ent->value;
3287 if (c < 0x80)
3288 { out[i++]= c; bits= -6; }
3289 else if (c < 0x800)
3290 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3291 else if (c < 0x10000)
3292 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3293 else
3294 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3295
3296 for ( ; bits >= 0; bits-= 6) {
3297 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3298 }
3299 out[i] = 0;
3300
3301 htmlCheckParagraph(ctxt);
3302 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3303 ctxt->sax->characters(ctxt->userData, out, i);
3304 }
3305 xmlFree(name);
3306 }
3307}
3308
3309/**
3310 * htmlParseContent:
3311 * @ctxt: an HTML parser context
3312 * @name: the node name
3313 *
3314 * Parse a content: comment, sub-element, reference or text.
3315 *
3316 */
3317
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003318static void
Owen Taylor3473f882001-02-23 17:55:21 +00003319htmlParseContent(htmlParserCtxtPtr ctxt) {
3320 xmlChar *currentNode;
3321 int depth;
3322
3323 currentNode = xmlStrdup(ctxt->name);
3324 depth = ctxt->nameNr;
3325 while (1) {
3326 long cons = ctxt->nbChars;
3327
3328 GROW;
3329 /*
3330 * Our tag or one of it's parent or children is ending.
3331 */
3332 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003333 if (htmlParseEndTag(ctxt) &&
3334 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3335 if (currentNode != NULL)
3336 xmlFree(currentNode);
3337 return;
3338 }
3339 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003340 }
3341
3342 /*
3343 * Has this node been popped out during parsing of
3344 * the next element
3345 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003346 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3347 (!xmlStrEqual(currentNode, ctxt->name)))
3348 {
Owen Taylor3473f882001-02-23 17:55:21 +00003349 if (currentNode != NULL) xmlFree(currentNode);
3350 return;
3351 }
3352
Daniel Veillardf9533d12001-03-03 10:04:57 +00003353 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3354 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003355 /*
3356 * Handle SCRIPT/STYLE separately
3357 */
3358 htmlParseScript(ctxt);
3359 } else {
3360 /*
3361 * Sometimes DOCTYPE arrives in the middle of the document
3362 */
3363 if ((CUR == '<') && (NXT(1) == '!') &&
3364 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3365 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3366 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3367 (UPP(8) == 'E')) {
3368 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3369 ctxt->sax->error(ctxt->userData,
3370 "Misplaced DOCTYPE declaration\n");
3371 ctxt->wellFormed = 0;
3372 htmlParseDocTypeDecl(ctxt);
3373 }
3374
3375 /*
3376 * First case : a comment
3377 */
3378 if ((CUR == '<') && (NXT(1) == '!') &&
3379 (NXT(2) == '-') && (NXT(3) == '-')) {
3380 htmlParseComment(ctxt);
3381 }
3382
3383 /*
3384 * Second case : a sub-element.
3385 */
3386 else if (CUR == '<') {
3387 htmlParseElement(ctxt);
3388 }
3389
3390 /*
3391 * Third case : a reference. If if has not been resolved,
3392 * parsing returns it's Name, create the node
3393 */
3394 else if (CUR == '&') {
3395 htmlParseReference(ctxt);
3396 }
3397
3398 /*
3399 * Fourth : end of the resource
3400 */
3401 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003402 htmlAutoCloseOnEnd(ctxt);
3403 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003404 }
3405
3406 /*
3407 * Last case, text. Note that References are handled directly.
3408 */
3409 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003410 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003411 }
3412
3413 if (cons == ctxt->nbChars) {
3414 if (ctxt->node != NULL) {
3415 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3416 ctxt->sax->error(ctxt->userData,
3417 "detected an error in element content\n");
3418 ctxt->wellFormed = 0;
3419 }
3420 break;
3421 }
3422 }
3423 GROW;
3424 }
3425 if (currentNode != NULL) xmlFree(currentNode);
3426}
3427
3428/**
3429 * htmlParseElement:
3430 * @ctxt: an HTML parser context
3431 *
3432 * parse an HTML element, this is highly recursive
3433 *
3434 * [39] element ::= EmptyElemTag | STag content ETag
3435 *
3436 * [41] Attribute ::= Name Eq AttValue
3437 */
3438
3439void
3440htmlParseElement(htmlParserCtxtPtr ctxt) {
3441 xmlChar *name;
3442 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003443 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003444 htmlParserNodeInfo node_info;
3445 xmlChar *oldname;
3446 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003447 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003448
3449 /* Capture start position */
3450 if (ctxt->record_info) {
3451 node_info.begin_pos = ctxt->input->consumed +
3452 (CUR_PTR - ctxt->input->base);
3453 node_info.begin_line = ctxt->input->line;
3454 }
3455
3456 oldname = xmlStrdup(ctxt->name);
3457 htmlParseStartTag(ctxt);
3458 name = ctxt->name;
3459#ifdef DEBUG
3460 if (oldname == NULL)
3461 xmlGenericError(xmlGenericErrorContext,
3462 "Start of element %s\n", name);
3463 else if (name == NULL)
3464 xmlGenericError(xmlGenericErrorContext,
3465 "Start of element failed, was %s\n", oldname);
3466 else
3467 xmlGenericError(xmlGenericErrorContext,
3468 "Start of element %s, was %s\n", name, oldname);
3469#endif
3470 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3471 (name == NULL)) {
3472 if (CUR == '>')
3473 NEXT;
3474 if (oldname != NULL)
3475 xmlFree(oldname);
3476 return;
3477 }
3478 if (oldname != NULL)
3479 xmlFree(oldname);
3480
3481 /*
3482 * Lookup the info for that element.
3483 */
3484 info = htmlTagLookup(name);
3485 if (info == NULL) {
3486 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3487 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3488 name);
3489 ctxt->wellFormed = 0;
3490 } else if (info->depr) {
3491/***************************
3492 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3493 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3494 name);
3495 ***************************/
3496 }
3497
3498 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003499 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003500 */
3501 if ((CUR == '/') && (NXT(1) == '>')) {
3502 SKIP(2);
3503 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3504 ctxt->sax->endElement(ctxt->userData, name);
3505 oldname = htmlnamePop(ctxt);
3506#ifdef DEBUG
3507 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3508#endif
3509 if (oldname != NULL)
3510 xmlFree(oldname);
3511 return;
3512 }
3513
3514 if (CUR == '>') {
3515 NEXT;
3516 } else {
3517 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3518 ctxt->sax->error(ctxt->userData,
3519 "Couldn't find end of Start Tag %s\n",
3520 name);
3521 ctxt->wellFormed = 0;
3522
3523 /*
3524 * end of parsing of this node.
3525 */
3526 if (xmlStrEqual(name, ctxt->name)) {
3527 nodePop(ctxt);
3528 oldname = htmlnamePop(ctxt);
3529#ifdef DEBUG
3530 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3531#endif
3532 if (oldname != NULL)
3533 xmlFree(oldname);
3534 }
3535
3536 /*
3537 * Capture end position and add node
3538 */
3539 if ( currentNode != NULL && ctxt->record_info ) {
3540 node_info.end_pos = ctxt->input->consumed +
3541 (CUR_PTR - ctxt->input->base);
3542 node_info.end_line = ctxt->input->line;
3543 node_info.node = ctxt->node;
3544 xmlParserAddNodeInfo(ctxt, &node_info);
3545 }
3546 return;
3547 }
3548
3549 /*
3550 * Check for an Empty Element from DTD definition
3551 */
3552 if ((info != NULL) && (info->empty)) {
3553 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3554 ctxt->sax->endElement(ctxt->userData, name);
3555 oldname = htmlnamePop(ctxt);
3556#ifdef DEBUG
3557 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3558#endif
3559 if (oldname != NULL)
3560 xmlFree(oldname);
3561 return;
3562 }
3563
3564 /*
3565 * Parse the content of the element:
3566 */
3567 currentNode = xmlStrdup(ctxt->name);
3568 depth = ctxt->nameNr;
3569 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003570 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003571 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003572 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003573 if (ctxt->nameNr < depth) break;
3574 }
3575
Owen Taylor3473f882001-02-23 17:55:21 +00003576 /*
3577 * Capture end position and add node
3578 */
3579 if ( currentNode != NULL && ctxt->record_info ) {
3580 node_info.end_pos = ctxt->input->consumed +
3581 (CUR_PTR - ctxt->input->base);
3582 node_info.end_line = ctxt->input->line;
3583 node_info.node = ctxt->node;
3584 xmlParserAddNodeInfo(ctxt, &node_info);
3585 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003586 if (!IS_CHAR(CUR)) {
3587 htmlAutoCloseOnEnd(ctxt);
3588 }
3589
Owen Taylor3473f882001-02-23 17:55:21 +00003590 if (currentNode != NULL)
3591 xmlFree(currentNode);
3592}
3593
3594/**
3595 * htmlParseDocument :
3596 * @ctxt: an HTML parser context
3597 *
3598 * parse an HTML document (and build a tree if using the standard SAX
3599 * interface).
3600 *
3601 * Returns 0, -1 in case of error. the parser context is augmented
3602 * as a result of the parsing.
3603 */
3604
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003605int
Owen Taylor3473f882001-02-23 17:55:21 +00003606htmlParseDocument(htmlParserCtxtPtr ctxt) {
3607 xmlDtdPtr dtd;
3608
Daniel Veillardd0463562001-10-13 09:15:48 +00003609 xmlInitParser();
3610
Owen Taylor3473f882001-02-23 17:55:21 +00003611 htmlDefaultSAXHandlerInit();
3612 ctxt->html = 1;
3613
3614 GROW;
3615 /*
3616 * SAX: beginning of the document processing.
3617 */
3618 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3619 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3620
3621 /*
3622 * Wipe out everything which is before the first '<'
3623 */
3624 SKIP_BLANKS;
3625 if (CUR == 0) {
3626 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3627 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3628 ctxt->wellFormed = 0;
3629 }
3630
3631 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3632 ctxt->sax->startDocument(ctxt->userData);
3633
3634
3635 /*
3636 * Parse possible comments before any content
3637 */
3638 while ((CUR == '<') && (NXT(1) == '!') &&
3639 (NXT(2) == '-') && (NXT(3) == '-')) {
3640 htmlParseComment(ctxt);
3641 SKIP_BLANKS;
3642 }
3643
3644
3645 /*
3646 * Then possibly doc type declaration(s) and more Misc
3647 * (doctypedecl Misc*)?
3648 */
3649 if ((CUR == '<') && (NXT(1) == '!') &&
3650 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3651 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3652 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3653 (UPP(8) == 'E')) {
3654 htmlParseDocTypeDecl(ctxt);
3655 }
3656 SKIP_BLANKS;
3657
3658 /*
3659 * Parse possible comments before any content
3660 */
3661 while ((CUR == '<') && (NXT(1) == '!') &&
3662 (NXT(2) == '-') && (NXT(3) == '-')) {
3663 htmlParseComment(ctxt);
3664 SKIP_BLANKS;
3665 }
3666
3667 /*
3668 * Time to start parsing the tree itself
3669 */
3670 htmlParseContent(ctxt);
3671
3672 /*
3673 * autoclose
3674 */
3675 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003676 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003677
3678
3679 /*
3680 * SAX: end of the document processing.
3681 */
3682 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3683 ctxt->sax->endDocument(ctxt->userData);
3684
3685 if (ctxt->myDoc != NULL) {
3686 dtd = xmlGetIntSubset(ctxt->myDoc);
3687 if (dtd == NULL)
3688 ctxt->myDoc->intSubset =
3689 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3690 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3691 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3692 }
3693 if (! ctxt->wellFormed) return(-1);
3694 return(0);
3695}
3696
3697
3698/************************************************************************
3699 * *
3700 * Parser contexts handling *
3701 * *
3702 ************************************************************************/
3703
3704/**
3705 * xmlInitParserCtxt:
3706 * @ctxt: an HTML parser context
3707 *
3708 * Initialize a parser context
3709 */
3710
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003711static void
Owen Taylor3473f882001-02-23 17:55:21 +00003712htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3713{
3714 htmlSAXHandler *sax;
3715
3716 if (ctxt == NULL) return;
3717 memset(ctxt, 0, sizeof(htmlParserCtxt));
3718
3719 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3720 if (sax == NULL) {
3721 xmlGenericError(xmlGenericErrorContext,
3722 "htmlInitParserCtxt: out of memory\n");
3723 }
3724 else
3725 memset(sax, 0, sizeof(htmlSAXHandler));
3726
3727 /* Allocate the Input stack */
3728 ctxt->inputTab = (htmlParserInputPtr *)
3729 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3730 if (ctxt->inputTab == NULL) {
3731 xmlGenericError(xmlGenericErrorContext,
3732 "htmlInitParserCtxt: out of memory\n");
3733 ctxt->inputNr = 0;
3734 ctxt->inputMax = 0;
3735 ctxt->input = NULL;
3736 return;
3737 }
3738 ctxt->inputNr = 0;
3739 ctxt->inputMax = 5;
3740 ctxt->input = NULL;
3741 ctxt->version = NULL;
3742 ctxt->encoding = NULL;
3743 ctxt->standalone = -1;
3744 ctxt->instate = XML_PARSER_START;
3745
3746 /* Allocate the Node stack */
3747 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3748 if (ctxt->nodeTab == NULL) {
3749 xmlGenericError(xmlGenericErrorContext,
3750 "htmlInitParserCtxt: out of memory\n");
3751 ctxt->nodeNr = 0;
3752 ctxt->nodeMax = 0;
3753 ctxt->node = NULL;
3754 ctxt->inputNr = 0;
3755 ctxt->inputMax = 0;
3756 ctxt->input = NULL;
3757 return;
3758 }
3759 ctxt->nodeNr = 0;
3760 ctxt->nodeMax = 10;
3761 ctxt->node = NULL;
3762
3763 /* Allocate the Name stack */
3764 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3765 if (ctxt->nameTab == NULL) {
3766 xmlGenericError(xmlGenericErrorContext,
3767 "htmlInitParserCtxt: out of memory\n");
3768 ctxt->nameNr = 0;
3769 ctxt->nameMax = 10;
3770 ctxt->name = NULL;
3771 ctxt->nodeNr = 0;
3772 ctxt->nodeMax = 0;
3773 ctxt->node = NULL;
3774 ctxt->inputNr = 0;
3775 ctxt->inputMax = 0;
3776 ctxt->input = NULL;
3777 return;
3778 }
3779 ctxt->nameNr = 0;
3780 ctxt->nameMax = 10;
3781 ctxt->name = NULL;
3782
3783 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3784 else {
3785 ctxt->sax = sax;
3786 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3787 }
3788 ctxt->userData = ctxt;
3789 ctxt->myDoc = NULL;
3790 ctxt->wellFormed = 1;
3791 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003792 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003793 ctxt->html = 1;
3794 ctxt->record_info = 0;
3795 ctxt->validate = 0;
3796 ctxt->nbChars = 0;
3797 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003798 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003799 xmlInitNodeInfoSeq(&ctxt->node_seq);
3800}
3801
3802/**
3803 * htmlFreeParserCtxt:
3804 * @ctxt: an HTML parser context
3805 *
3806 * Free all the memory used by a parser context. However the parsed
3807 * document in ctxt->myDoc is not freed.
3808 */
3809
3810void
3811htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3812{
3813 xmlFreeParserCtxt(ctxt);
3814}
3815
3816/**
Daniel Veillard1d995272002-07-22 16:43:32 +00003817 * htmlNewParserCtxt:
3818 *
3819 * Allocate and initialize a new parser context.
3820 *
3821 * Returns the xmlParserCtxtPtr or NULL
3822 */
3823
3824static htmlParserCtxtPtr
3825htmlNewParserCtxt(void)
3826{
3827 xmlParserCtxtPtr ctxt;
3828
3829 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3830 if (ctxt == NULL) {
3831 xmlGenericError(xmlGenericErrorContext,
3832 "xmlNewParserCtxt : cannot allocate context\n");
3833 perror("malloc");
3834 return(NULL);
3835 }
3836 memset(ctxt, 0, sizeof(xmlParserCtxt));
3837 htmlInitParserCtxt(ctxt);
3838 return(ctxt);
3839}
3840
3841/**
3842 * htmlCreateMemoryParserCtxt:
3843 * @buffer: a pointer to a char array
3844 * @size: the size of the array
3845 *
3846 * Create a parser context for an HTML in-memory document.
3847 *
3848 * Returns the new parser context or NULL
3849 */
3850static htmlParserCtxtPtr
3851htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3852 xmlParserCtxtPtr ctxt;
3853 xmlParserInputPtr input;
3854 xmlParserInputBufferPtr buf;
3855
3856 if (buffer == NULL)
3857 return(NULL);
3858 if (size <= 0)
3859 return(NULL);
3860
3861 ctxt = htmlNewParserCtxt();
3862 if (ctxt == NULL)
3863 return(NULL);
3864
3865 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3866 if (buf == NULL) return(NULL);
3867
3868 input = xmlNewInputStream(ctxt);
3869 if (input == NULL) {
3870 xmlFreeParserCtxt(ctxt);
3871 return(NULL);
3872 }
3873
3874 input->filename = NULL;
3875 input->buf = buf;
3876 input->base = input->buf->buffer->content;
3877 input->cur = input->buf->buffer->content;
3878 input->end = &input->buf->buffer->content[input->buf->buffer->use];
3879
3880 inputPush(ctxt, input);
3881 return(ctxt);
3882}
3883
3884/**
Owen Taylor3473f882001-02-23 17:55:21 +00003885 * htmlCreateDocParserCtxt :
3886 * @cur: a pointer to an array of xmlChar
3887 * @encoding: a free form C string describing the HTML document encoding, or NULL
3888 *
3889 * Create a parser context for an HTML document.
3890 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003891 * TODO: check the need to add encoding handling there
3892 *
Owen Taylor3473f882001-02-23 17:55:21 +00003893 * Returns the new parser context or NULL
3894 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003895static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003896htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00003897 int len;
Owen Taylor3473f882001-02-23 17:55:21 +00003898
Daniel Veillard1d995272002-07-22 16:43:32 +00003899 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00003900 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00003901 len = xmlStrlen(cur);
3902 return(htmlCreateMemoryParserCtxt((char *)cur, len));
Owen Taylor3473f882001-02-23 17:55:21 +00003903}
3904
3905/************************************************************************
3906 * *
3907 * Progressive parsing interfaces *
3908 * *
3909 ************************************************************************/
3910
3911/**
3912 * htmlParseLookupSequence:
3913 * @ctxt: an HTML parser context
3914 * @first: the first char to lookup
3915 * @next: the next char to lookup or zero
3916 * @third: the next char to lookup or zero
3917 *
3918 * Try to find if a sequence (first, next, third) or just (first next) or
3919 * (first) is available in the input stream.
3920 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3921 * to avoid rescanning sequences of bytes, it DOES change the state of the
3922 * parser, do not use liberally.
3923 * This is basically similar to xmlParseLookupSequence()
3924 *
3925 * Returns the index to the current parsing point if the full sequence
3926 * is available, -1 otherwise.
3927 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003928static int
Owen Taylor3473f882001-02-23 17:55:21 +00003929htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3930 xmlChar next, xmlChar third) {
3931 int base, len;
3932 htmlParserInputPtr in;
3933 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003934 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003935
3936 in = ctxt->input;
3937 if (in == NULL) return(-1);
3938 base = in->cur - in->base;
3939 if (base < 0) return(-1);
3940 if (ctxt->checkIndex > base)
3941 base = ctxt->checkIndex;
3942 if (in->buf == NULL) {
3943 buf = in->base;
3944 len = in->length;
3945 } else {
3946 buf = in->buf->buffer->content;
3947 len = in->buf->buffer->use;
3948 }
3949 /* take into account the sequence length */
3950 if (third) len -= 2;
3951 else if (next) len --;
3952 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003953 if (!incomment && (base + 4 < len)) {
3954 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3955 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3956 incomment = 1;
3957 }
3958 /* do not increment base, some people use <!--> */
3959 }
3960 if (incomment) {
3961 if (base + 3 < len)
3962 return(-1);
3963 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3964 (buf[base + 2] == '>')) {
3965 incomment = 0;
3966 base += 2;
3967 }
3968 continue;
3969 }
Owen Taylor3473f882001-02-23 17:55:21 +00003970 if (buf[base] == first) {
3971 if (third != 0) {
3972 if ((buf[base + 1] != next) ||
3973 (buf[base + 2] != third)) continue;
3974 } else if (next != 0) {
3975 if (buf[base + 1] != next) continue;
3976 }
3977 ctxt->checkIndex = 0;
3978#ifdef DEBUG_PUSH
3979 if (next == 0)
3980 xmlGenericError(xmlGenericErrorContext,
3981 "HPP: lookup '%c' found at %d\n",
3982 first, base);
3983 else if (third == 0)
3984 xmlGenericError(xmlGenericErrorContext,
3985 "HPP: lookup '%c%c' found at %d\n",
3986 first, next, base);
3987 else
3988 xmlGenericError(xmlGenericErrorContext,
3989 "HPP: lookup '%c%c%c' found at %d\n",
3990 first, next, third, base);
3991#endif
3992 return(base - (in->cur - in->base));
3993 }
3994 }
3995 ctxt->checkIndex = base;
3996#ifdef DEBUG_PUSH
3997 if (next == 0)
3998 xmlGenericError(xmlGenericErrorContext,
3999 "HPP: lookup '%c' failed\n", first);
4000 else if (third == 0)
4001 xmlGenericError(xmlGenericErrorContext,
4002 "HPP: lookup '%c%c' failed\n", first, next);
4003 else
4004 xmlGenericError(xmlGenericErrorContext,
4005 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4006#endif
4007 return(-1);
4008}
4009
4010/**
4011 * htmlParseTryOrFinish:
4012 * @ctxt: an HTML parser context
4013 * @terminate: last chunk indicator
4014 *
4015 * Try to progress on parsing
4016 *
4017 * Returns zero if no parsing was possible
4018 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004019static int
Owen Taylor3473f882001-02-23 17:55:21 +00004020htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4021 int ret = 0;
4022 htmlParserInputPtr in;
4023 int avail = 0;
4024 xmlChar cur, next;
4025
4026#ifdef DEBUG_PUSH
4027 switch (ctxt->instate) {
4028 case XML_PARSER_EOF:
4029 xmlGenericError(xmlGenericErrorContext,
4030 "HPP: try EOF\n"); break;
4031 case XML_PARSER_START:
4032 xmlGenericError(xmlGenericErrorContext,
4033 "HPP: try START\n"); break;
4034 case XML_PARSER_MISC:
4035 xmlGenericError(xmlGenericErrorContext,
4036 "HPP: try MISC\n");break;
4037 case XML_PARSER_COMMENT:
4038 xmlGenericError(xmlGenericErrorContext,
4039 "HPP: try COMMENT\n");break;
4040 case XML_PARSER_PROLOG:
4041 xmlGenericError(xmlGenericErrorContext,
4042 "HPP: try PROLOG\n");break;
4043 case XML_PARSER_START_TAG:
4044 xmlGenericError(xmlGenericErrorContext,
4045 "HPP: try START_TAG\n");break;
4046 case XML_PARSER_CONTENT:
4047 xmlGenericError(xmlGenericErrorContext,
4048 "HPP: try CONTENT\n");break;
4049 case XML_PARSER_CDATA_SECTION:
4050 xmlGenericError(xmlGenericErrorContext,
4051 "HPP: try CDATA_SECTION\n");break;
4052 case XML_PARSER_END_TAG:
4053 xmlGenericError(xmlGenericErrorContext,
4054 "HPP: try END_TAG\n");break;
4055 case XML_PARSER_ENTITY_DECL:
4056 xmlGenericError(xmlGenericErrorContext,
4057 "HPP: try ENTITY_DECL\n");break;
4058 case XML_PARSER_ENTITY_VALUE:
4059 xmlGenericError(xmlGenericErrorContext,
4060 "HPP: try ENTITY_VALUE\n");break;
4061 case XML_PARSER_ATTRIBUTE_VALUE:
4062 xmlGenericError(xmlGenericErrorContext,
4063 "HPP: try ATTRIBUTE_VALUE\n");break;
4064 case XML_PARSER_DTD:
4065 xmlGenericError(xmlGenericErrorContext,
4066 "HPP: try DTD\n");break;
4067 case XML_PARSER_EPILOG:
4068 xmlGenericError(xmlGenericErrorContext,
4069 "HPP: try EPILOG\n");break;
4070 case XML_PARSER_PI:
4071 xmlGenericError(xmlGenericErrorContext,
4072 "HPP: try PI\n");break;
4073 case XML_PARSER_SYSTEM_LITERAL:
4074 xmlGenericError(xmlGenericErrorContext,
4075 "HPP: try SYSTEM_LITERAL\n");break;
4076 }
4077#endif
4078
4079 while (1) {
4080
4081 in = ctxt->input;
4082 if (in == NULL) break;
4083 if (in->buf == NULL)
4084 avail = in->length - (in->cur - in->base);
4085 else
4086 avail = in->buf->buffer->use - (in->cur - in->base);
4087 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004088 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004089 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4090 /*
4091 * SAX: end of the document processing.
4092 */
4093 ctxt->instate = XML_PARSER_EOF;
4094 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4095 ctxt->sax->endDocument(ctxt->userData);
4096 }
4097 }
4098 if (avail < 1)
4099 goto done;
4100 switch (ctxt->instate) {
4101 case XML_PARSER_EOF:
4102 /*
4103 * Document parsing is done !
4104 */
4105 goto done;
4106 case XML_PARSER_START:
4107 /*
4108 * Very first chars read from the document flow.
4109 */
4110 cur = in->cur[0];
4111 if (IS_BLANK(cur)) {
4112 SKIP_BLANKS;
4113 if (in->buf == NULL)
4114 avail = in->length - (in->cur - in->base);
4115 else
4116 avail = in->buf->buffer->use - (in->cur - in->base);
4117 }
4118 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4119 ctxt->sax->setDocumentLocator(ctxt->userData,
4120 &xmlDefaultSAXLocator);
4121 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4122 (!ctxt->disableSAX))
4123 ctxt->sax->startDocument(ctxt->userData);
4124
4125 cur = in->cur[0];
4126 next = in->cur[1];
4127 if ((cur == '<') && (next == '!') &&
4128 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4129 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4130 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4131 (UPP(8) == 'E')) {
4132 if ((!terminate) &&
4133 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4134 goto done;
4135#ifdef DEBUG_PUSH
4136 xmlGenericError(xmlGenericErrorContext,
4137 "HPP: Parsing internal subset\n");
4138#endif
4139 htmlParseDocTypeDecl(ctxt);
4140 ctxt->instate = XML_PARSER_PROLOG;
4141#ifdef DEBUG_PUSH
4142 xmlGenericError(xmlGenericErrorContext,
4143 "HPP: entering PROLOG\n");
4144#endif
4145 } else {
4146 ctxt->instate = XML_PARSER_MISC;
4147 }
4148#ifdef DEBUG_PUSH
4149 xmlGenericError(xmlGenericErrorContext,
4150 "HPP: entering MISC\n");
4151#endif
4152 break;
4153 case XML_PARSER_MISC:
4154 SKIP_BLANKS;
4155 if (in->buf == NULL)
4156 avail = in->length - (in->cur - in->base);
4157 else
4158 avail = in->buf->buffer->use - (in->cur - in->base);
4159 if (avail < 2)
4160 goto done;
4161 cur = in->cur[0];
4162 next = in->cur[1];
4163 if ((cur == '<') && (next == '!') &&
4164 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4165 if ((!terminate) &&
4166 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4167 goto done;
4168#ifdef DEBUG_PUSH
4169 xmlGenericError(xmlGenericErrorContext,
4170 "HPP: Parsing Comment\n");
4171#endif
4172 htmlParseComment(ctxt);
4173 ctxt->instate = XML_PARSER_MISC;
4174 } else if ((cur == '<') && (next == '!') &&
4175 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4176 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4177 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4178 (UPP(8) == 'E')) {
4179 if ((!terminate) &&
4180 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4181 goto done;
4182#ifdef DEBUG_PUSH
4183 xmlGenericError(xmlGenericErrorContext,
4184 "HPP: Parsing internal subset\n");
4185#endif
4186 htmlParseDocTypeDecl(ctxt);
4187 ctxt->instate = XML_PARSER_PROLOG;
4188#ifdef DEBUG_PUSH
4189 xmlGenericError(xmlGenericErrorContext,
4190 "HPP: entering PROLOG\n");
4191#endif
4192 } else if ((cur == '<') && (next == '!') &&
4193 (avail < 9)) {
4194 goto done;
4195 } else {
4196 ctxt->instate = XML_PARSER_START_TAG;
4197#ifdef DEBUG_PUSH
4198 xmlGenericError(xmlGenericErrorContext,
4199 "HPP: entering START_TAG\n");
4200#endif
4201 }
4202 break;
4203 case XML_PARSER_PROLOG:
4204 SKIP_BLANKS;
4205 if (in->buf == NULL)
4206 avail = in->length - (in->cur - in->base);
4207 else
4208 avail = in->buf->buffer->use - (in->cur - in->base);
4209 if (avail < 2)
4210 goto done;
4211 cur = in->cur[0];
4212 next = in->cur[1];
4213 if ((cur == '<') && (next == '!') &&
4214 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4215 if ((!terminate) &&
4216 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4217 goto done;
4218#ifdef DEBUG_PUSH
4219 xmlGenericError(xmlGenericErrorContext,
4220 "HPP: Parsing Comment\n");
4221#endif
4222 htmlParseComment(ctxt);
4223 ctxt->instate = XML_PARSER_PROLOG;
4224 } else if ((cur == '<') && (next == '!') &&
4225 (avail < 4)) {
4226 goto done;
4227 } else {
4228 ctxt->instate = XML_PARSER_START_TAG;
4229#ifdef DEBUG_PUSH
4230 xmlGenericError(xmlGenericErrorContext,
4231 "HPP: entering START_TAG\n");
4232#endif
4233 }
4234 break;
4235 case XML_PARSER_EPILOG:
4236 if (in->buf == NULL)
4237 avail = in->length - (in->cur - in->base);
4238 else
4239 avail = in->buf->buffer->use - (in->cur - in->base);
4240 if (avail < 1)
4241 goto done;
4242 cur = in->cur[0];
4243 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004244 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004245 goto done;
4246 }
4247 if (avail < 2)
4248 goto done;
4249 next = in->cur[1];
4250 if ((cur == '<') && (next == '!') &&
4251 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4252 if ((!terminate) &&
4253 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4254 goto done;
4255#ifdef DEBUG_PUSH
4256 xmlGenericError(xmlGenericErrorContext,
4257 "HPP: Parsing Comment\n");
4258#endif
4259 htmlParseComment(ctxt);
4260 ctxt->instate = XML_PARSER_EPILOG;
4261 } else if ((cur == '<') && (next == '!') &&
4262 (avail < 4)) {
4263 goto done;
4264 } else {
4265 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004266 ctxt->wellFormed = 0;
4267 ctxt->instate = XML_PARSER_EOF;
4268#ifdef DEBUG_PUSH
4269 xmlGenericError(xmlGenericErrorContext,
4270 "HPP: entering EOF\n");
4271#endif
4272 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4273 ctxt->sax->endDocument(ctxt->userData);
4274 goto done;
4275 }
4276 break;
4277 case XML_PARSER_START_TAG: {
4278 xmlChar *name, *oldname;
4279 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004280 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004281
4282 if (avail < 2)
4283 goto done;
4284 cur = in->cur[0];
4285 if (cur != '<') {
4286 ctxt->instate = XML_PARSER_CONTENT;
4287#ifdef DEBUG_PUSH
4288 xmlGenericError(xmlGenericErrorContext,
4289 "HPP: entering CONTENT\n");
4290#endif
4291 break;
4292 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004293 if (in->cur[1] == '/') {
4294 ctxt->instate = XML_PARSER_END_TAG;
4295 ctxt->checkIndex = 0;
4296#ifdef DEBUG_PUSH
4297 xmlGenericError(xmlGenericErrorContext,
4298 "HPP: entering END_TAG\n");
4299#endif
4300 break;
4301 }
Owen Taylor3473f882001-02-23 17:55:21 +00004302 if ((!terminate) &&
4303 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4304 goto done;
4305
4306 oldname = xmlStrdup(ctxt->name);
4307 htmlParseStartTag(ctxt);
4308 name = ctxt->name;
4309#ifdef DEBUG
4310 if (oldname == NULL)
4311 xmlGenericError(xmlGenericErrorContext,
4312 "Start of element %s\n", name);
4313 else if (name == NULL)
4314 xmlGenericError(xmlGenericErrorContext,
4315 "Start of element failed, was %s\n",
4316 oldname);
4317 else
4318 xmlGenericError(xmlGenericErrorContext,
4319 "Start of element %s, was %s\n",
4320 name, oldname);
4321#endif
4322 if (((depth == ctxt->nameNr) &&
4323 (xmlStrEqual(oldname, ctxt->name))) ||
4324 (name == NULL)) {
4325 if (CUR == '>')
4326 NEXT;
4327 if (oldname != NULL)
4328 xmlFree(oldname);
4329 break;
4330 }
4331 if (oldname != NULL)
4332 xmlFree(oldname);
4333
4334 /*
4335 * Lookup the info for that element.
4336 */
4337 info = htmlTagLookup(name);
4338 if (info == NULL) {
4339 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4340 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4341 name);
4342 ctxt->wellFormed = 0;
4343 } else if (info->depr) {
4344 /***************************
4345 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4346 ctxt->sax->warning(ctxt->userData,
4347 "Tag %s is deprecated\n",
4348 name);
4349 ***************************/
4350 }
4351
4352 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004353 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004354 */
4355 if ((CUR == '/') && (NXT(1) == '>')) {
4356 SKIP(2);
4357 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4358 ctxt->sax->endElement(ctxt->userData, name);
4359 oldname = htmlnamePop(ctxt);
4360#ifdef DEBUG
4361 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4362 oldname);
4363#endif
4364 if (oldname != NULL)
4365 xmlFree(oldname);
4366 ctxt->instate = XML_PARSER_CONTENT;
4367#ifdef DEBUG_PUSH
4368 xmlGenericError(xmlGenericErrorContext,
4369 "HPP: entering CONTENT\n");
4370#endif
4371 break;
4372 }
4373
4374 if (CUR == '>') {
4375 NEXT;
4376 } else {
4377 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4378 ctxt->sax->error(ctxt->userData,
4379 "Couldn't find end of Start Tag %s\n",
4380 name);
4381 ctxt->wellFormed = 0;
4382
4383 /*
4384 * end of parsing of this node.
4385 */
4386 if (xmlStrEqual(name, ctxt->name)) {
4387 nodePop(ctxt);
4388 oldname = htmlnamePop(ctxt);
4389#ifdef DEBUG
4390 xmlGenericError(xmlGenericErrorContext,
4391 "End of start tag problem: popping out %s\n", oldname);
4392#endif
4393 if (oldname != NULL)
4394 xmlFree(oldname);
4395 }
4396
4397 ctxt->instate = XML_PARSER_CONTENT;
4398#ifdef DEBUG_PUSH
4399 xmlGenericError(xmlGenericErrorContext,
4400 "HPP: entering CONTENT\n");
4401#endif
4402 break;
4403 }
4404
4405 /*
4406 * Check for an Empty Element from DTD definition
4407 */
4408 if ((info != NULL) && (info->empty)) {
4409 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4410 ctxt->sax->endElement(ctxt->userData, name);
4411 oldname = htmlnamePop(ctxt);
4412#ifdef DEBUG
4413 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4414#endif
4415 if (oldname != NULL)
4416 xmlFree(oldname);
4417 }
4418 ctxt->instate = XML_PARSER_CONTENT;
4419#ifdef DEBUG_PUSH
4420 xmlGenericError(xmlGenericErrorContext,
4421 "HPP: entering CONTENT\n");
4422#endif
4423 break;
4424 }
4425 case XML_PARSER_CONTENT: {
4426 long cons;
4427 /*
4428 * Handle preparsed entities and charRef
4429 */
4430 if (ctxt->token != 0) {
4431 xmlChar chr[2] = { 0 , 0 } ;
4432
4433 chr[0] = (xmlChar) ctxt->token;
4434 htmlCheckParagraph(ctxt);
4435 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4436 ctxt->sax->characters(ctxt->userData, chr, 1);
4437 ctxt->token = 0;
4438 ctxt->checkIndex = 0;
4439 }
4440 if ((avail == 1) && (terminate)) {
4441 cur = in->cur[0];
4442 if ((cur != '<') && (cur != '&')) {
4443 if (ctxt->sax != NULL) {
4444 if (IS_BLANK(cur)) {
4445 if (ctxt->sax->ignorableWhitespace != NULL)
4446 ctxt->sax->ignorableWhitespace(
4447 ctxt->userData, &cur, 1);
4448 } else {
4449 htmlCheckParagraph(ctxt);
4450 if (ctxt->sax->characters != NULL)
4451 ctxt->sax->characters(
4452 ctxt->userData, &cur, 1);
4453 }
4454 }
4455 ctxt->token = 0;
4456 ctxt->checkIndex = 0;
4457 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004458 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004459 }
Owen Taylor3473f882001-02-23 17:55:21 +00004460 }
4461 if (avail < 2)
4462 goto done;
4463 cur = in->cur[0];
4464 next = in->cur[1];
4465 cons = ctxt->nbChars;
4466 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4467 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4468 /*
4469 * Handle SCRIPT/STYLE separately
4470 */
4471 if ((!terminate) &&
4472 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4473 goto done;
4474 htmlParseScript(ctxt);
4475 if ((cur == '<') && (next == '/')) {
4476 ctxt->instate = XML_PARSER_END_TAG;
4477 ctxt->checkIndex = 0;
4478#ifdef DEBUG_PUSH
4479 xmlGenericError(xmlGenericErrorContext,
4480 "HPP: entering END_TAG\n");
4481#endif
4482 break;
4483 }
4484 } else {
4485 /*
4486 * Sometimes DOCTYPE arrives in the middle of the document
4487 */
4488 if ((cur == '<') && (next == '!') &&
4489 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4490 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4491 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4492 (UPP(8) == 'E')) {
4493 if ((!terminate) &&
4494 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4495 goto done;
4496 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4497 ctxt->sax->error(ctxt->userData,
4498 "Misplaced DOCTYPE declaration\n");
4499 ctxt->wellFormed = 0;
4500 htmlParseDocTypeDecl(ctxt);
4501 } else if ((cur == '<') && (next == '!') &&
4502 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4503 if ((!terminate) &&
4504 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4505 goto done;
4506#ifdef DEBUG_PUSH
4507 xmlGenericError(xmlGenericErrorContext,
4508 "HPP: Parsing Comment\n");
4509#endif
4510 htmlParseComment(ctxt);
4511 ctxt->instate = XML_PARSER_CONTENT;
4512 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4513 goto done;
4514 } else if ((cur == '<') && (next == '/')) {
4515 ctxt->instate = XML_PARSER_END_TAG;
4516 ctxt->checkIndex = 0;
4517#ifdef DEBUG_PUSH
4518 xmlGenericError(xmlGenericErrorContext,
4519 "HPP: entering END_TAG\n");
4520#endif
4521 break;
4522 } else if (cur == '<') {
4523 ctxt->instate = XML_PARSER_START_TAG;
4524 ctxt->checkIndex = 0;
4525#ifdef DEBUG_PUSH
4526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: entering START_TAG\n");
4528#endif
4529 break;
4530 } else if (cur == '&') {
4531 if ((!terminate) &&
4532 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4533 goto done;
4534#ifdef DEBUG_PUSH
4535 xmlGenericError(xmlGenericErrorContext,
4536 "HPP: Parsing Reference\n");
4537#endif
4538 /* TODO: check generation of subtrees if noent !!! */
4539 htmlParseReference(ctxt);
4540 } else {
4541 /* TODO Avoid the extra copy, handle directly !!!!!! */
4542 /*
4543 * Goal of the following test is :
4544 * - minimize calls to the SAX 'character' callback
4545 * when they are mergeable
4546 */
4547 if ((ctxt->inputNr == 1) &&
4548 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4549 if ((!terminate) &&
4550 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4551 goto done;
4552 }
4553 ctxt->checkIndex = 0;
4554#ifdef DEBUG_PUSH
4555 xmlGenericError(xmlGenericErrorContext,
4556 "HPP: Parsing char data\n");
4557#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004558 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004559 }
4560 }
4561 if (cons == ctxt->nbChars) {
4562 if (ctxt->node != NULL) {
4563 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4564 ctxt->sax->error(ctxt->userData,
4565 "detected an error in element content\n");
4566 ctxt->wellFormed = 0;
4567 }
4568 NEXT;
4569 break;
4570 }
4571
4572 break;
4573 }
4574 case XML_PARSER_END_TAG:
4575 if (avail < 2)
4576 goto done;
4577 if ((!terminate) &&
4578 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4579 goto done;
4580 htmlParseEndTag(ctxt);
4581 if (ctxt->nameNr == 0) {
4582 ctxt->instate = XML_PARSER_EPILOG;
4583 } else {
4584 ctxt->instate = XML_PARSER_CONTENT;
4585 }
4586 ctxt->checkIndex = 0;
4587#ifdef DEBUG_PUSH
4588 xmlGenericError(xmlGenericErrorContext,
4589 "HPP: entering CONTENT\n");
4590#endif
4591 break;
4592 case XML_PARSER_CDATA_SECTION:
4593 xmlGenericError(xmlGenericErrorContext,
4594 "HPP: internal error, state == CDATA\n");
4595 ctxt->instate = XML_PARSER_CONTENT;
4596 ctxt->checkIndex = 0;
4597#ifdef DEBUG_PUSH
4598 xmlGenericError(xmlGenericErrorContext,
4599 "HPP: entering CONTENT\n");
4600#endif
4601 break;
4602 case XML_PARSER_DTD:
4603 xmlGenericError(xmlGenericErrorContext,
4604 "HPP: internal error, state == DTD\n");
4605 ctxt->instate = XML_PARSER_CONTENT;
4606 ctxt->checkIndex = 0;
4607#ifdef DEBUG_PUSH
4608 xmlGenericError(xmlGenericErrorContext,
4609 "HPP: entering CONTENT\n");
4610#endif
4611 break;
4612 case XML_PARSER_COMMENT:
4613 xmlGenericError(xmlGenericErrorContext,
4614 "HPP: internal error, state == COMMENT\n");
4615 ctxt->instate = XML_PARSER_CONTENT;
4616 ctxt->checkIndex = 0;
4617#ifdef DEBUG_PUSH
4618 xmlGenericError(xmlGenericErrorContext,
4619 "HPP: entering CONTENT\n");
4620#endif
4621 break;
4622 case XML_PARSER_PI:
4623 xmlGenericError(xmlGenericErrorContext,
4624 "HPP: internal error, state == PI\n");
4625 ctxt->instate = XML_PARSER_CONTENT;
4626 ctxt->checkIndex = 0;
4627#ifdef DEBUG_PUSH
4628 xmlGenericError(xmlGenericErrorContext,
4629 "HPP: entering CONTENT\n");
4630#endif
4631 break;
4632 case XML_PARSER_ENTITY_DECL:
4633 xmlGenericError(xmlGenericErrorContext,
4634 "HPP: internal error, state == ENTITY_DECL\n");
4635 ctxt->instate = XML_PARSER_CONTENT;
4636 ctxt->checkIndex = 0;
4637#ifdef DEBUG_PUSH
4638 xmlGenericError(xmlGenericErrorContext,
4639 "HPP: entering CONTENT\n");
4640#endif
4641 break;
4642 case XML_PARSER_ENTITY_VALUE:
4643 xmlGenericError(xmlGenericErrorContext,
4644 "HPP: internal error, state == ENTITY_VALUE\n");
4645 ctxt->instate = XML_PARSER_CONTENT;
4646 ctxt->checkIndex = 0;
4647#ifdef DEBUG_PUSH
4648 xmlGenericError(xmlGenericErrorContext,
4649 "HPP: entering DTD\n");
4650#endif
4651 break;
4652 case XML_PARSER_ATTRIBUTE_VALUE:
4653 xmlGenericError(xmlGenericErrorContext,
4654 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4655 ctxt->instate = XML_PARSER_START_TAG;
4656 ctxt->checkIndex = 0;
4657#ifdef DEBUG_PUSH
4658 xmlGenericError(xmlGenericErrorContext,
4659 "HPP: entering START_TAG\n");
4660#endif
4661 break;
4662 case XML_PARSER_SYSTEM_LITERAL:
4663 xmlGenericError(xmlGenericErrorContext,
4664 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4665 ctxt->instate = XML_PARSER_CONTENT;
4666 ctxt->checkIndex = 0;
4667#ifdef DEBUG_PUSH
4668 xmlGenericError(xmlGenericErrorContext,
4669 "HPP: entering CONTENT\n");
4670#endif
4671 break;
4672 case XML_PARSER_IGNORE:
4673 xmlGenericError(xmlGenericErrorContext,
4674 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4675 ctxt->instate = XML_PARSER_CONTENT;
4676 ctxt->checkIndex = 0;
4677#ifdef DEBUG_PUSH
4678 xmlGenericError(xmlGenericErrorContext,
4679 "HPP: entering CONTENT\n");
4680#endif
4681 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004682 case XML_PARSER_PUBLIC_LITERAL:
4683 xmlGenericError(xmlGenericErrorContext,
4684 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4685 ctxt->instate = XML_PARSER_CONTENT;
4686 ctxt->checkIndex = 0;
4687#ifdef DEBUG_PUSH
4688 xmlGenericError(xmlGenericErrorContext,
4689 "HPP: entering CONTENT\n");
4690#endif
4691 break;
4692
Owen Taylor3473f882001-02-23 17:55:21 +00004693 }
4694 }
4695done:
4696 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004697 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004698 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4699 /*
4700 * SAX: end of the document processing.
4701 */
4702 ctxt->instate = XML_PARSER_EOF;
4703 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4704 ctxt->sax->endDocument(ctxt->userData);
4705 }
4706 }
4707 if ((ctxt->myDoc != NULL) &&
4708 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4709 (ctxt->instate == XML_PARSER_EPILOG))) {
4710 xmlDtdPtr dtd;
4711 dtd = xmlGetIntSubset(ctxt->myDoc);
4712 if (dtd == NULL)
4713 ctxt->myDoc->intSubset =
4714 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4715 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4716 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4717 }
4718#ifdef DEBUG_PUSH
4719 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4720#endif
4721 return(ret);
4722}
4723
4724/**
Owen Taylor3473f882001-02-23 17:55:21 +00004725 * htmlParseChunk:
4726 * @ctxt: an XML parser context
4727 * @chunk: an char array
4728 * @size: the size in byte of the chunk
4729 * @terminate: last chunk indicator
4730 *
4731 * Parse a Chunk of memory
4732 *
4733 * Returns zero if no error, the xmlParserErrors otherwise.
4734 */
4735int
4736htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4737 int terminate) {
4738 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4739 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4740 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4741 int cur = ctxt->input->cur - ctxt->input->base;
4742
4743 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4744 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4745 ctxt->input->cur = ctxt->input->base + cur;
4746#ifdef DEBUG_PUSH
4747 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4748#endif
4749
4750 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4751 htmlParseTryOrFinish(ctxt, terminate);
4752 } else if (ctxt->instate != XML_PARSER_EOF) {
4753 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4754 htmlParseTryOrFinish(ctxt, terminate);
4755 }
4756 if (terminate) {
4757 if ((ctxt->instate != XML_PARSER_EOF) &&
4758 (ctxt->instate != XML_PARSER_EPILOG) &&
4759 (ctxt->instate != XML_PARSER_MISC)) {
4760 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004761 ctxt->wellFormed = 0;
4762 }
4763 if (ctxt->instate != XML_PARSER_EOF) {
4764 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4765 ctxt->sax->endDocument(ctxt->userData);
4766 }
4767 ctxt->instate = XML_PARSER_EOF;
4768 }
4769 return((xmlParserErrors) ctxt->errNo);
4770}
4771
4772/************************************************************************
4773 * *
4774 * User entry points *
4775 * *
4776 ************************************************************************/
4777
4778/**
4779 * htmlCreatePushParserCtxt :
4780 * @sax: a SAX handler
4781 * @user_data: The user data returned on SAX callbacks
4782 * @chunk: a pointer to an array of chars
4783 * @size: number of chars in the array
4784 * @filename: an optional file name or URI
4785 * @enc: an optional encoding
4786 *
4787 * Create a parser context for using the HTML parser in push mode
4788 * To allow content encoding detection, @size should be >= 4
4789 * The value of @filename is used for fetching external entities
4790 * and error/warning reports.
4791 *
4792 * Returns the new parser context or NULL
4793 */
4794htmlParserCtxtPtr
4795htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4796 const char *chunk, int size, const char *filename,
4797 xmlCharEncoding enc) {
4798 htmlParserCtxtPtr ctxt;
4799 htmlParserInputPtr inputStream;
4800 xmlParserInputBufferPtr buf;
4801
Daniel Veillardd0463562001-10-13 09:15:48 +00004802 xmlInitParser();
4803
Owen Taylor3473f882001-02-23 17:55:21 +00004804 buf = xmlAllocParserInputBuffer(enc);
4805 if (buf == NULL) return(NULL);
4806
4807 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4808 if (ctxt == NULL) {
4809 xmlFree(buf);
4810 return(NULL);
4811 }
4812 memset(ctxt, 0, sizeof(htmlParserCtxt));
4813 htmlInitParserCtxt(ctxt);
4814 if (sax != NULL) {
4815 if (ctxt->sax != &htmlDefaultSAXHandler)
4816 xmlFree(ctxt->sax);
4817 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4818 if (ctxt->sax == NULL) {
4819 xmlFree(buf);
4820 xmlFree(ctxt);
4821 return(NULL);
4822 }
4823 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4824 if (user_data != NULL)
4825 ctxt->userData = user_data;
4826 }
4827 if (filename == NULL) {
4828 ctxt->directory = NULL;
4829 } else {
4830 ctxt->directory = xmlParserGetDirectory(filename);
4831 }
4832
4833 inputStream = htmlNewInputStream(ctxt);
4834 if (inputStream == NULL) {
4835 xmlFreeParserCtxt(ctxt);
4836 return(NULL);
4837 }
4838
4839 if (filename == NULL)
4840 inputStream->filename = NULL;
4841 else
4842 inputStream->filename = xmlMemStrdup(filename);
4843 inputStream->buf = buf;
4844 inputStream->base = inputStream->buf->buffer->content;
4845 inputStream->cur = inputStream->buf->buffer->content;
4846
4847 inputPush(ctxt, inputStream);
4848
4849 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4850 (ctxt->input->buf != NULL)) {
4851 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4852#ifdef DEBUG_PUSH
4853 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4854#endif
4855 }
4856
4857 return(ctxt);
4858}
4859
4860/**
4861 * htmlSAXParseDoc :
4862 * @cur: a pointer to an array of xmlChar
4863 * @encoding: a free form C string describing the HTML document encoding, or NULL
4864 * @sax: the SAX handler block
4865 * @userData: if using SAX, this pointer will be provided on callbacks.
4866 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004867 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4868 * to handle parse events. If sax is NULL, fallback to the default DOM
4869 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004870 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004871 * Returns the resulting document tree unless SAX is NULL or the document is
4872 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004873 */
4874
4875htmlDocPtr
4876htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4877 htmlDocPtr ret;
4878 htmlParserCtxtPtr ctxt;
4879
Daniel Veillardd0463562001-10-13 09:15:48 +00004880 xmlInitParser();
4881
Owen Taylor3473f882001-02-23 17:55:21 +00004882 if (cur == NULL) return(NULL);
4883
4884
4885 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4886 if (ctxt == NULL) return(NULL);
4887 if (sax != NULL) {
4888 ctxt->sax = sax;
4889 ctxt->userData = userData;
4890 }
4891
4892 htmlParseDocument(ctxt);
4893 ret = ctxt->myDoc;
4894 if (sax != NULL) {
4895 ctxt->sax = NULL;
4896 ctxt->userData = NULL;
4897 }
4898 htmlFreeParserCtxt(ctxt);
4899
4900 return(ret);
4901}
4902
4903/**
4904 * htmlParseDoc :
4905 * @cur: a pointer to an array of xmlChar
4906 * @encoding: a free form C string describing the HTML document encoding, or NULL
4907 *
4908 * parse an HTML in-memory document and build a tree.
4909 *
4910 * Returns the resulting document tree
4911 */
4912
4913htmlDocPtr
4914htmlParseDoc(xmlChar *cur, const char *encoding) {
4915 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4916}
4917
4918
4919/**
4920 * htmlCreateFileParserCtxt :
4921 * @filename: the filename
4922 * @encoding: a free form C string describing the HTML document encoding, or NULL
4923 *
4924 * Create a parser context for a file content.
4925 * Automatic support for ZLIB/Compress compressed document is provided
4926 * by default if found at compile-time.
4927 *
4928 * Returns the new parser context or NULL
4929 */
4930htmlParserCtxtPtr
4931htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4932{
4933 htmlParserCtxtPtr ctxt;
4934 htmlParserInputPtr inputStream;
4935 xmlParserInputBufferPtr buf;
4936 /* htmlCharEncoding enc; */
4937 xmlChar *content, *content_line = (xmlChar *) "charset=";
4938
4939 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4940 if (buf == NULL) return(NULL);
4941
4942 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4943 if (ctxt == NULL) {
4944 perror("malloc");
4945 return(NULL);
4946 }
4947 memset(ctxt, 0, sizeof(htmlParserCtxt));
4948 htmlInitParserCtxt(ctxt);
4949 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4950 if (inputStream == NULL) {
4951 perror("malloc");
4952 xmlFree(ctxt);
4953 return(NULL);
4954 }
4955 memset(inputStream, 0, sizeof(htmlParserInput));
4956
4957 inputStream->filename = xmlMemStrdup(filename);
4958 inputStream->line = 1;
4959 inputStream->col = 1;
4960 inputStream->buf = buf;
4961 inputStream->directory = NULL;
4962
4963 inputStream->base = inputStream->buf->buffer->content;
4964 inputStream->cur = inputStream->buf->buffer->content;
4965 inputStream->free = NULL;
4966
4967 inputPush(ctxt, inputStream);
4968
4969 /* set encoding */
4970 if (encoding) {
4971 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4972 if (content) {
4973 strcpy ((char *)content, (char *)content_line);
4974 strcat ((char *)content, (char *)encoding);
4975 htmlCheckEncoding (ctxt, content);
4976 xmlFree (content);
4977 }
4978 }
4979
4980 return(ctxt);
4981}
4982
4983/**
4984 * htmlSAXParseFile :
4985 * @filename: the filename
4986 * @encoding: a free form C string describing the HTML document encoding, or NULL
4987 * @sax: the SAX handler block
4988 * @userData: if using SAX, this pointer will be provided on callbacks.
4989 *
4990 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4991 * compressed document is provided by default if found at compile-time.
4992 * It use the given SAX function block to handle the parsing callback.
4993 * If sax is NULL, fallback to the default DOM tree building routines.
4994 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004995 * Returns the resulting document tree unless SAX is NULL or the document is
4996 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004997 */
4998
4999htmlDocPtr
5000htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5001 void *userData) {
5002 htmlDocPtr ret;
5003 htmlParserCtxtPtr ctxt;
5004 htmlSAXHandlerPtr oldsax = NULL;
5005
Daniel Veillardd0463562001-10-13 09:15:48 +00005006 xmlInitParser();
5007
Owen Taylor3473f882001-02-23 17:55:21 +00005008 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5009 if (ctxt == NULL) return(NULL);
5010 if (sax != NULL) {
5011 oldsax = ctxt->sax;
5012 ctxt->sax = sax;
5013 ctxt->userData = userData;
5014 }
5015
5016 htmlParseDocument(ctxt);
5017
5018 ret = ctxt->myDoc;
5019 if (sax != NULL) {
5020 ctxt->sax = oldsax;
5021 ctxt->userData = NULL;
5022 }
5023 htmlFreeParserCtxt(ctxt);
5024
5025 return(ret);
5026}
5027
5028/**
5029 * htmlParseFile :
5030 * @filename: the filename
5031 * @encoding: a free form C string describing the HTML document encoding, or NULL
5032 *
5033 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5034 * compressed document is provided by default if found at compile-time.
5035 *
5036 * Returns the resulting document tree
5037 */
5038
5039htmlDocPtr
5040htmlParseFile(const char *filename, const char *encoding) {
5041 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5042}
5043
5044/**
5045 * htmlHandleOmittedElem:
5046 * @val: int 0 or 1
5047 *
5048 * Set and return the previous value for handling HTML omitted tags.
5049 *
5050 * Returns the last value for 0 for no handling, 1 for auto insertion.
5051 */
5052
5053int
5054htmlHandleOmittedElem(int val) {
5055 int old = htmlOmittedDefaultValue;
5056
5057 htmlOmittedDefaultValue = val;
5058 return(old);
5059}
5060
5061#endif /* LIBXML_HTML_ENABLED */