blob: 7200a3b1aa9f0631fe4770887078f6b224bd534b [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
/*
 * NOTE(review): presumably name/buffer size limits for the parser; their
 * uses are not visible in this part of the file - confirm before changing.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to compile in the xmlGenericError() traces guarded by DEBUG. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1.
 * NOTE(review): its readers are outside this part of the file.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations for functions defined later in the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
59/************************************************************************
60 * *
Owen Taylor3473f882001-02-23 17:55:21 +000061 * Parser stacks related functions and macros *
62 * *
63 ************************************************************************/
64
65/*
66 * Generic function for accessing stacks in the Parser Context
67 */
68
69#define PUSH_AND_POP(scope, type, name) \
70scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
71 if (ctxt->name##Nr >= ctxt->name##Max) { \
72 ctxt->name##Max *= 2; \
73 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75 if (ctxt->name##Tab == NULL) { \
76 xmlGenericError(xmlGenericErrorContext, \
77 "realloc failed !\n"); \
78 return(0); \
79 } \
80 } \
81 ctxt->name##Tab[ctxt->name##Nr] = value; \
82 ctxt->name = value; \
83 return(ctxt->name##Nr++); \
84} \
85scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
86 type ret; \
87 if (ctxt->name##Nr < 0) return(0); \
88 ctxt->name##Nr--; \
89 if (ctxt->name##Nr < 0) return(0); \
90 if (ctxt->name##Nr > 0) \
91 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
92 else \
93 ctxt->name = NULL; \
94 ret = ctxt->name##Tab[ctxt->name##Nr]; \
95 ctxt->name##Tab[ctxt->name##Nr] = 0; \
96 return(ret); \
97} \
98
Daniel Veillard56a4cb82001-03-24 17:00:36 +000099/* PUSH_AND_POP(static, xmlNodePtr, node) */
100PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000101
102/*
103 * Macros for accessing the content. Those should be used only by the parser,
104 * and not exported.
105 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use them
107 *
108 * CUR_PTR return the current pointer to the xmlChar to be parsed.
109 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
110 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
111 * in UNICODE mode. This should be used internally by the parser
112 * only to compare to ASCII values otherwise it would break when
113 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
116 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
117 * it should be used only to compare on ASCII based substring.
118 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
119 * strings within the parser.
120 *
121 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
122 *
123 * CURRENT Returns the current char value, with the full decoding of
124 * UTF-8 if we are using this mode. It returns an int.
125 * NEXT Skip to the next character, this does the proper decoding
126 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
127 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
128 */
129
/*
 * All of the macros below expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 *
 * Every expression-like macro is fully parenthesized so that it can be
 * embedded in a larger expression without precedence surprises.
 */

/* Current input byte passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input cursor and the character counter by val. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* The val'th byte after the cursor. */
#define NXT(val) (ctxt->input->cur[(val)])

/* The val'th byte after the cursor, passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The input cursor itself (usable as an lvalue). */
#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int (no multi-byte decoding is done here). */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* -1 while a token is pending in ctxt->token, else the current byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Step over the current character, which occupies l bytes: update the
 * line/column bookkeeping, clear any pending token, move the cursor by
 * l bytes and count one character.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)
165
166/************
167 \
168 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
169 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
170 ************/
171
/*
 * Character-level helpers; they expect `ctxt` to be in scope.
 *
 *   CUR_CHAR(l)       current character via htmlCurrentChar(); its byte
 *                     length is stored in l.
 *   CUR_SCHAR(s, l)   same, reading from string s via
 *                     xmlStringCurrentChar().
 *   COPY_BUF(l,b,i,v) append character v, of byte length l, to buffer b
 *                     at index i and advance i.  It expands to a bare
 *                     if/else statement, so use it only as a full
 *                     statement followed by ';'.
 *
 * Arguments are parenthesized so that expression arguments expand safely.
 */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v)                                               \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
178
179/**
180 * htmlCurrentChar:
181 * @ctxt: the HTML parser context
182 * @len: pointer to the length of the char read
183 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000184 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000185 * bytes in the input buffer. Implement the end of line normalization:
186 * 2.11 End-of-Line Handling
187 * If the encoding is unspecified, in the case we find an ISO-Latin-1
188 * char, then the encoding converter is plugged in automatically.
189 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000190 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000191 */
192
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000193static int
Owen Taylor3473f882001-02-23 17:55:21 +0000194htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
195 if (ctxt->instate == XML_PARSER_EOF)
196 return(0);
197
198 if (ctxt->token != 0) {
199 *len = 0;
200 return(ctxt->token);
201 }
202 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
203 /*
204 * We are supposed to handle UTF8, check it's valid
205 * From rfc2044: encoding of the Unicode values on UTF-8:
206 *
207 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
208 * 0000 0000-0000 007F 0xxxxxxx
209 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
210 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
211 *
212 * Check for the 0x110000 limit too
213 */
214 const unsigned char *cur = ctxt->input->cur;
215 unsigned char c;
216 unsigned int val;
217
218 c = *cur;
219 if (c & 0x80) {
220 if (cur[1] == 0)
221 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
222 if ((cur[1] & 0xc0) != 0x80)
223 goto encoding_error;
224 if ((c & 0xe0) == 0xe0) {
225
226 if (cur[2] == 0)
227 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
228 if ((cur[2] & 0xc0) != 0x80)
229 goto encoding_error;
230 if ((c & 0xf0) == 0xf0) {
231 if (cur[3] == 0)
232 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
233 if (((c & 0xf8) != 0xf0) ||
234 ((cur[3] & 0xc0) != 0x80))
235 goto encoding_error;
236 /* 4-byte code */
237 *len = 4;
238 val = (cur[0] & 0x7) << 18;
239 val |= (cur[1] & 0x3f) << 12;
240 val |= (cur[2] & 0x3f) << 6;
241 val |= cur[3] & 0x3f;
242 } else {
243 /* 3-byte code */
244 *len = 3;
245 val = (cur[0] & 0xf) << 12;
246 val |= (cur[1] & 0x3f) << 6;
247 val |= cur[2] & 0x3f;
248 }
249 } else {
250 /* 2-byte code */
251 *len = 2;
252 val = (cur[0] & 0x1f) << 6;
253 val |= cur[1] & 0x3f;
254 }
255 if (!IS_CHAR(val)) {
256 ctxt->errNo = XML_ERR_INVALID_ENCODING;
257 if ((ctxt->sax != NULL) &&
258 (ctxt->sax->error != NULL))
259 ctxt->sax->error(ctxt->userData,
260 "Char 0x%X out of allowed range\n", val);
261 ctxt->wellFormed = 0;
262 ctxt->disableSAX = 1;
263 }
264 return(val);
265 } else {
266 /* 1-byte code */
267 *len = 1;
268 return((int) *ctxt->input->cur);
269 }
270 }
271 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000272 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000273 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000274 * XML constructs only use < 128 chars
275 */
276 *len = 1;
277 if ((int) *ctxt->input->cur < 0x80)
278 return((int) *ctxt->input->cur);
279
280 /*
281 * Humm this is bad, do an automatic flow conversion
282 */
283 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
284 ctxt->charset = XML_CHAR_ENCODING_UTF8;
285 return(xmlCurrentChar(ctxt, len));
286
287encoding_error:
288 /*
289 * If we detect an UTF8 error that probably mean that the
290 * input encoding didn't get properly advertized in the
291 * declaration header. Report the error and switch the encoding
292 * to ISO-Latin-1 (if you don't like this policy, just declare the
293 * encoding !)
294 */
295 ctxt->errNo = XML_ERR_INVALID_ENCODING;
296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
297 ctxt->sax->error(ctxt->userData,
298 "Input is not proper UTF-8, indicate encoding !\n");
299 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
300 ctxt->input->cur[0], ctxt->input->cur[1],
301 ctxt->input->cur[2], ctxt->input->cur[3]);
302 }
303
304 ctxt->charset = XML_CHAR_ENCODING_8859_1;
305 *len = 1;
306 return((int) *ctxt->input->cur);
307}
308
309/**
Owen Taylor3473f882001-02-23 17:55:21 +0000310 * htmlSkipBlankChars:
311 * @ctxt: the HTML parser context
312 *
313 * skip all blanks character found at that point in the input streams.
314 *
315 * Returns the number of space chars skipped
316 */
317
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000318static int
Owen Taylor3473f882001-02-23 17:55:21 +0000319htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
320 int res = 0;
321
322 while (IS_BLANK(*(ctxt->input->cur))) {
323 if ((*ctxt->input->cur == 0) &&
324 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
325 xmlPopInput(ctxt);
326 } else {
327 if (*(ctxt->input->cur) == '\n') {
328 ctxt->input->line++; ctxt->input->col = 1;
329 } else ctxt->input->col++;
330 ctxt->input->cur++;
331 ctxt->nbChars++;
332 if (*ctxt->input->cur == 0)
333 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
334 }
335 res++;
336 }
337 return(res);
338}
339
340
341
342/************************************************************************
343 * *
344 * The list of HTML elements and their properties *
345 * *
346 ************************************************************************/
347
348/*
349 * Start Tag: 1 means the start tag can be ommited
350 * End Tag: 1 means the end tag can be ommited
351 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000352 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000353 * Depr: this element is deprecated
354 * DTD: 1 means that this element is valid only in the Loose DTD
355 * 2 means that this element is valid only in the Frameset DTD
356 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000357 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
Daniel Veillard22090732001-07-16 00:06:07 +0000359static const htmlElemDesc
360html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000361{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
362{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
363{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
364{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
365{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
366{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
367{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
368{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
369{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
370{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
371{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
372{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
373{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
374{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
375{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
376{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
377{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
378{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
379{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
380{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
381{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
382{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
383{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
384{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
385{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
386{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
387{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
388{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
389{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
390{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
391{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
392{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
393{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
394{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
395{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
400{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
401{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
402{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
403{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
404{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
405{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
406{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
407{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
408{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
409{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
410{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
411{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
412{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
413{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
414{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
415{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
416{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
417{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
418{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
419{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
420{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
421{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
422{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
423{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
424{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
425{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
426{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
427{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
428{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
429{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
430{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
431{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
432{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
433{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
434{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
435{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
436{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
437{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
438{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
439{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
440{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
441{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
442{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
443{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
444{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
445{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
446{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
447{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
448{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
449{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
450{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
451{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000452};
453
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups.  The first string of
 * a group is a start tag; the strings after it, up to the NULL, are the
 * open elements which that start tag closes.  An extra NULL ends the list.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
513
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
527
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The form reset event attribute is spelled "onreset"; the previous
 * "onrest" entry could never match a real attribute name.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
553
/*
 * End-tag priorities, used by the parser to cope with broken HTML pages:
 * an end tag may only close open elements whose priority is lower than
 * or equal to its own, which lets the parser decide what to do with
 * stray end tags.
 */

/* One element name together with its end-tag priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The list is terminated by the entry whose name is NULL; that entry
 * carries the default priority used for every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000581
/*
 * Lookup index into htmlStartClose: each used slot points at the first
 * string of one group.  It is filled by htmlInitAutoClose(), which also
 * raises the flag below once done.  Both objects have static storage and
 * therefore start out zeroed.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized;
584
585/************************************************************************
586 * *
587 * functions to handle HTML specific data *
588 * *
589 ************************************************************************/
590
591/**
592 * htmlInitAutoClose:
593 *
594 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
595 * This is not reentrant. Call xmlInitParser() once before processing in
596 * case of use in multithreaded programs.
597 */
598void
599htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000600 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000601
602 if (htmlStartCloseIndexinitialized) return;
603
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000604 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
605 indx = 0;
606 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
607 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000608 while (htmlStartClose[i] != NULL) i++;
609 i++;
610 }
611 htmlStartCloseIndexinitialized = 1;
612}
613
614/**
615 * htmlTagLookup:
616 * @tag: The tag name in lowercase
617 *
618 * Lookup the HTML tag in the ElementTable
619 *
620 * Returns the related htmlElemDescPtr or NULL if not found.
621 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000622const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000623htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000624 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000625
626 for (i = 0; i < (sizeof(html40ElementTable) /
627 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000628 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000629 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000630 }
631 return(NULL);
632}
633
634/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000635 * htmlGetEndPriority:
636 * @name: The name of the element to look up the priority for.
637 *
638 * Return value: The "endtag" priority.
639 **/
640static int
641htmlGetEndPriority (const xmlChar *name) {
642 int i = 0;
643
644 while ((htmlEndPriority[i].name != NULL) &&
645 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
646 i++;
647
648 return(htmlEndPriority[i].priority);
649}
650
651/**
Owen Taylor3473f882001-02-23 17:55:21 +0000652 * htmlCheckAutoClose:
653 * @newtag: The new tag name
654 * @oldtag: The old tag name
655 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000656 * Checks whether the new tag is one of the registered valid tags for
657 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000658 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
659 *
660 * Returns 0 if no, 1 if yes.
661 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000662static int
Owen Taylor3473f882001-02-23 17:55:21 +0000663htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000664 int i, indx;
665 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000666
667 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
668
669 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000670 for (indx = 0; indx < 100;indx++) {
671 closed = htmlStartCloseIndex[indx];
672 if (closed == NULL) return(0);
673 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000674 }
675
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000676 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000677 i++;
678 while (htmlStartClose[i] != NULL) {
679 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
680 return(1);
681 }
682 i++;
683 }
684 return(0);
685}
686
687/**
688 * htmlAutoCloseOnClose:
689 * @ctxt: an HTML parser context
690 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000691 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000692 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000693 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000694 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000695static void
Owen Taylor3473f882001-02-23 17:55:21 +0000696htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000697 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000698 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000699 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000700
701#ifdef DEBUG
702 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
703 for (i = 0;i < ctxt->nameNr;i++)
704 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
705#endif
706
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000707 priority = htmlGetEndPriority (newtag);
708
Owen Taylor3473f882001-02-23 17:55:21 +0000709 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000710
Owen Taylor3473f882001-02-23 17:55:21 +0000711 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000712 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000713 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000714 * or equal priority, so if we find an element with higher
715 * priority before we find an element with
716 * matching name, we just ignore this endtag
717 */
718 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000719 }
720 if (i < 0) return;
721
722 while (!xmlStrEqual(newtag, ctxt->name)) {
723 info = htmlTagLookup(ctxt->name);
724 if ((info == NULL) || (info->endTag == 1)) {
725#ifdef DEBUG
726 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
727#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000728 } else if (info->endTag == 3) {
729#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000730 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000731
Daniel Veillard56098d42001-04-24 12:51:09 +0000732#endif
733 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
734 ctxt->sax->error(ctxt->userData,
735 "Opening and ending tag mismatch: %s and %s\n",
736 newtag, ctxt->name);
737 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000738 }
739 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
740 ctxt->sax->endElement(ctxt->userData, ctxt->name);
741 oldname = htmlnamePop(ctxt);
742 if (oldname != NULL) {
743#ifdef DEBUG
744 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
745#endif
746 xmlFree(oldname);
747 }
748 }
749}
750
751/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000752 * htmlAutoCloseOnEnd:
753 * @ctxt: an HTML parser context
754 *
755 * Close all remaining tags at the end of the stream
756 */
757static void
758htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
759 xmlChar *oldname;
760 int i;
761
762 if (ctxt->nameNr == 0)
763 return;
764#ifdef DEBUG
765 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
766#endif
767
768 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
769#ifdef DEBUG
770 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
771#endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776#ifdef DEBUG
777 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
778#endif
779 xmlFree(oldname);
780 }
781 }
782}
783
784/**
Owen Taylor3473f882001-02-23 17:55:21 +0000785 * htmlAutoClose:
786 * @ctxt: an HTML parser context
787 * @newtag: The new tag name or NULL
788 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000789 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000790 * The list is kept in htmlStartClose array. This function is
791 * called when a new tag has been detected and generates the
792 * appropriates closes if possible/needed.
793 * If newtag is NULL this mean we are at the end of the resource
794 * and we should check
795 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000796static void
Owen Taylor3473f882001-02-23 17:55:21 +0000797htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
798 xmlChar *oldname;
799 while ((newtag != NULL) && (ctxt->name != NULL) &&
800 (htmlCheckAutoClose(newtag, ctxt->name))) {
801#ifdef DEBUG
802 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
803#endif
804 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
805 ctxt->sax->endElement(ctxt->userData, ctxt->name);
806 oldname = htmlnamePop(ctxt);
807 if (oldname != NULL) {
808#ifdef DEBUG
809 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
810#endif
811 xmlFree(oldname);
812 }
813 }
814 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000815 htmlAutoCloseOnEnd(ctxt);
816 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000817 }
818 while ((newtag == NULL) && (ctxt->name != NULL) &&
819 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
820 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
821 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
822#ifdef DEBUG
823 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
824#endif
825 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
826 ctxt->sax->endElement(ctxt->userData, ctxt->name);
827 oldname = htmlnamePop(ctxt);
828 if (oldname != NULL) {
829#ifdef DEBUG
830 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
831#endif
832 xmlFree(oldname);
833 }
834 }
835
836}
837
838/**
839 * htmlAutoCloseTag:
840 * @doc: the HTML document
841 * @name: The tag name
842 * @elem: the HTML element
843 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000844 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000845 * The list is kept in htmlStartClose array. This function checks
846 * if the element or one of it's children would autoclose the
847 * given tag.
848 *
849 * Returns 1 if autoclose, 0 otherwise
850 */
851int
852htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
853 htmlNodePtr child;
854
855 if (elem == NULL) return(1);
856 if (xmlStrEqual(name, elem->name)) return(0);
857 if (htmlCheckAutoClose(elem->name, name)) return(1);
858 child = elem->children;
859 while (child != NULL) {
860 if (htmlAutoCloseTag(doc, name, child)) return(1);
861 child = child->next;
862 }
863 return(0);
864}
865
866/**
867 * htmlIsAutoClosed:
868 * @doc: the HTML document
869 * @elem: the HTML element
870 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000871 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000872 * The list is kept in htmlStartClose array. This function checks
873 * if a tag is autoclosed by one of it's child
874 *
875 * Returns 1 if autoclosed, 0 otherwise
876 */
877int
878htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
879 htmlNodePtr child;
880
881 if (elem == NULL) return(1);
882 child = elem->children;
883 while (child != NULL) {
884 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
885 child = child->next;
886 }
887 return(0);
888}
889
890/**
891 * htmlCheckImplied:
892 * @ctxt: an HTML parser context
893 * @newtag: The new tag name
894 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000895 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000896 * called when a new tag has been detected and generates the
897 * appropriates implicit tags if missing
898 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000899static void
Owen Taylor3473f882001-02-23 17:55:21 +0000900htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
901 if (!htmlOmittedDefaultValue)
902 return;
903 if (xmlStrEqual(newtag, BAD_CAST"html"))
904 return;
905 if (ctxt->nameNr <= 0) {
906#ifdef DEBUG
907 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
908#endif
909 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
910 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
911 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
912 }
913 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
914 return;
915 if ((ctxt->nameNr <= 1) &&
916 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
917 (xmlStrEqual(newtag, BAD_CAST"style")) ||
918 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
919 (xmlStrEqual(newtag, BAD_CAST"link")) ||
920 (xmlStrEqual(newtag, BAD_CAST"title")) ||
921 (xmlStrEqual(newtag, BAD_CAST"base")))) {
922 /*
923 * dropped OBJECT ... i you put it first BODY will be
924 * assumed !
925 */
926#ifdef DEBUG
927 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
928#endif
929 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
930 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
931 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
932 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
933 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
934 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
935 int i;
936 for (i = 0;i < ctxt->nameNr;i++) {
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
938 return;
939 }
940 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
941 return;
942 }
943 }
944
945#ifdef DEBUG
946 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
947#endif
948 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
949 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
950 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
951 }
952}
953
954/**
955 * htmlCheckParagraph
956 * @ctxt: an HTML parser context
957 *
958 * Check whether a p element need to be implied before inserting
959 * characters in the current element.
960 *
961 * Returns 1 if a paragraph has been inserted, 0 if not and -1
962 * in case of error.
963 */
964
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000965static int
Owen Taylor3473f882001-02-23 17:55:21 +0000966htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
967 const xmlChar *tag;
968 int i;
969
970 if (ctxt == NULL)
971 return(-1);
972 tag = ctxt->name;
973 if (tag == NULL) {
974 htmlAutoClose(ctxt, BAD_CAST"p");
975 htmlCheckImplied(ctxt, BAD_CAST"p");
976 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
977 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
978 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
979 return(1);
980 }
981 if (!htmlOmittedDefaultValue)
982 return(0);
983 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
984 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
985#ifdef DEBUG
986 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
987#endif
988 htmlAutoClose(ctxt, BAD_CAST"p");
989 htmlCheckImplied(ctxt, BAD_CAST"p");
990 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
991 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
992 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
993 return(1);
994 }
995 }
996 return(0);
997}
998
999/**
1000 * htmlIsScriptAttribute:
1001 * @name: an attribute name
1002 *
1003 * Check if an attribute is of content type Script
1004 *
1005 * Returns 1 is the attribute is a script 0 otherwise
1006 */
1007int
1008htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001009 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001010
1011 if (name == NULL)
1012 return(0);
1013 /*
1014 * all script attributes start with 'on'
1015 */
1016 if ((name[0] != 'o') || (name[1] != 'n'))
1017 return(0);
1018 for (i = 0;
1019 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1020 i++) {
1021 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1022 return(1);
1023 }
1024 return(0);
1025}
1026
1027/************************************************************************
1028 * *
1029 * The list of HTML predefined entities *
1030 * *
1031 ************************************************************************/
1032
1033
/*
 * Table of the predefined HTML entities. Each entry gives the numeric
 * value of the character, the entity name and a textual description.
 *
 * The entries must be kept sorted by increasing value:
 * htmlEntityValueLookup() stops scanning as soon as it meets an entry
 * whose value is greater than the one searched for.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * The four basic markup entities (quot, amp, lt, gt), plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * Entities in the 160-255 range.
 * Whether replacing them is appropriate really depends on the
 * charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references.
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1316
1317/************************************************************************
1318 * *
1319 * Commodity functions to handle entities *
1320 * *
1321 ************************************************************************/
1322
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The result of xmlRealloc() goes through a temporary so that the
 * previous block is not lost when the reallocation fails. In that case
 * an error is reported, the previous block is released and the macro
 * executes return(NULL) in the ENCLOSING function.
 * It can therefore only be used in functions returning a pointer, and
 * it assumes the enclosing function is the sole owner of @buffer.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
                         buffer##_size * sizeof(xmlChar));		\
    if (growBuffer_tmp == NULL) {					\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1334
1335/**
1336 * htmlEntityLookup:
1337 * @name: the entity name
1338 *
1339 * Lookup the given entity in EntitiesTable
1340 *
1341 * TODO: the linear scan is really ugly, an hash table is really needed.
1342 *
1343 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1344 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001345const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001346htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001347 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001348
1349 for (i = 0;i < (sizeof(html40EntitiesTable)/
1350 sizeof(html40EntitiesTable[0]));i++) {
1351 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1352#ifdef DEBUG
1353 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1354#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001355 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001356 }
1357 }
1358 return(NULL);
1359}
1360
1361/**
1362 * htmlEntityValueLookup:
1363 * @value: the entity's unicode value
1364 *
1365 * Lookup the given entity in EntitiesTable
1366 *
1367 * TODO: the linear scan is really ugly, an hash table is really needed.
1368 *
1369 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1370 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001371const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001372htmlEntityValueLookup(unsigned int value) {
1373 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001374#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001375 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001376#endif
1377
1378 for (i = 0;i < (sizeof(html40EntitiesTable)/
1379 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001380 if (html40EntitiesTable[i].value >= value) {
1381 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001382 break;
1383#ifdef DEBUG
1384 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1385#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001386 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001387 }
1388#ifdef DEBUG
1389 if (lv > html40EntitiesTable[i].value) {
1390 xmlGenericError(xmlGenericErrorContext,
1391 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1392 lv, html40EntitiesTable[i].value);
1393 }
1394 lv = html40EntitiesTable[i].value;
1395#endif
1396 }
1397 return(NULL);
1398}
1399
1400/**
1401 * UTF8ToHtml:
1402 * @out: a pointer to an array of bytes to store the result
1403 * @outlen: the length of @out
1404 * @in: a pointer to an array of UTF-8 chars
1405 * @inlen: the length of @in
1406 *
1407 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1408 * plus HTML entities block of chars out.
1409 *
1410 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1411 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001412 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001413 * The value of @outlen after return is the number of octets consumed.
1414 */
1415int
1416UTF8ToHtml(unsigned char* out, int *outlen,
1417 const unsigned char* in, int *inlen) {
1418 const unsigned char* processed = in;
1419 const unsigned char* outend;
1420 const unsigned char* outstart = out;
1421 const unsigned char* instart = in;
1422 const unsigned char* inend;
1423 unsigned int c, d;
1424 int trailing;
1425
1426 if (in == NULL) {
1427 /*
1428 * initialization nothing to do
1429 */
1430 *outlen = 0;
1431 *inlen = 0;
1432 return(0);
1433 }
1434 inend = in + (*inlen);
1435 outend = out + (*outlen);
1436 while (in < inend) {
1437 d = *in++;
1438 if (d < 0x80) { c= d; trailing= 0; }
1439 else if (d < 0xC0) {
1440 /* trailing byte in leading position */
1441 *outlen = out - outstart;
1442 *inlen = processed - instart;
1443 return(-2);
1444 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1445 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1446 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1447 else {
1448 /* no chance for this in Ascii */
1449 *outlen = out - outstart;
1450 *inlen = processed - instart;
1451 return(-2);
1452 }
1453
1454 if (inend - in < trailing) {
1455 break;
1456 }
1457
1458 for ( ; trailing; trailing--) {
1459 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1460 break;
1461 c <<= 6;
1462 c |= d & 0x3F;
1463 }
1464
1465 /* assertion: c is a single UTF-4 value */
1466 if (c < 0x80) {
1467 if (out + 1 >= outend)
1468 break;
1469 *out++ = c;
1470 } else {
1471 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001472 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001473
1474 /*
1475 * Try to lookup a predefined HTML entity for it
1476 */
1477
1478 ent = htmlEntityValueLookup(c);
1479 if (ent == NULL) {
1480 /* no chance for this in Ascii */
1481 *outlen = out - outstart;
1482 *inlen = processed - instart;
1483 return(-2);
1484 }
1485 len = strlen(ent->name);
1486 if (out + 2 + len >= outend)
1487 break;
1488 *out++ = '&';
1489 memcpy(out, ent->name, len);
1490 out += len;
1491 *out++ = ';';
1492 }
1493 processed = in;
1494 }
1495 *outlen = out - outstart;
1496 *inlen = processed - instart;
1497 return(0);
1498}
1499
1500/**
1501 * htmlEncodeEntities:
1502 * @out: a pointer to an array of bytes to store the result
1503 * @outlen: the length of @out
1504 * @in: a pointer to an array of UTF-8 chars
1505 * @inlen: the length of @in
1506 * @quoteChar: the quote character to escape (' or ") or zero.
1507 *
1508 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1509 * plus HTML entities block of chars out.
1510 *
1511 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1512 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001513 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001514 * The value of @outlen after return is the number of octets consumed.
1515 */
1516int
1517htmlEncodeEntities(unsigned char* out, int *outlen,
1518 const unsigned char* in, int *inlen, int quoteChar) {
1519 const unsigned char* processed = in;
1520 const unsigned char* outend = out + (*outlen);
1521 const unsigned char* outstart = out;
1522 const unsigned char* instart = in;
1523 const unsigned char* inend = in + (*inlen);
1524 unsigned int c, d;
1525 int trailing;
1526
1527 while (in < inend) {
1528 d = *in++;
1529 if (d < 0x80) { c= d; trailing= 0; }
1530 else if (d < 0xC0) {
1531 /* trailing byte in leading position */
1532 *outlen = out - outstart;
1533 *inlen = processed - instart;
1534 return(-2);
1535 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1536 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1537 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1538 else {
1539 /* no chance for this in Ascii */
1540 *outlen = out - outstart;
1541 *inlen = processed - instart;
1542 return(-2);
1543 }
1544
1545 if (inend - in < trailing)
1546 break;
1547
1548 while (trailing--) {
1549 if (((d= *in++) & 0xC0) != 0x80) {
1550 *outlen = out - outstart;
1551 *inlen = processed - instart;
1552 return(-2);
1553 }
1554 c <<= 6;
1555 c |= d & 0x3F;
1556 }
1557
1558 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001559 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1560 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001561 if (out >= outend)
1562 break;
1563 *out++ = c;
1564 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001565 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001566 const char *cp;
1567 char nbuf[16];
1568 int len;
1569
1570 /*
1571 * Try to lookup a predefined HTML entity for it
1572 */
1573 ent = htmlEntityValueLookup(c);
1574 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001575 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001576 cp = nbuf;
1577 }
1578 else
1579 cp = ent->name;
1580 len = strlen(cp);
1581 if (out + 2 + len > outend)
1582 break;
1583 *out++ = '&';
1584 memcpy(out, cp, len);
1585 out += len;
1586 *out++ = ';';
1587 }
1588 processed = in;
1589 }
1590 *outlen = out - outstart;
1591 *inlen = processed - instart;
1592 return(0);
1593}
1594
1595/**
1596 * htmlDecodeEntities:
1597 * @ctxt: the parser context
1598 * @len: the len to decode (in bytes !), -1 for no size limit
1599 * @end: an end marker xmlChar, 0 if none
1600 * @end2: an end marker xmlChar, 0 if none
1601 * @end3: an end marker xmlChar, 0 if none
1602 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001603 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001604 *
1605 * DEPRECATED !!!!
1606 *
1607 * Returns A newly allocated string with the substitution done. The caller
1608 * must deallocate it !
1609 */
1610xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001611htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1612 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001613 static int deprecated = 0;
1614 if (!deprecated) {
1615 xmlGenericError(xmlGenericErrorContext,
1616 "htmlDecodeEntities() deprecated function reached\n");
1617 deprecated = 1;
1618 }
1619 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001620}
1621
1622/************************************************************************
1623 * *
1624 * Commodity functions to handle streams *
1625 * *
1626 ************************************************************************/
1627
1628/**
Owen Taylor3473f882001-02-23 17:55:21 +00001629 * htmlNewInputStream:
1630 * @ctxt: an HTML parser context
1631 *
1632 * Create a new input stream structure
1633 * Returns the new input stream or NULL
1634 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001635static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001636htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1637 htmlParserInputPtr input;
1638
1639 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1640 if (input == NULL) {
1641 ctxt->errNo = XML_ERR_NO_MEMORY;
1642 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1643 ctxt->sax->error(ctxt->userData,
1644 "malloc: couldn't allocate a new input stream\n");
1645 return(NULL);
1646 }
1647 memset(input, 0, sizeof(htmlParserInput));
1648 input->filename = NULL;
1649 input->directory = NULL;
1650 input->base = NULL;
1651 input->cur = NULL;
1652 input->buf = NULL;
1653 input->line = 1;
1654 input->col = 1;
1655 input->buf = NULL;
1656 input->free = NULL;
1657 input->version = NULL;
1658 input->consumed = 0;
1659 input->length = 0;
1660 return(input);
1661}
1662
1663
1664/************************************************************************
1665 * *
1666 * Commodity functions, cleanup needed ? *
1667 * *
1668 ************************************************************************/
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001669/*
1670 * all tags allowing pc data from the html 4.01 loose dtd
1671 * NOTE: it might be more apropriate to integrate this information
1672 * into the html40ElementTable array but I don't want to risk any
1673 * binary incomptibility
1674 */
1675static const char *allowPCData[] = {
1676 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
1677 "blockquote", "body", "button", "caption", "center", "cite", "code",
1678 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
1679 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
1680 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
1681 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
1682};
Owen Taylor3473f882001-02-23 17:55:21 +00001683
1684/**
1685 * areBlanks:
1686 * @ctxt: an HTML parser context
1687 * @str: a xmlChar *
1688 * @len: the size of @str
1689 *
1690 * Is this a sequence of blank chars that one can ignore ?
1691 *
1692 * Returns 1 if ignorable 0 otherwise.
1693 */
1694
1695static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001696 unsigned int i;
1697 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001698 xmlNodePtr lastChild;
1699
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001700 for (j = 0;j < len;j++)
1701 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001702
1703 if (CUR == 0) return(1);
1704 if (CUR != '<') return(0);
1705 if (ctxt->name == NULL)
1706 return(1);
1707 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1708 return(1);
1709 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1710 return(1);
1711 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1712 return(1);
1713 if (ctxt->node == NULL) return(0);
1714 lastChild = xmlGetLastChild(ctxt->node);
1715 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001716 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1717 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001718 /* keep ws in constructs like ...<b> </b>...
1719 for all tags "b" allowing PCDATA */
1720 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1721 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1722 return(0);
1723 }
1724 }
Owen Taylor3473f882001-02-23 17:55:21 +00001725 } else if (xmlNodeIsText(lastChild)) {
1726 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001727 } else {
1728 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1729 for all tags "p" allowing PCDATA */
1730 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1731 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1732 return(0);
1733 }
1734 }
Owen Taylor3473f882001-02-23 17:55:21 +00001735 }
1736 return(1);
1737}
1738
1739/**
Owen Taylor3473f882001-02-23 17:55:21 +00001740 * htmlNewDocNoDtD:
1741 * @URI: URI for the dtd, or NULL
1742 * @ExternalID: the external ID of the DTD, or NULL
1743 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001744 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1745 * are NULL
1746 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001747 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001748 */
1749htmlDocPtr
1750htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1751 xmlDocPtr cur;
1752
1753 /*
1754 * Allocate a new document and fill the fields.
1755 */
1756 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1757 if (cur == NULL) {
1758 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001759 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001760 return(NULL);
1761 }
1762 memset(cur, 0, sizeof(xmlDoc));
1763
1764 cur->type = XML_HTML_DOCUMENT_NODE;
1765 cur->version = NULL;
1766 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001767 cur->doc = cur;
1768 cur->name = NULL;
1769 cur->children = NULL;
1770 cur->extSubset = NULL;
1771 cur->oldNs = NULL;
1772 cur->encoding = NULL;
1773 cur->standalone = 1;
1774 cur->compression = 0;
1775 cur->ids = NULL;
1776 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001777 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001778 if ((ExternalID != NULL) ||
1779 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001780 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001781 return(cur);
1782}
1783
1784/**
1785 * htmlNewDoc:
1786 * @URI: URI for the dtd, or NULL
1787 * @ExternalID: the external ID of the DTD, or NULL
1788 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001789 * Creates a new HTML document
1790 *
Owen Taylor3473f882001-02-23 17:55:21 +00001791 * Returns a new document
1792 */
1793htmlDocPtr
1794htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1795 if ((URI == NULL) && (ExternalID == NULL))
1796 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001797 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1798 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001799
1800 return(htmlNewDocNoDtD(URI, ExternalID));
1801}
1802
1803
1804/************************************************************************
1805 * *
1806 * The parser itself *
1807 * Relates to http://www.w3.org/TR/html40 *
1808 * *
1809 ************************************************************************/
1810
1811/************************************************************************
1812 * *
1813 * The parser itself *
1814 * *
1815 ************************************************************************/
1816
1817/**
1818 * htmlParseHTMLName:
1819 * @ctxt: an HTML parser context
1820 *
1821 * parse an HTML tag or attribute name, note that we convert it to lowercase
1822 * since HTML names are not case-sensitive.
1823 *
1824 * Returns the Tag Name parsed or NULL
1825 */
1826
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001827static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001828htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1829 xmlChar *ret = NULL;
1830 int i = 0;
1831 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1832
1833 if (!IS_LETTER(CUR) && (CUR != '_') &&
1834 (CUR != ':')) return(NULL);
1835
1836 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1837 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1838 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1839 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1840 else loc[i] = CUR;
1841 i++;
1842
1843 NEXT;
1844 }
1845
1846 ret = xmlStrndup(loc, i);
1847
1848 return(ret);
1849}
1850
1851/**
1852 * htmlParseName:
1853 * @ctxt: an HTML parser context
1854 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001855 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001856 *
1857 * Returns the Name parsed or NULL
1858 */
1859
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001860static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001861htmlParseName(htmlParserCtxtPtr ctxt) {
1862 xmlChar buf[HTML_MAX_NAMELEN];
1863 int len = 0;
1864
1865 GROW;
1866 if (!IS_LETTER(CUR) && (CUR != '_')) {
1867 return(NULL);
1868 }
1869
1870 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1871 (CUR == '.') || (CUR == '-') ||
1872 (CUR == '_') || (CUR == ':') ||
1873 (IS_COMBINING(CUR)) ||
1874 (IS_EXTENDER(CUR))) {
1875 buf[len++] = CUR;
1876 NEXT;
1877 if (len >= HTML_MAX_NAMELEN) {
1878 xmlGenericError(xmlGenericErrorContext,
1879 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1880 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1881 (CUR == '.') || (CUR == '-') ||
1882 (CUR == '_') || (CUR == ':') ||
1883 (IS_COMBINING(CUR)) ||
1884 (IS_EXTENDER(CUR)))
1885 NEXT;
1886 break;
1887 }
1888 }
1889 return(xmlStrndup(buf, len));
1890}
1891
1892/**
1893 * htmlParseHTMLAttribute:
1894 * @ctxt: an HTML parser context
1895 * @stop: a char stop value
1896 *
1897 * parse an HTML attribute value till the stop (quote), if
1898 * stop is 0 then it stops at the first space
1899 *
1900 * Returns the attribute parsed or NULL
1901 */
1902
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001903static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001904htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1905 xmlChar *buffer = NULL;
1906 int buffer_size = 0;
1907 xmlChar *out = NULL;
1908 xmlChar *name = NULL;
1909
1910 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001911 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001912
1913 /*
1914 * allocate a translation buffer.
1915 */
1916 buffer_size = HTML_PARSER_BUFFER_SIZE;
1917 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1918 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00001919 xmlGenericError(xmlGenericErrorContext,
1920 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001921 return(NULL);
1922 }
1923 out = buffer;
1924
1925 /*
1926 * Ok loop until we reach one of the ending chars
1927 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00001928 while ((CUR != 0) && (CUR != stop)) {
1929 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001930 if ((stop == 0) && (IS_BLANK(CUR))) break;
1931 if (CUR == '&') {
1932 if (NXT(1) == '#') {
1933 unsigned int c;
1934 int bits;
1935
1936 c = htmlParseCharRef(ctxt);
1937 if (c < 0x80)
1938 { *out++ = c; bits= -6; }
1939 else if (c < 0x800)
1940 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1941 else if (c < 0x10000)
1942 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1943 else
1944 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1945
1946 for ( ; bits >= 0; bits-= 6) {
1947 *out++ = ((c >> bits) & 0x3F) | 0x80;
1948 }
1949 } else {
1950 ent = htmlParseEntityRef(ctxt, &name);
1951 if (name == NULL) {
1952 *out++ = '&';
1953 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001954 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001955
1956 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001957 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001958 }
1959 } else if (ent == NULL) {
1960 *out++ = '&';
1961 cur = name;
1962 while (*cur != 0) {
1963 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001964 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001965
1966 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001967 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001968 }
1969 *out++ = *cur++;
1970 }
1971 xmlFree(name);
1972 } else {
1973 unsigned int c;
1974 int bits;
1975
1976 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001977 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001978
1979 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001980 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001981 }
1982 c = (xmlChar)ent->value;
1983 if (c < 0x80)
1984 { *out++ = c; bits= -6; }
1985 else if (c < 0x800)
1986 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1987 else if (c < 0x10000)
1988 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1989 else
1990 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1991
1992 for ( ; bits >= 0; bits-= 6) {
1993 *out++ = ((c >> bits) & 0x3F) | 0x80;
1994 }
1995 xmlFree(name);
1996 }
1997 }
1998 } else {
1999 unsigned int c;
2000 int bits, l;
2001
2002 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002003 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002004
2005 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002006 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002007 }
2008 c = CUR_CHAR(l);
2009 if (c < 0x80)
2010 { *out++ = c; bits= -6; }
2011 else if (c < 0x800)
2012 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2013 else if (c < 0x10000)
2014 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2015 else
2016 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2017
2018 for ( ; bits >= 0; bits-= 6) {
2019 *out++ = ((c >> bits) & 0x3F) | 0x80;
2020 }
2021 NEXT;
2022 }
2023 }
2024 *out++ = 0;
2025 return(buffer);
2026}
2027
2028/**
Owen Taylor3473f882001-02-23 17:55:21 +00002029 * htmlParseEntityRef:
2030 * @ctxt: an HTML parser context
2031 * @str: location to store the entity name
2032 *
2033 * parse an HTML ENTITY references
2034 *
2035 * [68] EntityRef ::= '&' Name ';'
2036 *
2037 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2038 * if non-NULL *str will have to be freed by the caller.
2039 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002040const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002041htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2042 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002043 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002044 *str = NULL;
2045
2046 if (CUR == '&') {
2047 NEXT;
2048 name = htmlParseName(ctxt);
2049 if (name == NULL) {
2050 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2051 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2052 ctxt->wellFormed = 0;
2053 } else {
2054 GROW;
2055 if (CUR == ';') {
2056 *str = name;
2057
2058 /*
2059 * Lookup the entity in the table.
2060 */
2061 ent = htmlEntityLookup(name);
2062 if (ent != NULL) /* OK that's ugly !!! */
2063 NEXT;
2064 } else {
2065 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2066 ctxt->sax->error(ctxt->userData,
2067 "htmlParseEntityRef: expecting ';'\n");
2068 *str = name;
2069 }
2070 }
2071 }
2072 return(ent);
2073}
2074
2075/**
2076 * htmlParseAttValue:
2077 * @ctxt: an HTML parser context
2078 *
2079 * parse a value for an attribute
2080 * Note: the parser won't do substitution of entities here, this
2081 * will be handled later in xmlStringGetNodeList, unless it was
2082 * asked for ctxt->replaceEntities != 0
2083 *
2084 * Returns the AttValue parsed or NULL.
2085 */
2086
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002087static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002088htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2089 xmlChar *ret = NULL;
2090
2091 if (CUR == '"') {
2092 NEXT;
2093 ret = htmlParseHTMLAttribute(ctxt, '"');
2094 if (CUR != '"') {
2095 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2096 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2097 ctxt->wellFormed = 0;
2098 } else
2099 NEXT;
2100 } else if (CUR == '\'') {
2101 NEXT;
2102 ret = htmlParseHTMLAttribute(ctxt, '\'');
2103 if (CUR != '\'') {
2104 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2105 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2106 ctxt->wellFormed = 0;
2107 } else
2108 NEXT;
2109 } else {
2110 /*
2111 * That's an HTMLism, the attribute value may not be quoted
2112 */
2113 ret = htmlParseHTMLAttribute(ctxt, 0);
2114 if (ret == NULL) {
2115 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2116 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2117 ctxt->wellFormed = 0;
2118 }
2119 }
2120 return(ret);
2121}
2122
2123/**
2124 * htmlParseSystemLiteral:
2125 * @ctxt: an HTML parser context
2126 *
2127 * parse an HTML Literal
2128 *
2129 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2130 *
2131 * Returns the SystemLiteral parsed or NULL
2132 */
2133
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002134static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002135htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2136 const xmlChar *q;
2137 xmlChar *ret = NULL;
2138
2139 if (CUR == '"') {
2140 NEXT;
2141 q = CUR_PTR;
2142 while ((IS_CHAR(CUR)) && (CUR != '"'))
2143 NEXT;
2144 if (!IS_CHAR(CUR)) {
2145 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2146 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2147 ctxt->wellFormed = 0;
2148 } else {
2149 ret = xmlStrndup(q, CUR_PTR - q);
2150 NEXT;
2151 }
2152 } else if (CUR == '\'') {
2153 NEXT;
2154 q = CUR_PTR;
2155 while ((IS_CHAR(CUR)) && (CUR != '\''))
2156 NEXT;
2157 if (!IS_CHAR(CUR)) {
2158 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2159 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2160 ctxt->wellFormed = 0;
2161 } else {
2162 ret = xmlStrndup(q, CUR_PTR - q);
2163 NEXT;
2164 }
2165 } else {
2166 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2167 ctxt->sax->error(ctxt->userData,
2168 "SystemLiteral \" or ' expected\n");
2169 ctxt->wellFormed = 0;
2170 }
2171
2172 return(ret);
2173}
2174
2175/**
2176 * htmlParsePubidLiteral:
2177 * @ctxt: an HTML parser context
2178 *
2179 * parse an HTML public literal
2180 *
2181 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2182 *
2183 * Returns the PubidLiteral parsed or NULL.
2184 */
2185
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002186static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002187htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2188 const xmlChar *q;
2189 xmlChar *ret = NULL;
2190 /*
2191 * Name ::= (Letter | '_') (NameChar)*
2192 */
2193 if (CUR == '"') {
2194 NEXT;
2195 q = CUR_PTR;
2196 while (IS_PUBIDCHAR(CUR)) NEXT;
2197 if (CUR != '"') {
2198 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2199 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2200 ctxt->wellFormed = 0;
2201 } else {
2202 ret = xmlStrndup(q, CUR_PTR - q);
2203 NEXT;
2204 }
2205 } else if (CUR == '\'') {
2206 NEXT;
2207 q = CUR_PTR;
2208 while ((IS_LETTER(CUR)) && (CUR != '\''))
2209 NEXT;
2210 if (!IS_LETTER(CUR)) {
2211 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2212 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2213 ctxt->wellFormed = 0;
2214 } else {
2215 ret = xmlStrndup(q, CUR_PTR - q);
2216 NEXT;
2217 }
2218 } else {
2219 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2220 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2221 ctxt->wellFormed = 0;
2222 }
2223
2224 return(ret);
2225}
2226
2227/**
2228 * htmlParseScript:
2229 * @ctxt: an HTML parser context
2230 *
2231 * parse the content of an HTML SCRIPT or STYLE element
2232 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2233 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2234 * http://www.w3.org/TR/html4/types.html#type-script
2235 * http://www.w3.org/TR/html4/types.html#h-6.15
2236 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2237 *
2238 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2239 * element and the value of intrinsic event attributes. User agents must
2240 * not evaluate script data as HTML markup but instead must pass it on as
2241 * data to a script engine.
2242 * NOTES:
2243 * - The content is passed like CDATA
2244 * - the attributes for style and scripting "onXXX" are also described
2245 * as CDATA but SGML allows entities references in attributes so their
2246 * processing is identical as other attributes
2247 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002248static void
Owen Taylor3473f882001-02-23 17:55:21 +00002249htmlParseScript(htmlParserCtxtPtr ctxt) {
2250 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2251 int nbchar = 0;
2252 xmlChar cur;
2253
2254 SHRINK;
2255 cur = CUR;
2256 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002257 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2258 (NXT(3) == '-')) {
2259 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2260 if (ctxt->sax->cdataBlock!= NULL) {
2261 /*
2262 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2263 */
2264 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2265 }
2266 }
2267 nbchar = 0;
2268 htmlParseComment(ctxt);
2269 cur = CUR;
2270 continue;
2271 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002272 /*
2273 * One should break here, the specification is clear:
2274 * Authors should therefore escape "</" within the content.
2275 * Escape mechanisms are specific to each scripting or
2276 * style sheet language.
2277 */
2278 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2279 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2280 break; /* while */
2281 }
2282 buf[nbchar++] = cur;
2283 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2284 if (ctxt->sax->cdataBlock!= NULL) {
2285 /*
2286 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2287 */
2288 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2289 }
2290 nbchar = 0;
2291 }
2292 NEXT;
2293 cur = CUR;
2294 }
2295 if (!(IS_CHAR(cur))) {
2296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2297 ctxt->sax->error(ctxt->userData,
2298 "Invalid char in CDATA 0x%X\n", cur);
2299 ctxt->wellFormed = 0;
2300 NEXT;
2301 }
2302
2303 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2304 if (ctxt->sax->cdataBlock!= NULL) {
2305 /*
2306 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2307 */
2308 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2309 }
2310 }
2311}
2312
2313
2314/**
2315 * htmlParseCharData:
2316 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002317 *
2318 * parse a CharData section.
2319 * if we are within a CDATA section ']]>' marks an end of section.
2320 *
2321 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2322 */
2323
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002324static void
2325htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002326 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2327 int nbchar = 0;
2328 int cur, l;
2329
2330 SHRINK;
2331 cur = CUR_CHAR(l);
2332 while (((cur != '<') || (ctxt->token == '<')) &&
2333 ((cur != '&') || (ctxt->token == '&')) &&
2334 (IS_CHAR(cur))) {
2335 COPY_BUF(l,buf,nbchar,cur);
2336 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2337 /*
2338 * Ok the segment is to be consumed as chars.
2339 */
2340 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2341 if (areBlanks(ctxt, buf, nbchar)) {
2342 if (ctxt->sax->ignorableWhitespace != NULL)
2343 ctxt->sax->ignorableWhitespace(ctxt->userData,
2344 buf, nbchar);
2345 } else {
2346 htmlCheckParagraph(ctxt);
2347 if (ctxt->sax->characters != NULL)
2348 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2349 }
2350 }
2351 nbchar = 0;
2352 }
2353 NEXTL(l);
2354 cur = CUR_CHAR(l);
2355 }
2356 if (nbchar != 0) {
2357 /*
2358 * Ok the segment is to be consumed as chars.
2359 */
2360 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2361 if (areBlanks(ctxt, buf, nbchar)) {
2362 if (ctxt->sax->ignorableWhitespace != NULL)
2363 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2364 } else {
2365 htmlCheckParagraph(ctxt);
2366 if (ctxt->sax->characters != NULL)
2367 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2368 }
2369 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002370 } else {
2371 /*
2372 * Loop detection
2373 */
2374 if (cur == 0)
2375 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002376 }
2377}
2378
2379/**
2380 * htmlParseExternalID:
2381 * @ctxt: an HTML parser context
2382 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002383 *
2384 * Parse an External ID or a Public ID
2385 *
Owen Taylor3473f882001-02-23 17:55:21 +00002386 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2387 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2388 *
2389 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2390 *
2391 * Returns the function returns SystemLiteral and in the second
2392 * case publicID receives PubidLiteral, is strict is off
2393 * it is possible to return NULL and have publicID set.
2394 */
2395
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002396static xmlChar *
2397htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002398 xmlChar *URI = NULL;
2399
2400 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2401 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2402 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2403 SKIP(6);
2404 if (!IS_BLANK(CUR)) {
2405 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2406 ctxt->sax->error(ctxt->userData,
2407 "Space required after 'SYSTEM'\n");
2408 ctxt->wellFormed = 0;
2409 }
2410 SKIP_BLANKS;
2411 URI = htmlParseSystemLiteral(ctxt);
2412 if (URI == NULL) {
2413 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2414 ctxt->sax->error(ctxt->userData,
2415 "htmlParseExternalID: SYSTEM, no URI\n");
2416 ctxt->wellFormed = 0;
2417 }
2418 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2419 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2420 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2421 SKIP(6);
2422 if (!IS_BLANK(CUR)) {
2423 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2424 ctxt->sax->error(ctxt->userData,
2425 "Space required after 'PUBLIC'\n");
2426 ctxt->wellFormed = 0;
2427 }
2428 SKIP_BLANKS;
2429 *publicID = htmlParsePubidLiteral(ctxt);
2430 if (*publicID == NULL) {
2431 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2432 ctxt->sax->error(ctxt->userData,
2433 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2434 ctxt->wellFormed = 0;
2435 }
2436 SKIP_BLANKS;
2437 if ((CUR == '"') || (CUR == '\'')) {
2438 URI = htmlParseSystemLiteral(ctxt);
2439 }
2440 }
2441 return(URI);
2442}
2443
2444/**
2445 * htmlParseComment:
2446 * @ctxt: an HTML parser context
2447 *
2448 * Parse an XML (SGML) comment <!-- .... -->
2449 *
2450 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2451 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002452static void
Owen Taylor3473f882001-02-23 17:55:21 +00002453htmlParseComment(htmlParserCtxtPtr ctxt) {
2454 xmlChar *buf = NULL;
2455 int len;
2456 int size = HTML_PARSER_BUFFER_SIZE;
2457 int q, ql;
2458 int r, rl;
2459 int cur, l;
2460 xmlParserInputState state;
2461
2462 /*
2463 * Check that there is a comment right here.
2464 */
2465 if ((RAW != '<') || (NXT(1) != '!') ||
2466 (NXT(2) != '-') || (NXT(3) != '-')) return;
2467
2468 state = ctxt->instate;
2469 ctxt->instate = XML_PARSER_COMMENT;
2470 SHRINK;
2471 SKIP(4);
2472 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2473 if (buf == NULL) {
2474 xmlGenericError(xmlGenericErrorContext,
2475 "malloc of %d byte failed\n", size);
2476 ctxt->instate = state;
2477 return;
2478 }
2479 q = CUR_CHAR(ql);
2480 NEXTL(ql);
2481 r = CUR_CHAR(rl);
2482 NEXTL(rl);
2483 cur = CUR_CHAR(l);
2484 len = 0;
2485 while (IS_CHAR(cur) &&
2486 ((cur != '>') ||
2487 (r != '-') || (q != '-'))) {
2488 if (len + 5 >= size) {
2489 size *= 2;
2490 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2491 if (buf == NULL) {
2492 xmlGenericError(xmlGenericErrorContext,
2493 "realloc of %d byte failed\n", size);
2494 ctxt->instate = state;
2495 return;
2496 }
2497 }
2498 COPY_BUF(ql,buf,len,q);
2499 q = r;
2500 ql = rl;
2501 r = cur;
2502 rl = l;
2503 NEXTL(l);
2504 cur = CUR_CHAR(l);
2505 if (cur == 0) {
2506 SHRINK;
2507 GROW;
2508 cur = CUR_CHAR(l);
2509 }
2510 }
2511 buf[len] = 0;
2512 if (!IS_CHAR(cur)) {
2513 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2514 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2515 ctxt->sax->error(ctxt->userData,
2516 "Comment not terminated \n<!--%.50s\n", buf);
2517 ctxt->wellFormed = 0;
2518 xmlFree(buf);
2519 } else {
2520 NEXT;
2521 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2522 (!ctxt->disableSAX))
2523 ctxt->sax->comment(ctxt->userData, buf);
2524 xmlFree(buf);
2525 }
2526 ctxt->instate = state;
2527}
2528
2529/**
2530 * htmlParseCharRef:
2531 * @ctxt: an HTML parser context
2532 *
2533 * parse Reference declarations
2534 *
2535 * [66] CharRef ::= '&#' [0-9]+ ';' |
2536 * '&#x' [0-9a-fA-F]+ ';'
2537 *
2538 * Returns the value parsed (as an int)
2539 */
2540int
2541htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2542 int val = 0;
2543
2544 if ((CUR == '&') && (NXT(1) == '#') &&
2545 (NXT(2) == 'x')) {
2546 SKIP(3);
2547 while (CUR != ';') {
2548 if ((CUR >= '0') && (CUR <= '9'))
2549 val = val * 16 + (CUR - '0');
2550 else if ((CUR >= 'a') && (CUR <= 'f'))
2551 val = val * 16 + (CUR - 'a') + 10;
2552 else if ((CUR >= 'A') && (CUR <= 'F'))
2553 val = val * 16 + (CUR - 'A') + 10;
2554 else {
2555 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2556 ctxt->sax->error(ctxt->userData,
2557 "htmlParseCharRef: invalid hexadecimal value\n");
2558 ctxt->wellFormed = 0;
2559 return(0);
2560 }
2561 NEXT;
2562 }
2563 if (CUR == ';')
2564 NEXT;
2565 } else if ((CUR == '&') && (NXT(1) == '#')) {
2566 SKIP(2);
2567 while (CUR != ';') {
2568 if ((CUR >= '0') && (CUR <= '9'))
2569 val = val * 10 + (CUR - '0');
2570 else {
2571 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2572 ctxt->sax->error(ctxt->userData,
2573 "htmlParseCharRef: invalid decimal value\n");
2574 ctxt->wellFormed = 0;
2575 return(0);
2576 }
2577 NEXT;
2578 }
2579 if (CUR == ';')
2580 NEXT;
2581 } else {
2582 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2583 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2584 ctxt->wellFormed = 0;
2585 }
2586 /*
2587 * Check the value IS_CHAR ...
2588 */
2589 if (IS_CHAR(val)) {
2590 return(val);
2591 } else {
2592 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2593 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2594 val);
2595 ctxt->wellFormed = 0;
2596 }
2597 return(0);
2598}
2599
2600
2601/**
2602 * htmlParseDocTypeDecl :
2603 * @ctxt: an HTML parser context
2604 *
2605 * parse a DOCTYPE declaration
2606 *
2607 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2608 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2609 */
2610
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002611static void
Owen Taylor3473f882001-02-23 17:55:21 +00002612htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2613 xmlChar *name;
2614 xmlChar *ExternalID = NULL;
2615 xmlChar *URI = NULL;
2616
2617 /*
2618 * We know that '<!DOCTYPE' has been detected.
2619 */
2620 SKIP(9);
2621
2622 SKIP_BLANKS;
2623
2624 /*
2625 * Parse the DOCTYPE name.
2626 */
2627 name = htmlParseName(ctxt);
2628 if (name == NULL) {
2629 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2630 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2631 ctxt->wellFormed = 0;
2632 }
2633 /*
2634 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2635 */
2636
2637 SKIP_BLANKS;
2638
2639 /*
2640 * Check for SystemID and ExternalID
2641 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002642 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002643 SKIP_BLANKS;
2644
2645 /*
2646 * We should be at the end of the DOCTYPE declaration.
2647 */
2648 if (CUR != '>') {
2649 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002650 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002651 ctxt->wellFormed = 0;
2652 /* We shouldn't try to resynchronize ... */
2653 }
2654 NEXT;
2655
2656 /*
2657 * Create or update the document accordingly to the DOCTYPE
2658 */
2659 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2660 (!ctxt->disableSAX))
2661 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2662
2663 /*
2664 * Cleanup, since we don't use all those identifiers
2665 */
2666 if (URI != NULL) xmlFree(URI);
2667 if (ExternalID != NULL) xmlFree(ExternalID);
2668 if (name != NULL) xmlFree(name);
2669}
2670
2671/**
2672 * htmlParseAttribute:
2673 * @ctxt: an HTML parser context
2674 * @value: a xmlChar ** used to store the value of the attribute
2675 *
2676 * parse an attribute
2677 *
2678 * [41] Attribute ::= Name Eq AttValue
2679 *
2680 * [25] Eq ::= S? '=' S?
2681 *
2682 * With namespace:
2683 *
2684 * [NS 11] Attribute ::= QName Eq AttValue
2685 *
2686 * Also the case QName == xmlns:??? is handled independently as a namespace
2687 * definition.
2688 *
2689 * Returns the attribute name, and the value in *value.
2690 */
2691
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002692static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002693htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2694 xmlChar *name, *val = NULL;
2695
2696 *value = NULL;
2697 name = htmlParseHTMLName(ctxt);
2698 if (name == NULL) {
2699 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2700 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2701 ctxt->wellFormed = 0;
2702 return(NULL);
2703 }
2704
2705 /*
2706 * read the value
2707 */
2708 SKIP_BLANKS;
2709 if (CUR == '=') {
2710 NEXT;
2711 SKIP_BLANKS;
2712 val = htmlParseAttValue(ctxt);
2713 /******
2714 } else {
2715 * TODO : some attribute must have values, some may not
2716 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2717 ctxt->sax->warning(ctxt->userData,
2718 "No value for attribute %s\n", name); */
2719 }
2720
2721 *value = val;
2722 return(name);
2723}
2724
2725/**
2726 * htmlCheckEncoding:
2727 * @ctxt: an HTML parser context
2728 * @attvalue: the attribute value
2729 *
2730 * Checks an http-equiv attribute from a Meta tag to detect
2731 * the encoding
2732 * If a new encoding is detected the parser is switched to decode
2733 * it and pass UTF8
2734 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002735static void
Owen Taylor3473f882001-02-23 17:55:21 +00002736htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2737 const xmlChar *encoding;
2738
2739 if ((ctxt == NULL) || (attvalue == NULL))
2740 return;
2741
2742 /* do not change encoding */
2743 if (ctxt->input->encoding != NULL)
2744 return;
2745
2746 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2747 if (encoding != NULL) {
2748 encoding += 8;
2749 } else {
2750 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2751 if (encoding != NULL)
2752 encoding += 9;
2753 }
2754 if (encoding != NULL) {
2755 xmlCharEncoding enc;
2756 xmlCharEncodingHandlerPtr handler;
2757
2758 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2759
2760 if (ctxt->input->encoding != NULL)
2761 xmlFree((xmlChar *) ctxt->input->encoding);
2762 ctxt->input->encoding = xmlStrdup(encoding);
2763
2764 enc = xmlParseCharEncoding((const char *) encoding);
2765 /*
2766 * registered set of known encodings
2767 */
2768 if (enc != XML_CHAR_ENCODING_ERROR) {
2769 xmlSwitchEncoding(ctxt, enc);
2770 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2771 } else {
2772 /*
2773 * fallback for unknown encodings
2774 */
2775 handler = xmlFindCharEncodingHandler((const char *) encoding);
2776 if (handler != NULL) {
2777 xmlSwitchToEncoding(ctxt, handler);
2778 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2779 } else {
2780 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2781 }
2782 }
2783
2784 if ((ctxt->input->buf != NULL) &&
2785 (ctxt->input->buf->encoder != NULL) &&
2786 (ctxt->input->buf->raw != NULL) &&
2787 (ctxt->input->buf->buffer != NULL)) {
2788 int nbchars;
2789 int processed;
2790
2791 /*
2792 * convert as much as possible to the parser reading buffer.
2793 */
2794 processed = ctxt->input->cur - ctxt->input->base;
2795 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2796 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2797 ctxt->input->buf->buffer,
2798 ctxt->input->buf->raw);
2799 if (nbchars < 0) {
2800 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2801 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2802 ctxt->sax->error(ctxt->userData,
2803 "htmlCheckEncoding: encoder error\n");
2804 }
2805 ctxt->input->base =
2806 ctxt->input->cur = ctxt->input->buf->buffer->content;
2807 }
2808 }
2809}
2810
2811/**
2812 * htmlCheckMeta:
2813 * @ctxt: an HTML parser context
2814 * @atts: the attributes values
2815 *
2816 * Checks an attributes from a Meta tag
2817 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002818static void
Owen Taylor3473f882001-02-23 17:55:21 +00002819htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2820 int i;
2821 const xmlChar *att, *value;
2822 int http = 0;
2823 const xmlChar *content = NULL;
2824
2825 if ((ctxt == NULL) || (atts == NULL))
2826 return;
2827
2828 i = 0;
2829 att = atts[i++];
2830 while (att != NULL) {
2831 value = atts[i++];
2832 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2833 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2834 http = 1;
2835 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2836 content = value;
2837 att = atts[i++];
2838 }
2839 if ((http) && (content != NULL))
2840 htmlCheckEncoding(ctxt, content);
2841
2842}
2843
2844/**
2845 * htmlParseStartTag:
2846 * @ctxt: an HTML parser context
2847 *
2848 * parse a start of tag either for rule element or
2849 * EmptyElement. In both case we don't parse the tag closing chars.
2850 *
2851 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2852 *
2853 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2854 *
2855 * With namespace:
2856 *
2857 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2858 *
2859 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2860 *
2861 */
2862
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002863static void
Owen Taylor3473f882001-02-23 17:55:21 +00002864htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2865 xmlChar *name;
2866 xmlChar *attname;
2867 xmlChar *attvalue;
2868 const xmlChar **atts = NULL;
2869 int nbatts = 0;
2870 int maxatts = 0;
2871 int meta = 0;
2872 int i;
2873
2874 if (CUR != '<') return;
2875 NEXT;
2876
2877 GROW;
2878 name = htmlParseHTMLName(ctxt);
2879 if (name == NULL) {
2880 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2881 ctxt->sax->error(ctxt->userData,
2882 "htmlParseStartTag: invalid element name\n");
2883 ctxt->wellFormed = 0;
2884 /* Dump the bogus tag like browsers do */
2885 while ((IS_CHAR(CUR)) && (CUR != '>'))
2886 NEXT;
2887 return;
2888 }
2889 if (xmlStrEqual(name, BAD_CAST"meta"))
2890 meta = 1;
2891
2892 /*
2893 * Check for auto-closure of HTML elements.
2894 */
2895 htmlAutoClose(ctxt, name);
2896
2897 /*
2898 * Check for implied HTML elements.
2899 */
2900 htmlCheckImplied(ctxt, name);
2901
2902 /*
2903 * Avoid html at any level > 0, head at any level != 1
2904 * or any attempt to recurse body
2905 */
2906 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2907 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2908 ctxt->sax->error(ctxt->userData,
2909 "htmlParseStartTag: misplaced <html> tag\n");
2910 ctxt->wellFormed = 0;
2911 xmlFree(name);
2912 return;
2913 }
2914 if ((ctxt->nameNr != 1) &&
2915 (xmlStrEqual(name, BAD_CAST"head"))) {
2916 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2917 ctxt->sax->error(ctxt->userData,
2918 "htmlParseStartTag: misplaced <head> tag\n");
2919 ctxt->wellFormed = 0;
2920 xmlFree(name);
2921 return;
2922 }
2923 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002924 int indx;
2925 for (indx = 0;indx < ctxt->nameNr;indx++) {
2926 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002927 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2928 ctxt->sax->error(ctxt->userData,
2929 "htmlParseStartTag: misplaced <body> tag\n");
2930 ctxt->wellFormed = 0;
2931 xmlFree(name);
2932 return;
2933 }
2934 }
2935 }
2936
2937 /*
2938 * Now parse the attributes, it ends up with the ending
2939 *
2940 * (S Attribute)* S?
2941 */
2942 SKIP_BLANKS;
2943 while ((IS_CHAR(CUR)) &&
2944 (CUR != '>') &&
2945 ((CUR != '/') || (NXT(1) != '>'))) {
2946 long cons = ctxt->nbChars;
2947
2948 GROW;
2949 attname = htmlParseAttribute(ctxt, &attvalue);
2950 if (attname != NULL) {
2951
2952 /*
2953 * Well formedness requires at most one declaration of an attribute
2954 */
2955 for (i = 0; i < nbatts;i += 2) {
2956 if (xmlStrEqual(atts[i], attname)) {
2957 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2958 ctxt->sax->error(ctxt->userData,
2959 "Attribute %s redefined\n",
2960 attname);
2961 ctxt->wellFormed = 0;
2962 xmlFree(attname);
2963 if (attvalue != NULL)
2964 xmlFree(attvalue);
2965 goto failed;
2966 }
2967 }
2968
2969 /*
2970 * Add the pair to atts
2971 */
2972 if (atts == NULL) {
2973 maxatts = 10;
2974 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2975 if (atts == NULL) {
2976 xmlGenericError(xmlGenericErrorContext,
2977 "malloc of %ld byte failed\n",
2978 maxatts * (long)sizeof(xmlChar *));
2979 if (name != NULL) xmlFree(name);
2980 return;
2981 }
2982 } else if (nbatts + 4 > maxatts) {
2983 maxatts *= 2;
2984 atts = (const xmlChar **) xmlRealloc((void *) atts,
2985 maxatts * sizeof(xmlChar *));
2986 if (atts == NULL) {
2987 xmlGenericError(xmlGenericErrorContext,
2988 "realloc of %ld byte failed\n",
2989 maxatts * (long)sizeof(xmlChar *));
2990 if (name != NULL) xmlFree(name);
2991 return;
2992 }
2993 }
2994 atts[nbatts++] = attname;
2995 atts[nbatts++] = attvalue;
2996 atts[nbatts] = NULL;
2997 atts[nbatts + 1] = NULL;
2998 }
2999 else {
3000 /* Dump the bogus attribute string up to the next blank or
3001 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003002 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3003 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003004 NEXT;
3005 }
3006
3007failed:
3008 SKIP_BLANKS;
3009 if (cons == ctxt->nbChars) {
3010 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3011 ctxt->sax->error(ctxt->userData,
3012 "htmlParseStartTag: problem parsing attributes\n");
3013 ctxt->wellFormed = 0;
3014 break;
3015 }
3016 }
3017
3018 /*
3019 * Handle specific association to the META tag
3020 */
3021 if (meta)
3022 htmlCheckMeta(ctxt, atts);
3023
3024 /*
3025 * SAX: Start of Element !
3026 */
3027 htmlnamePush(ctxt, xmlStrdup(name));
3028#ifdef DEBUG
3029 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3030#endif
3031 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3032 ctxt->sax->startElement(ctxt->userData, name, atts);
3033
3034 if (atts != NULL) {
3035 for (i = 0;i < nbatts;i++) {
3036 if (atts[i] != NULL)
3037 xmlFree((xmlChar *) atts[i]);
3038 }
3039 xmlFree((void *) atts);
3040 }
3041 if (name != NULL) xmlFree(name);
3042}
3043
3044/**
3045 * htmlParseEndTag:
3046 * @ctxt: an HTML parser context
3047 *
3048 * parse an end of tag
3049 *
3050 * [42] ETag ::= '</' Name S? '>'
3051 *
3052 * With namespace
3053 *
3054 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003055 *
3056 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003057 */
3058
Daniel Veillardf420ac52001-07-04 16:04:09 +00003059static int
Owen Taylor3473f882001-02-23 17:55:21 +00003060htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3061 xmlChar *name;
3062 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003063 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003064
3065 if ((CUR != '<') || (NXT(1) != '/')) {
3066 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3067 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3068 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003069 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003070 }
3071 SKIP(2);
3072
3073 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003074 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003075
3076 /*
3077 * We should definitely be at the ending "S? '>'" part
3078 */
3079 SKIP_BLANKS;
3080 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3081 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3082 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3083 ctxt->wellFormed = 0;
3084 } else
3085 NEXT;
3086
3087 /*
3088 * If the name read is not one of the element in the parsing stack
3089 * then return, it's just an error.
3090 */
3091 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3092 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3093 }
3094 if (i < 0) {
3095 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3096 ctxt->sax->error(ctxt->userData,
3097 "Unexpected end tag : %s\n", name);
3098 xmlFree(name);
3099 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003100 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003101 }
3102
3103
3104 /*
3105 * Check for auto-closure of HTML elements.
3106 */
3107
3108 htmlAutoCloseOnClose(ctxt, name);
3109
3110 /*
3111 * Well formedness constraints, opening and closing must match.
3112 * With the exception that the autoclose may have popped stuff out
3113 * of the stack.
3114 */
3115 if (!xmlStrEqual(name, ctxt->name)) {
3116#ifdef DEBUG
3117 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3118#endif
3119 if ((ctxt->name != NULL) &&
3120 (!xmlStrEqual(ctxt->name, name))) {
3121 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3122 ctxt->sax->error(ctxt->userData,
3123 "Opening and ending tag mismatch: %s and %s\n",
3124 name, ctxt->name);
3125 ctxt->wellFormed = 0;
3126 }
3127 }
3128
3129 /*
3130 * SAX: End of Tag
3131 */
3132 oldname = ctxt->name;
3133 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3134 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3135 ctxt->sax->endElement(ctxt->userData, name);
3136 oldname = htmlnamePop(ctxt);
3137 if (oldname != NULL) {
3138#ifdef DEBUG
3139 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3140#endif
3141 xmlFree(oldname);
3142#ifdef DEBUG
3143 } else {
3144 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3145#endif
3146 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003147 ret = 1;
3148 } else {
3149 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003150 }
3151
3152 if (name != NULL)
3153 xmlFree(name);
3154
Daniel Veillardf420ac52001-07-04 16:04:09 +00003155 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003156}
3157
3158
3159/**
3160 * htmlParseReference:
3161 * @ctxt: an HTML parser context
3162 *
3163 * parse and handle entity references in content,
3164 * this will end-up in a call to character() since this is either a
3165 * CharRef, or a predefined entity.
3166 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003167static void
Owen Taylor3473f882001-02-23 17:55:21 +00003168htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003169 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003170 xmlChar out[6];
3171 xmlChar *name;
3172 if (CUR != '&') return;
3173
3174 if (NXT(1) == '#') {
3175 unsigned int c;
3176 int bits, i = 0;
3177
3178 c = htmlParseCharRef(ctxt);
3179 if (c == 0)
3180 return;
3181
3182 if (c < 0x80) { out[i++]= c; bits= -6; }
3183 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3184 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3185 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3186
3187 for ( ; bits >= 0; bits-= 6) {
3188 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3189 }
3190 out[i] = 0;
3191
3192 htmlCheckParagraph(ctxt);
3193 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3194 ctxt->sax->characters(ctxt->userData, out, i);
3195 } else {
3196 ent = htmlParseEntityRef(ctxt, &name);
3197 if (name == NULL) {
3198 htmlCheckParagraph(ctxt);
3199 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3200 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3201 return;
3202 }
3203 if ((ent == NULL) || (ent->value <= 0)) {
3204 htmlCheckParagraph(ctxt);
3205 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3206 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3207 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3208 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3209 }
3210 } else {
3211 unsigned int c;
3212 int bits, i = 0;
3213
3214 c = ent->value;
3215 if (c < 0x80)
3216 { out[i++]= c; bits= -6; }
3217 else if (c < 0x800)
3218 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3219 else if (c < 0x10000)
3220 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3221 else
3222 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3223
3224 for ( ; bits >= 0; bits-= 6) {
3225 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3226 }
3227 out[i] = 0;
3228
3229 htmlCheckParagraph(ctxt);
3230 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3231 ctxt->sax->characters(ctxt->userData, out, i);
3232 }
3233 xmlFree(name);
3234 }
3235}
3236
3237/**
3238 * htmlParseContent:
3239 * @ctxt: an HTML parser context
3240 * @name: the node name
3241 *
3242 * Parse a content: comment, sub-element, reference or text.
3243 *
3244 */
3245
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003246static void
Owen Taylor3473f882001-02-23 17:55:21 +00003247htmlParseContent(htmlParserCtxtPtr ctxt) {
3248 xmlChar *currentNode;
3249 int depth;
3250
3251 currentNode = xmlStrdup(ctxt->name);
3252 depth = ctxt->nameNr;
3253 while (1) {
3254 long cons = ctxt->nbChars;
3255
3256 GROW;
3257 /*
3258 * Our tag or one of it's parent or children is ending.
3259 */
3260 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003261 if (htmlParseEndTag(ctxt) &&
3262 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3263 if (currentNode != NULL)
3264 xmlFree(currentNode);
3265 return;
3266 }
3267 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003268 }
3269
3270 /*
3271 * Has this node been popped out during parsing of
3272 * the next element
3273 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003274 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3275 (!xmlStrEqual(currentNode, ctxt->name)))
3276 {
Owen Taylor3473f882001-02-23 17:55:21 +00003277 if (currentNode != NULL) xmlFree(currentNode);
3278 return;
3279 }
3280
Daniel Veillardf9533d12001-03-03 10:04:57 +00003281 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3282 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003283 /*
3284 * Handle SCRIPT/STYLE separately
3285 */
3286 htmlParseScript(ctxt);
3287 } else {
3288 /*
3289 * Sometimes DOCTYPE arrives in the middle of the document
3290 */
3291 if ((CUR == '<') && (NXT(1) == '!') &&
3292 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3293 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3294 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3295 (UPP(8) == 'E')) {
3296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3297 ctxt->sax->error(ctxt->userData,
3298 "Misplaced DOCTYPE declaration\n");
3299 ctxt->wellFormed = 0;
3300 htmlParseDocTypeDecl(ctxt);
3301 }
3302
3303 /*
3304 * First case : a comment
3305 */
3306 if ((CUR == '<') && (NXT(1) == '!') &&
3307 (NXT(2) == '-') && (NXT(3) == '-')) {
3308 htmlParseComment(ctxt);
3309 }
3310
3311 /*
3312 * Second case : a sub-element.
3313 */
3314 else if (CUR == '<') {
3315 htmlParseElement(ctxt);
3316 }
3317
3318 /*
3319 * Third case : a reference. If if has not been resolved,
3320 * parsing returns it's Name, create the node
3321 */
3322 else if (CUR == '&') {
3323 htmlParseReference(ctxt);
3324 }
3325
3326 /*
3327 * Fourth : end of the resource
3328 */
3329 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003330 htmlAutoCloseOnEnd(ctxt);
3331 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003332 }
3333
3334 /*
3335 * Last case, text. Note that References are handled directly.
3336 */
3337 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003338 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003339 }
3340
3341 if (cons == ctxt->nbChars) {
3342 if (ctxt->node != NULL) {
3343 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3344 ctxt->sax->error(ctxt->userData,
3345 "detected an error in element content\n");
3346 ctxt->wellFormed = 0;
3347 }
3348 break;
3349 }
3350 }
3351 GROW;
3352 }
3353 if (currentNode != NULL) xmlFree(currentNode);
3354}
3355
3356/**
3357 * htmlParseElement:
3358 * @ctxt: an HTML parser context
3359 *
3360 * parse an HTML element, this is highly recursive
3361 *
3362 * [39] element ::= EmptyElemTag | STag content ETag
3363 *
3364 * [41] Attribute ::= Name Eq AttValue
3365 */
3366
3367void
3368htmlParseElement(htmlParserCtxtPtr ctxt) {
3369 xmlChar *name;
3370 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003371 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003372 htmlParserNodeInfo node_info;
3373 xmlChar *oldname;
3374 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003375 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003376
3377 /* Capture start position */
3378 if (ctxt->record_info) {
3379 node_info.begin_pos = ctxt->input->consumed +
3380 (CUR_PTR - ctxt->input->base);
3381 node_info.begin_line = ctxt->input->line;
3382 }
3383
3384 oldname = xmlStrdup(ctxt->name);
3385 htmlParseStartTag(ctxt);
3386 name = ctxt->name;
3387#ifdef DEBUG
3388 if (oldname == NULL)
3389 xmlGenericError(xmlGenericErrorContext,
3390 "Start of element %s\n", name);
3391 else if (name == NULL)
3392 xmlGenericError(xmlGenericErrorContext,
3393 "Start of element failed, was %s\n", oldname);
3394 else
3395 xmlGenericError(xmlGenericErrorContext,
3396 "Start of element %s, was %s\n", name, oldname);
3397#endif
3398 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3399 (name == NULL)) {
3400 if (CUR == '>')
3401 NEXT;
3402 if (oldname != NULL)
3403 xmlFree(oldname);
3404 return;
3405 }
3406 if (oldname != NULL)
3407 xmlFree(oldname);
3408
3409 /*
3410 * Lookup the info for that element.
3411 */
3412 info = htmlTagLookup(name);
3413 if (info == NULL) {
3414 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3415 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3416 name);
3417 ctxt->wellFormed = 0;
3418 } else if (info->depr) {
3419/***************************
3420 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3421 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3422 name);
3423 ***************************/
3424 }
3425
3426 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003427 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003428 */
3429 if ((CUR == '/') && (NXT(1) == '>')) {
3430 SKIP(2);
3431 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3432 ctxt->sax->endElement(ctxt->userData, name);
3433 oldname = htmlnamePop(ctxt);
3434#ifdef DEBUG
3435 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3436#endif
3437 if (oldname != NULL)
3438 xmlFree(oldname);
3439 return;
3440 }
3441
3442 if (CUR == '>') {
3443 NEXT;
3444 } else {
3445 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3446 ctxt->sax->error(ctxt->userData,
3447 "Couldn't find end of Start Tag %s\n",
3448 name);
3449 ctxt->wellFormed = 0;
3450
3451 /*
3452 * end of parsing of this node.
3453 */
3454 if (xmlStrEqual(name, ctxt->name)) {
3455 nodePop(ctxt);
3456 oldname = htmlnamePop(ctxt);
3457#ifdef DEBUG
3458 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3459#endif
3460 if (oldname != NULL)
3461 xmlFree(oldname);
3462 }
3463
3464 /*
3465 * Capture end position and add node
3466 */
3467 if ( currentNode != NULL && ctxt->record_info ) {
3468 node_info.end_pos = ctxt->input->consumed +
3469 (CUR_PTR - ctxt->input->base);
3470 node_info.end_line = ctxt->input->line;
3471 node_info.node = ctxt->node;
3472 xmlParserAddNodeInfo(ctxt, &node_info);
3473 }
3474 return;
3475 }
3476
3477 /*
3478 * Check for an Empty Element from DTD definition
3479 */
3480 if ((info != NULL) && (info->empty)) {
3481 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3482 ctxt->sax->endElement(ctxt->userData, name);
3483 oldname = htmlnamePop(ctxt);
3484#ifdef DEBUG
3485 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3486#endif
3487 if (oldname != NULL)
3488 xmlFree(oldname);
3489 return;
3490 }
3491
3492 /*
3493 * Parse the content of the element:
3494 */
3495 currentNode = xmlStrdup(ctxt->name);
3496 depth = ctxt->nameNr;
3497 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003498 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003499 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003500 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003501 if (ctxt->nameNr < depth) break;
3502 }
3503
Owen Taylor3473f882001-02-23 17:55:21 +00003504 /*
3505 * Capture end position and add node
3506 */
3507 if ( currentNode != NULL && ctxt->record_info ) {
3508 node_info.end_pos = ctxt->input->consumed +
3509 (CUR_PTR - ctxt->input->base);
3510 node_info.end_line = ctxt->input->line;
3511 node_info.node = ctxt->node;
3512 xmlParserAddNodeInfo(ctxt, &node_info);
3513 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003514 if (!IS_CHAR(CUR)) {
3515 htmlAutoCloseOnEnd(ctxt);
3516 }
3517
Owen Taylor3473f882001-02-23 17:55:21 +00003518 if (currentNode != NULL)
3519 xmlFree(currentNode);
3520}
3521
3522/**
3523 * htmlParseDocument :
3524 * @ctxt: an HTML parser context
3525 *
3526 * parse an HTML document (and build a tree if using the standard SAX
3527 * interface).
3528 *
3529 * Returns 0, -1 in case of error. the parser context is augmented
3530 * as a result of the parsing.
3531 */
3532
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003533int
Owen Taylor3473f882001-02-23 17:55:21 +00003534htmlParseDocument(htmlParserCtxtPtr ctxt) {
3535 xmlDtdPtr dtd;
3536
Daniel Veillardd0463562001-10-13 09:15:48 +00003537 xmlInitParser();
3538
Owen Taylor3473f882001-02-23 17:55:21 +00003539 htmlDefaultSAXHandlerInit();
3540 ctxt->html = 1;
3541
3542 GROW;
3543 /*
3544 * SAX: beginning of the document processing.
3545 */
3546 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3547 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3548
3549 /*
3550 * Wipe out everything which is before the first '<'
3551 */
3552 SKIP_BLANKS;
3553 if (CUR == 0) {
3554 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3555 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3556 ctxt->wellFormed = 0;
3557 }
3558
3559 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3560 ctxt->sax->startDocument(ctxt->userData);
3561
3562
3563 /*
3564 * Parse possible comments before any content
3565 */
3566 while ((CUR == '<') && (NXT(1) == '!') &&
3567 (NXT(2) == '-') && (NXT(3) == '-')) {
3568 htmlParseComment(ctxt);
3569 SKIP_BLANKS;
3570 }
3571
3572
3573 /*
3574 * Then possibly doc type declaration(s) and more Misc
3575 * (doctypedecl Misc*)?
3576 */
3577 if ((CUR == '<') && (NXT(1) == '!') &&
3578 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3579 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3580 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3581 (UPP(8) == 'E')) {
3582 htmlParseDocTypeDecl(ctxt);
3583 }
3584 SKIP_BLANKS;
3585
3586 /*
3587 * Parse possible comments before any content
3588 */
3589 while ((CUR == '<') && (NXT(1) == '!') &&
3590 (NXT(2) == '-') && (NXT(3) == '-')) {
3591 htmlParseComment(ctxt);
3592 SKIP_BLANKS;
3593 }
3594
3595 /*
3596 * Time to start parsing the tree itself
3597 */
3598 htmlParseContent(ctxt);
3599
3600 /*
3601 * autoclose
3602 */
3603 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003604 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003605
3606
3607 /*
3608 * SAX: end of the document processing.
3609 */
3610 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3611 ctxt->sax->endDocument(ctxt->userData);
3612
3613 if (ctxt->myDoc != NULL) {
3614 dtd = xmlGetIntSubset(ctxt->myDoc);
3615 if (dtd == NULL)
3616 ctxt->myDoc->intSubset =
3617 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3618 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3619 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3620 }
3621 if (! ctxt->wellFormed) return(-1);
3622 return(0);
3623}
3624
3625
3626/************************************************************************
3627 * *
3628 * Parser contexts handling *
3629 * *
3630 ************************************************************************/
3631
3632/**
3633 * xmlInitParserCtxt:
3634 * @ctxt: an HTML parser context
3635 *
3636 * Initialize a parser context
3637 */
3638
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003639static void
Owen Taylor3473f882001-02-23 17:55:21 +00003640htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3641{
3642 htmlSAXHandler *sax;
3643
3644 if (ctxt == NULL) return;
3645 memset(ctxt, 0, sizeof(htmlParserCtxt));
3646
3647 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3648 if (sax == NULL) {
3649 xmlGenericError(xmlGenericErrorContext,
3650 "htmlInitParserCtxt: out of memory\n");
3651 }
3652 else
3653 memset(sax, 0, sizeof(htmlSAXHandler));
3654
3655 /* Allocate the Input stack */
3656 ctxt->inputTab = (htmlParserInputPtr *)
3657 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3658 if (ctxt->inputTab == NULL) {
3659 xmlGenericError(xmlGenericErrorContext,
3660 "htmlInitParserCtxt: out of memory\n");
3661 ctxt->inputNr = 0;
3662 ctxt->inputMax = 0;
3663 ctxt->input = NULL;
3664 return;
3665 }
3666 ctxt->inputNr = 0;
3667 ctxt->inputMax = 5;
3668 ctxt->input = NULL;
3669 ctxt->version = NULL;
3670 ctxt->encoding = NULL;
3671 ctxt->standalone = -1;
3672 ctxt->instate = XML_PARSER_START;
3673
3674 /* Allocate the Node stack */
3675 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3676 if (ctxt->nodeTab == NULL) {
3677 xmlGenericError(xmlGenericErrorContext,
3678 "htmlInitParserCtxt: out of memory\n");
3679 ctxt->nodeNr = 0;
3680 ctxt->nodeMax = 0;
3681 ctxt->node = NULL;
3682 ctxt->inputNr = 0;
3683 ctxt->inputMax = 0;
3684 ctxt->input = NULL;
3685 return;
3686 }
3687 ctxt->nodeNr = 0;
3688 ctxt->nodeMax = 10;
3689 ctxt->node = NULL;
3690
3691 /* Allocate the Name stack */
3692 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3693 if (ctxt->nameTab == NULL) {
3694 xmlGenericError(xmlGenericErrorContext,
3695 "htmlInitParserCtxt: out of memory\n");
3696 ctxt->nameNr = 0;
3697 ctxt->nameMax = 10;
3698 ctxt->name = NULL;
3699 ctxt->nodeNr = 0;
3700 ctxt->nodeMax = 0;
3701 ctxt->node = NULL;
3702 ctxt->inputNr = 0;
3703 ctxt->inputMax = 0;
3704 ctxt->input = NULL;
3705 return;
3706 }
3707 ctxt->nameNr = 0;
3708 ctxt->nameMax = 10;
3709 ctxt->name = NULL;
3710
3711 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3712 else {
3713 ctxt->sax = sax;
3714 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3715 }
3716 ctxt->userData = ctxt;
3717 ctxt->myDoc = NULL;
3718 ctxt->wellFormed = 1;
3719 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003720 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003721 ctxt->html = 1;
3722 ctxt->record_info = 0;
3723 ctxt->validate = 0;
3724 ctxt->nbChars = 0;
3725 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003726 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003727 xmlInitNodeInfoSeq(&ctxt->node_seq);
3728}
3729
3730/**
3731 * htmlFreeParserCtxt:
3732 * @ctxt: an HTML parser context
3733 *
3734 * Free all the memory used by a parser context. However the parsed
3735 * document in ctxt->myDoc is not freed.
3736 */
3737
3738void
3739htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3740{
3741 xmlFreeParserCtxt(ctxt);
3742}
3743
3744/**
Daniel Veillard1d995272002-07-22 16:43:32 +00003745 * htmlNewParserCtxt:
3746 *
3747 * Allocate and initialize a new parser context.
3748 *
3749 * Returns the xmlParserCtxtPtr or NULL
3750 */
3751
3752static htmlParserCtxtPtr
3753htmlNewParserCtxt(void)
3754{
3755 xmlParserCtxtPtr ctxt;
3756
3757 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3758 if (ctxt == NULL) {
3759 xmlGenericError(xmlGenericErrorContext,
3760 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00003761 return(NULL);
3762 }
3763 memset(ctxt, 0, sizeof(xmlParserCtxt));
3764 htmlInitParserCtxt(ctxt);
3765 return(ctxt);
3766}
3767
3768/**
3769 * htmlCreateMemoryParserCtxt:
3770 * @buffer: a pointer to a char array
3771 * @size: the size of the array
3772 *
3773 * Create a parser context for an HTML in-memory document.
3774 *
3775 * Returns the new parser context or NULL
3776 */
3777static htmlParserCtxtPtr
3778htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3779 xmlParserCtxtPtr ctxt;
3780 xmlParserInputPtr input;
3781 xmlParserInputBufferPtr buf;
3782
3783 if (buffer == NULL)
3784 return(NULL);
3785 if (size <= 0)
3786 return(NULL);
3787
3788 ctxt = htmlNewParserCtxt();
3789 if (ctxt == NULL)
3790 return(NULL);
3791
3792 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3793 if (buf == NULL) return(NULL);
3794
3795 input = xmlNewInputStream(ctxt);
3796 if (input == NULL) {
3797 xmlFreeParserCtxt(ctxt);
3798 return(NULL);
3799 }
3800
3801 input->filename = NULL;
3802 input->buf = buf;
3803 input->base = input->buf->buffer->content;
3804 input->cur = input->buf->buffer->content;
3805 input->end = &input->buf->buffer->content[input->buf->buffer->use];
3806
3807 inputPush(ctxt, input);
3808 return(ctxt);
3809}
3810
3811/**
Owen Taylor3473f882001-02-23 17:55:21 +00003812 * htmlCreateDocParserCtxt :
3813 * @cur: a pointer to an array of xmlChar
3814 * @encoding: a free form C string describing the HTML document encoding, or NULL
3815 *
3816 * Create a parser context for an HTML document.
3817 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003818 * TODO: check the need to add encoding handling there
3819 *
Owen Taylor3473f882001-02-23 17:55:21 +00003820 * Returns the new parser context or NULL
3821 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003822static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003823htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00003824 int len;
Owen Taylor3473f882001-02-23 17:55:21 +00003825
Daniel Veillard1d995272002-07-22 16:43:32 +00003826 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00003827 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00003828 len = xmlStrlen(cur);
3829 return(htmlCreateMemoryParserCtxt((char *)cur, len));
Owen Taylor3473f882001-02-23 17:55:21 +00003830}
3831
3832/************************************************************************
3833 * *
3834 * Progressive parsing interfaces *
3835 * *
3836 ************************************************************************/
3837
3838/**
3839 * htmlParseLookupSequence:
3840 * @ctxt: an HTML parser context
3841 * @first: the first char to lookup
3842 * @next: the next char to lookup or zero
3843 * @third: the next char to lookup or zero
3844 *
3845 * Try to find if a sequence (first, next, third) or just (first next) or
3846 * (first) is available in the input stream.
3847 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3848 * to avoid rescanning sequences of bytes, it DOES change the state of the
3849 * parser, do not use liberally.
3850 * This is basically similar to xmlParseLookupSequence()
3851 *
3852 * Returns the index to the current parsing point if the full sequence
3853 * is available, -1 otherwise.
3854 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003855static int
Owen Taylor3473f882001-02-23 17:55:21 +00003856htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3857 xmlChar next, xmlChar third) {
3858 int base, len;
3859 htmlParserInputPtr in;
3860 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003861 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003862
3863 in = ctxt->input;
3864 if (in == NULL) return(-1);
3865 base = in->cur - in->base;
3866 if (base < 0) return(-1);
3867 if (ctxt->checkIndex > base)
3868 base = ctxt->checkIndex;
3869 if (in->buf == NULL) {
3870 buf = in->base;
3871 len = in->length;
3872 } else {
3873 buf = in->buf->buffer->content;
3874 len = in->buf->buffer->use;
3875 }
3876 /* take into account the sequence length */
3877 if (third) len -= 2;
3878 else if (next) len --;
3879 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003880 if (!incomment && (base + 4 < len)) {
3881 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3882 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3883 incomment = 1;
3884 }
3885 /* do not increment base, some people use <!--> */
3886 }
3887 if (incomment) {
3888 if (base + 3 < len)
3889 return(-1);
3890 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3891 (buf[base + 2] == '>')) {
3892 incomment = 0;
3893 base += 2;
3894 }
3895 continue;
3896 }
Owen Taylor3473f882001-02-23 17:55:21 +00003897 if (buf[base] == first) {
3898 if (third != 0) {
3899 if ((buf[base + 1] != next) ||
3900 (buf[base + 2] != third)) continue;
3901 } else if (next != 0) {
3902 if (buf[base + 1] != next) continue;
3903 }
3904 ctxt->checkIndex = 0;
3905#ifdef DEBUG_PUSH
3906 if (next == 0)
3907 xmlGenericError(xmlGenericErrorContext,
3908 "HPP: lookup '%c' found at %d\n",
3909 first, base);
3910 else if (third == 0)
3911 xmlGenericError(xmlGenericErrorContext,
3912 "HPP: lookup '%c%c' found at %d\n",
3913 first, next, base);
3914 else
3915 xmlGenericError(xmlGenericErrorContext,
3916 "HPP: lookup '%c%c%c' found at %d\n",
3917 first, next, third, base);
3918#endif
3919 return(base - (in->cur - in->base));
3920 }
3921 }
3922 ctxt->checkIndex = base;
3923#ifdef DEBUG_PUSH
3924 if (next == 0)
3925 xmlGenericError(xmlGenericErrorContext,
3926 "HPP: lookup '%c' failed\n", first);
3927 else if (third == 0)
3928 xmlGenericError(xmlGenericErrorContext,
3929 "HPP: lookup '%c%c' failed\n", first, next);
3930 else
3931 xmlGenericError(xmlGenericErrorContext,
3932 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3933#endif
3934 return(-1);
3935}
3936
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing
 *
 * This is the state machine of the progressive (push) interface: it
 * loops on ctxt->instate and, for each state, parses a construct only
 * once htmlParseLookupSequence() reports that its terminator is present
 * in the input, unless @terminate is set. When not enough data is
 * available the function jumps to the "done" label and returns.
 *
 * Returns zero if no parsing was possible
 * NOTE(review): `ret` is initialized to 0 and never assigned afterwards,
 * so as written the function always returns 0.
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    /* trace the state the parser is in when entering the function */
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /* number of bytes not consumed yet in the current input */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        /* nothing left and this was the last chunk: finish the document */
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    SKIP_BLANKS;
                    /* blanks were consumed, refresh the byte count */
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                /* case-insensitive check for "<!DOCTYPE" */
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    /* wait for the closing '>' unless this is the end */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
                }
                /*
                 * NOTE(review): the trace below is emitted on both
                 * branches, i.e. also when the new state is PROLOG.
                 */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering MISC\n");
#endif
                break;
            case XML_PARSER_MISC:
                /* before the DOCTYPE: comments or the DOCTYPE itself */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but too short to tell if it is a DOCTYPE */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /* after the DOCTYPE: comments, then the first tag */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    /* "<!" seen but too short to tell if it is a comment */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /* state entered from END_TAG once the name stack is empty */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    /* anything else here is an error ending the document */
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                xmlChar *name, *oldname;
                int depth = ctxt->nameNr;
                const htmlElemDesc * info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                if (in->cur[1] == '/') {
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering END_TAG\n");
#endif
                    break;
                }
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;

                /*
                 * Keep a copy of the current element name to detect
                 * below whether the start tag changed the name stack.
                 */
                oldname = xmlStrdup(ctxt->name);
                htmlParseStartTag(ctxt);
                name = ctxt->name;
#ifdef DEBUG
                if (oldname == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s\n", name);
                else if (name == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element failed, was %s\n",
                            oldname);
                else
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s, was %s\n",
                            name, oldname);
#endif
                /*
                 * Same depth and same name as before, or no name at
                 * all: skip a pending '>' and leave the state unchanged.
                 */
                if (((depth == ctxt->nameNr) &&
                     (xmlStrEqual(oldname, ctxt->name))) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    if (oldname != NULL)
                        xmlFree(oldname);
                    break;
                }
                if (oldname != NULL)
                    xmlFree(oldname);

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
                                         name);
                    ctxt->wellFormed = 0;
                } else if (info->depr) {
                    /***************************
                    if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
                        ctxt->sax->warning(ctxt->userData,
                                           "Tag %s is deprecated\n",
                                           name);
                     ***************************/
                }

                /*
                 * Check for an Empty Element labeled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
                            oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData,
                                         "Couldn't find end of Start Tag %s\n",
                                         name);
                    ctxt->wellFormed = 0;

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                        xmlGenericError(xmlGenericErrorContext,
                         "End of start tag problem: popping out %s\n", oldname);
#endif
                        if (oldname != NULL)
                            xmlFree(oldname);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                long cons;
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /*
                 * A single trailing character on the last chunk which
                 * starts neither a tag nor a reference is delivered
                 * directly to the SAX callbacks.
                 */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        NEXT;
                        break;
                    }
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                /*
                 * Snapshot of ctxt->nbChars, compared after the parsing
                 * step below to detect that it did not change.
                 */
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
                        goto done;
                    htmlParseScript(ctxt);
                    /*
                     * NOTE(review): cur and next were read before the
                     * call to htmlParseScript() and are not refreshed
                     * here.
                     */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                            goto done;
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "Misplaced DOCTYPE declaration\n");
                        ctxt->wellFormed = 0;
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /* TODO Avoid the extra copy, handle directly !!!!!! */
                        /*
                         * Goal of the following test is :
                         *  - minimize calls to the SAX 'character' callback
                         *    when they are mergeable
                         */
                        if ((ctxt->inputNr == 1) &&
                            (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
                            if ((!terminate) &&
                                (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
                                goto done;
                        }
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt);
                    }
                }
                /*
                 * ctxt->nbChars did not move: report an error when
                 * inside a node and skip one character.
                 */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "detected an error in element content\n");
                        ctxt->wellFormed = 0;
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* no element left open: only the epilog may follow */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
                /*
                 * NOTE(review): the trace below says CONTENT even when
                 * the state was just set to EPILOG.
                 */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The remaining states are reported as internal errors; the
             * parser is then put back in the CONTENT state (START_TAG
             * for ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == CDATA\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == DTD\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == COMMENT\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == PI\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_DECL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_VALUE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
                /*
                 * NOTE(review): the trace below says DTD while the
                 * state set above is CONTENT.
                 */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n");
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                xmlGenericError(xmlGenericErrorContext,
                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == XML_PARSER_IGNORE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PUBLIC_LITERAL:
                /*
                 * NOTE(review): the message names XML_PARSER_LITERAL
                 * while the state handled is XML_PARSER_PUBLIC_LITERAL.
                 */
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == XML_PARSER_LITERAL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;

        }
    }
done:
    /* same end-of-document handling as at the top of the loop */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /*
     * Once the document is over (last chunk, EOF or EPILOG state), a
     * document without internal subset gets the HTML 4.0 Transitional
     * one.
     */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}
4650
4651/**
Owen Taylor3473f882001-02-23 17:55:21 +00004652 * htmlParseChunk:
4653 * @ctxt: an XML parser context
4654 * @chunk: an char array
4655 * @size: the size in byte of the chunk
4656 * @terminate: last chunk indicator
4657 *
4658 * Parse a Chunk of memory
4659 *
4660 * Returns zero if no error, the xmlParserErrors otherwise.
4661 */
4662int
4663htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4664 int terminate) {
4665 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4666 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4667 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4668 int cur = ctxt->input->cur - ctxt->input->base;
4669
4670 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4671 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4672 ctxt->input->cur = ctxt->input->base + cur;
4673#ifdef DEBUG_PUSH
4674 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4675#endif
4676
4677 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4678 htmlParseTryOrFinish(ctxt, terminate);
4679 } else if (ctxt->instate != XML_PARSER_EOF) {
4680 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4681 htmlParseTryOrFinish(ctxt, terminate);
4682 }
4683 if (terminate) {
4684 if ((ctxt->instate != XML_PARSER_EOF) &&
4685 (ctxt->instate != XML_PARSER_EPILOG) &&
4686 (ctxt->instate != XML_PARSER_MISC)) {
4687 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004688 ctxt->wellFormed = 0;
4689 }
4690 if (ctxt->instate != XML_PARSER_EOF) {
4691 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4692 ctxt->sax->endDocument(ctxt->userData);
4693 }
4694 ctxt->instate = XML_PARSER_EOF;
4695 }
4696 return((xmlParserErrors) ctxt->errNo);
4697}
4698
4699/************************************************************************
4700 * *
4701 * User entry points *
4702 * *
4703 ************************************************************************/
4704
4705/**
4706 * htmlCreatePushParserCtxt :
4707 * @sax: a SAX handler
4708 * @user_data: The user data returned on SAX callbacks
4709 * @chunk: a pointer to an array of chars
4710 * @size: number of chars in the array
4711 * @filename: an optional file name or URI
4712 * @enc: an optional encoding
4713 *
4714 * Create a parser context for using the HTML parser in push mode
4715 * To allow content encoding detection, @size should be >= 4
4716 * The value of @filename is used for fetching external entities
4717 * and error/warning reports.
4718 *
4719 * Returns the new parser context or NULL
4720 */
4721htmlParserCtxtPtr
4722htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4723 const char *chunk, int size, const char *filename,
4724 xmlCharEncoding enc) {
4725 htmlParserCtxtPtr ctxt;
4726 htmlParserInputPtr inputStream;
4727 xmlParserInputBufferPtr buf;
4728
Daniel Veillardd0463562001-10-13 09:15:48 +00004729 xmlInitParser();
4730
Owen Taylor3473f882001-02-23 17:55:21 +00004731 buf = xmlAllocParserInputBuffer(enc);
4732 if (buf == NULL) return(NULL);
4733
4734 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4735 if (ctxt == NULL) {
4736 xmlFree(buf);
4737 return(NULL);
4738 }
4739 memset(ctxt, 0, sizeof(htmlParserCtxt));
4740 htmlInitParserCtxt(ctxt);
4741 if (sax != NULL) {
4742 if (ctxt->sax != &htmlDefaultSAXHandler)
4743 xmlFree(ctxt->sax);
4744 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4745 if (ctxt->sax == NULL) {
4746 xmlFree(buf);
4747 xmlFree(ctxt);
4748 return(NULL);
4749 }
4750 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4751 if (user_data != NULL)
4752 ctxt->userData = user_data;
4753 }
4754 if (filename == NULL) {
4755 ctxt->directory = NULL;
4756 } else {
4757 ctxt->directory = xmlParserGetDirectory(filename);
4758 }
4759
4760 inputStream = htmlNewInputStream(ctxt);
4761 if (inputStream == NULL) {
4762 xmlFreeParserCtxt(ctxt);
4763 return(NULL);
4764 }
4765
4766 if (filename == NULL)
4767 inputStream->filename = NULL;
4768 else
4769 inputStream->filename = xmlMemStrdup(filename);
4770 inputStream->buf = buf;
4771 inputStream->base = inputStream->buf->buffer->content;
4772 inputStream->cur = inputStream->buf->buffer->content;
4773
4774 inputPush(ctxt, inputStream);
4775
4776 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4777 (ctxt->input->buf != NULL)) {
4778 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4779#ifdef DEBUG_PUSH
4780 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4781#endif
4782 }
4783
4784 return(ctxt);
4785}
4786
4787/**
4788 * htmlSAXParseDoc :
4789 * @cur: a pointer to an array of xmlChar
4790 * @encoding: a free form C string describing the HTML document encoding, or NULL
4791 * @sax: the SAX handler block
4792 * @userData: if using SAX, this pointer will be provided on callbacks.
4793 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004794 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4795 * to handle parse events. If sax is NULL, fallback to the default DOM
4796 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004797 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004798 * Returns the resulting document tree unless SAX is NULL or the document is
4799 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004800 */
4801
4802htmlDocPtr
4803htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4804 htmlDocPtr ret;
4805 htmlParserCtxtPtr ctxt;
4806
Daniel Veillardd0463562001-10-13 09:15:48 +00004807 xmlInitParser();
4808
Owen Taylor3473f882001-02-23 17:55:21 +00004809 if (cur == NULL) return(NULL);
4810
4811
4812 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4813 if (ctxt == NULL) return(NULL);
4814 if (sax != NULL) {
4815 ctxt->sax = sax;
4816 ctxt->userData = userData;
4817 }
4818
4819 htmlParseDocument(ctxt);
4820 ret = ctxt->myDoc;
4821 if (sax != NULL) {
4822 ctxt->sax = NULL;
4823 ctxt->userData = NULL;
4824 }
4825 htmlFreeParserCtxt(ctxt);
4826
4827 return(ret);
4828}
4829
4830/**
4831 * htmlParseDoc :
4832 * @cur: a pointer to an array of xmlChar
4833 * @encoding: a free form C string describing the HTML document encoding, or NULL
4834 *
4835 * parse an HTML in-memory document and build a tree.
4836 *
4837 * Returns the resulting document tree
4838 */
4839
4840htmlDocPtr
4841htmlParseDoc(xmlChar *cur, const char *encoding) {
4842 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4843}
4844
4845
4846/**
4847 * htmlCreateFileParserCtxt :
4848 * @filename: the filename
4849 * @encoding: a free form C string describing the HTML document encoding, or NULL
4850 *
4851 * Create a parser context for a file content.
4852 * Automatic support for ZLIB/Compress compressed document is provided
4853 * by default if found at compile-time.
4854 *
4855 * Returns the new parser context or NULL
4856 */
4857htmlParserCtxtPtr
4858htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4859{
4860 htmlParserCtxtPtr ctxt;
4861 htmlParserInputPtr inputStream;
4862 xmlParserInputBufferPtr buf;
4863 /* htmlCharEncoding enc; */
4864 xmlChar *content, *content_line = (xmlChar *) "charset=";
4865
4866 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4867 if (buf == NULL) return(NULL);
4868
4869 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4870 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00004871 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004872 return(NULL);
4873 }
4874 memset(ctxt, 0, sizeof(htmlParserCtxt));
4875 htmlInitParserCtxt(ctxt);
4876 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4877 if (inputStream == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00004878 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004879 xmlFree(ctxt);
4880 return(NULL);
4881 }
4882 memset(inputStream, 0, sizeof(htmlParserInput));
4883
Daniel Veillarda646cfd2002-09-17 21:50:03 +00004884 inputStream->filename = (char *)
4885 xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor3473f882001-02-23 17:55:21 +00004886 inputStream->line = 1;
4887 inputStream->col = 1;
4888 inputStream->buf = buf;
4889 inputStream->directory = NULL;
4890
4891 inputStream->base = inputStream->buf->buffer->content;
4892 inputStream->cur = inputStream->buf->buffer->content;
4893 inputStream->free = NULL;
4894
4895 inputPush(ctxt, inputStream);
4896
4897 /* set encoding */
4898 if (encoding) {
4899 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4900 if (content) {
4901 strcpy ((char *)content, (char *)content_line);
4902 strcat ((char *)content, (char *)encoding);
4903 htmlCheckEncoding (ctxt, content);
4904 xmlFree (content);
4905 }
4906 }
4907
4908 return(ctxt);
4909}
4910
4911/**
4912 * htmlSAXParseFile :
4913 * @filename: the filename
4914 * @encoding: a free form C string describing the HTML document encoding, or NULL
4915 * @sax: the SAX handler block
4916 * @userData: if using SAX, this pointer will be provided on callbacks.
4917 *
4918 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4919 * compressed document is provided by default if found at compile-time.
4920 * It use the given SAX function block to handle the parsing callback.
4921 * If sax is NULL, fallback to the default DOM tree building routines.
4922 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004923 * Returns the resulting document tree unless SAX is NULL or the document is
4924 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004925 */
4926
4927htmlDocPtr
4928htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4929 void *userData) {
4930 htmlDocPtr ret;
4931 htmlParserCtxtPtr ctxt;
4932 htmlSAXHandlerPtr oldsax = NULL;
4933
Daniel Veillardd0463562001-10-13 09:15:48 +00004934 xmlInitParser();
4935
Owen Taylor3473f882001-02-23 17:55:21 +00004936 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4937 if (ctxt == NULL) return(NULL);
4938 if (sax != NULL) {
4939 oldsax = ctxt->sax;
4940 ctxt->sax = sax;
4941 ctxt->userData = userData;
4942 }
4943
4944 htmlParseDocument(ctxt);
4945
4946 ret = ctxt->myDoc;
4947 if (sax != NULL) {
4948 ctxt->sax = oldsax;
4949 ctxt->userData = NULL;
4950 }
4951 htmlFreeParserCtxt(ctxt);
4952
4953 return(ret);
4954}
4955
4956/**
4957 * htmlParseFile :
4958 * @filename: the filename
4959 * @encoding: a free form C string describing the HTML document encoding, or NULL
4960 *
4961 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4962 * compressed document is provided by default if found at compile-time.
4963 *
4964 * Returns the resulting document tree
4965 */
4966
4967htmlDocPtr
4968htmlParseFile(const char *filename, const char *encoding) {
4969 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4970}
4971
4972/**
4973 * htmlHandleOmittedElem:
4974 * @val: int 0 or 1
4975 *
4976 * Set and return the previous value for handling HTML omitted tags.
4977 *
4978 * Returns the last value for 0 for no handling, 1 for auto insertion.
4979 */
4980
4981int
4982htmlHandleOmittedElem(int val) {
4983 int old = htmlOmittedDefaultValue;
4984
4985 htmlOmittedDefaultValue = val;
4986 return(old);
4987}
4988
4989#endif /* LIBXML_HTML_ENABLED */