/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
/* Size limits used by the HTML parser (in xmlChar units). */
#define HTML_MAX_NAMELEN		1000
#define HTML_PARSER_BIG_BUFFER_SIZE	1000
#define HTML_PARSER_BUFFER_SIZE		100

/* Uncomment to enable the xmlGenericError() traces guarded by #ifdef DEBUG. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local switch, enabled by default.
 * NOTE(review): its consumers are not visible in this part of the file.
 */
static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000054
Daniel Veillard56a4cb82001-03-24 17:00:36 +000055xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000057static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
/************************************************************************
 *									*
 * 		Parser stacks related functions and macros		*
 *									*
 ************************************************************************/
64
65/*
66 * Generic function for accessing stacks in the Parser Context
67 */
68
69#define PUSH_AND_POP(scope, type, name) \
70scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
71 if (ctxt->name##Nr >= ctxt->name##Max) { \
72 ctxt->name##Max *= 2; \
73 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75 if (ctxt->name##Tab == NULL) { \
76 xmlGenericError(xmlGenericErrorContext, \
77 "realloc failed !\n"); \
78 return(0); \
79 } \
80 } \
81 ctxt->name##Tab[ctxt->name##Nr] = value; \
82 ctxt->name = value; \
83 return(ctxt->name##Nr++); \
84} \
85scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
86 type ret; \
87 if (ctxt->name##Nr < 0) return(0); \
88 ctxt->name##Nr--; \
89 if (ctxt->name##Nr < 0) return(0); \
90 if (ctxt->name##Nr > 0) \
91 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
92 else \
93 ctxt->name = NULL; \
94 ret = ctxt->name##Tab[ctxt->name##Nr]; \
95 ctxt->name##Tab[ctxt->name##Nr] = 0; \
96 return(ret); \
97} \
98
Daniel Veillard56a4cb82001-03-24 17:00:36 +000099/* PUSH_AND_POP(static, xmlNodePtr, node) */
100PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000101
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. They all expect a variable named ctxt to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   COPY_BUF(l,b,i,v) append the char v of encoded length l to buffer b at
 *           index i and advance i accordingly.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* RAW yields -1 while a pending token is set on the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * NEXTL(l): advance the input by l bytes, counting it as one character,
 * keeping line/column information up to date and clearing any pending token.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Expands to an if/else statement: always use it as a full statement
 * terminated by a semicolon.
 */
#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
178
179/**
180 * htmlCurrentChar:
181 * @ctxt: the HTML parser context
182 * @len: pointer to the length of the char read
183 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000184 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000185 * bytes in the input buffer. Implement the end of line normalization:
186 * 2.11 End-of-Line Handling
187 * If the encoding is unspecified, in the case we find an ISO-Latin-1
188 * char, then the encoding converter is plugged in automatically.
189 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000190 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000191 */
192
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000193static int
Owen Taylor3473f882001-02-23 17:55:21 +0000194htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
195 if (ctxt->instate == XML_PARSER_EOF)
196 return(0);
197
198 if (ctxt->token != 0) {
199 *len = 0;
200 return(ctxt->token);
201 }
202 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
203 /*
204 * We are supposed to handle UTF8, check it's valid
205 * From rfc2044: encoding of the Unicode values on UTF-8:
206 *
207 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
208 * 0000 0000-0000 007F 0xxxxxxx
209 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
210 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
211 *
212 * Check for the 0x110000 limit too
213 */
214 const unsigned char *cur = ctxt->input->cur;
215 unsigned char c;
216 unsigned int val;
217
218 c = *cur;
219 if (c & 0x80) {
220 if (cur[1] == 0)
221 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
222 if ((cur[1] & 0xc0) != 0x80)
223 goto encoding_error;
224 if ((c & 0xe0) == 0xe0) {
225
226 if (cur[2] == 0)
227 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
228 if ((cur[2] & 0xc0) != 0x80)
229 goto encoding_error;
230 if ((c & 0xf0) == 0xf0) {
231 if (cur[3] == 0)
232 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
233 if (((c & 0xf8) != 0xf0) ||
234 ((cur[3] & 0xc0) != 0x80))
235 goto encoding_error;
236 /* 4-byte code */
237 *len = 4;
238 val = (cur[0] & 0x7) << 18;
239 val |= (cur[1] & 0x3f) << 12;
240 val |= (cur[2] & 0x3f) << 6;
241 val |= cur[3] & 0x3f;
242 } else {
243 /* 3-byte code */
244 *len = 3;
245 val = (cur[0] & 0xf) << 12;
246 val |= (cur[1] & 0x3f) << 6;
247 val |= cur[2] & 0x3f;
248 }
249 } else {
250 /* 2-byte code */
251 *len = 2;
252 val = (cur[0] & 0x1f) << 6;
253 val |= cur[1] & 0x3f;
254 }
255 if (!IS_CHAR(val)) {
256 ctxt->errNo = XML_ERR_INVALID_ENCODING;
257 if ((ctxt->sax != NULL) &&
258 (ctxt->sax->error != NULL))
259 ctxt->sax->error(ctxt->userData,
260 "Char 0x%X out of allowed range\n", val);
261 ctxt->wellFormed = 0;
262 ctxt->disableSAX = 1;
263 }
264 return(val);
265 } else {
266 /* 1-byte code */
267 *len = 1;
268 return((int) *ctxt->input->cur);
269 }
270 }
271 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000272 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000273 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000274 * XML constructs only use < 128 chars
275 */
276 *len = 1;
277 if ((int) *ctxt->input->cur < 0x80)
278 return((int) *ctxt->input->cur);
279
280 /*
281 * Humm this is bad, do an automatic flow conversion
282 */
283 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
284 ctxt->charset = XML_CHAR_ENCODING_UTF8;
285 return(xmlCurrentChar(ctxt, len));
286
287encoding_error:
288 /*
289 * If we detect an UTF8 error that probably mean that the
290 * input encoding didn't get properly advertized in the
291 * declaration header. Report the error and switch the encoding
292 * to ISO-Latin-1 (if you don't like this policy, just declare the
293 * encoding !)
294 */
295 ctxt->errNo = XML_ERR_INVALID_ENCODING;
296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
297 ctxt->sax->error(ctxt->userData,
298 "Input is not proper UTF-8, indicate encoding !\n");
299 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
300 ctxt->input->cur[0], ctxt->input->cur[1],
301 ctxt->input->cur[2], ctxt->input->cur[3]);
302 }
303
304 ctxt->charset = XML_CHAR_ENCODING_8859_1;
305 *len = 1;
306 return((int) *ctxt->input->cur);
307}
308
309/**
Owen Taylor3473f882001-02-23 17:55:21 +0000310 * htmlSkipBlankChars:
311 * @ctxt: the HTML parser context
312 *
313 * skip all blanks character found at that point in the input streams.
314 *
315 * Returns the number of space chars skipped
316 */
317
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000318static int
Owen Taylor3473f882001-02-23 17:55:21 +0000319htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
320 int res = 0;
321
322 while (IS_BLANK(*(ctxt->input->cur))) {
323 if ((*ctxt->input->cur == 0) &&
324 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
325 xmlPopInput(ctxt);
326 } else {
327 if (*(ctxt->input->cur) == '\n') {
328 ctxt->input->line++; ctxt->input->col = 1;
329 } else ctxt->input->col++;
330 ctxt->input->cur++;
331 ctxt->nbChars++;
332 if (*ctxt->input->cur == 0)
333 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
334 }
335 res++;
336 }
337 return(res);
338}
339
340
341
/************************************************************************
 *									*
 * 		The list of HTML elements and their properties		*
 *									*
 ************************************************************************/
347
348/*
349 * Start Tag: 1 means the start tag can be ommited
350 * End Tag: 1 means the end tag can be ommited
351 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000352 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000353 * Depr: this element is deprecated
354 * DTD: 1 means that this element is valid only in the Loose DTD
355 * 2 means that this element is valid only in the Frameset DTD
356 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000357 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
Daniel Veillard22090732001-07-16 00:06:07 +0000359static const htmlElemDesc
360html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000361{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
362{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
363{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
364{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
365{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
366{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
367{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
368{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
369{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
370{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
371{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
372{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
373{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
374{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
375{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
376{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
377{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
378{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
379{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
380{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
381{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
382{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
383{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
384{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
385{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
386{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
387{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
388{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
389{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
390{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
391{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
392{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
393{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
394{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
395{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
400{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
401{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
402{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
403{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
404{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
405{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
406{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
407{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
408{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
409{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
410{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
411{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
412{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
413{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
414{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
415{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
416{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
417{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
418{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
419{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
420{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
421{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
422{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
423{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
424{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
425{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
426{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
427{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
428{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
429{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
430{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
431{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
432{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
433{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
434{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
435{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
436{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
437{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
438{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
439{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
440{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
441{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
442{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
443{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
444{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
445{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
446{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
447{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
448{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
449{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
450{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
451{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000452};
453
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups: the first string of
 * a group is a start tag, the following strings are the open elements
 * which that start tag closes. An additional NULL ends the whole list.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL
};
513
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 * (NULL-terminated).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
527
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event attributes of HTML 4, including "onreset").
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
553
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/* The entry with a NULL name ends the table and holds the default value. */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000581
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000582static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000583static int htmlStartCloseIndexinitialized = 0;
584
585/************************************************************************
586 * *
587 * functions to handle HTML specific data *
588 * *
589 ************************************************************************/
590
591/**
592 * htmlInitAutoClose:
593 *
594 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
595 * This is not reentrant. Call xmlInitParser() once before processing in
596 * case of use in multithreaded programs.
597 */
598void
599htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000600 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000601
602 if (htmlStartCloseIndexinitialized) return;
603
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000604 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
605 indx = 0;
606 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
607 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000608 while (htmlStartClose[i] != NULL) i++;
609 i++;
610 }
611 htmlStartCloseIndexinitialized = 1;
612}
613
614/**
615 * htmlTagLookup:
616 * @tag: The tag name in lowercase
617 *
618 * Lookup the HTML tag in the ElementTable
619 *
620 * Returns the related htmlElemDescPtr or NULL if not found.
621 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000622const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000623htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000624 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000625
626 for (i = 0; i < (sizeof(html40ElementTable) /
627 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000628 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000629 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000630 }
631 return(NULL);
632}
633
634/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000635 * htmlGetEndPriority:
636 * @name: The name of the element to look up the priority for.
637 *
638 * Return value: The "endtag" priority.
639 **/
640static int
641htmlGetEndPriority (const xmlChar *name) {
642 int i = 0;
643
644 while ((htmlEndPriority[i].name != NULL) &&
645 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
646 i++;
647
648 return(htmlEndPriority[i].priority);
649}
650
651/**
Owen Taylor3473f882001-02-23 17:55:21 +0000652 * htmlCheckAutoClose:
653 * @newtag: The new tag name
654 * @oldtag: The old tag name
655 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000656 * Checks whether the new tag is one of the registered valid tags for
657 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000658 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
659 *
660 * Returns 0 if no, 1 if yes.
661 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000662static int
Owen Taylor3473f882001-02-23 17:55:21 +0000663htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000664 int i, indx;
665 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000666
667 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
668
669 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000670 for (indx = 0; indx < 100;indx++) {
671 closed = htmlStartCloseIndex[indx];
672 if (closed == NULL) return(0);
673 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000674 }
675
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000676 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000677 i++;
678 while (htmlStartClose[i] != NULL) {
679 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
680 return(1);
681 }
682 i++;
683 }
684 return(0);
685}
686
687/**
688 * htmlAutoCloseOnClose:
689 * @ctxt: an HTML parser context
690 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000691 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000692 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000693 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000694 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000695static void
Owen Taylor3473f882001-02-23 17:55:21 +0000696htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000697 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000698 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000699 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000700
701#ifdef DEBUG
702 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
703 for (i = 0;i < ctxt->nameNr;i++)
704 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
705#endif
706
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000707 priority = htmlGetEndPriority (newtag);
708
Owen Taylor3473f882001-02-23 17:55:21 +0000709 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000710
Owen Taylor3473f882001-02-23 17:55:21 +0000711 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000712 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000713 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000714 * or equal priority, so if we find an element with higher
715 * priority before we find an element with
716 * matching name, we just ignore this endtag
717 */
718 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000719 }
720 if (i < 0) return;
721
722 while (!xmlStrEqual(newtag, ctxt->name)) {
723 info = htmlTagLookup(ctxt->name);
724 if ((info == NULL) || (info->endTag == 1)) {
725#ifdef DEBUG
726 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
727#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000728 } else if (info->endTag == 3) {
729#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000730 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000731
Daniel Veillard56098d42001-04-24 12:51:09 +0000732#endif
733 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
734 ctxt->sax->error(ctxt->userData,
735 "Opening and ending tag mismatch: %s and %s\n",
736 newtag, ctxt->name);
737 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000738 }
739 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
740 ctxt->sax->endElement(ctxt->userData, ctxt->name);
741 oldname = htmlnamePop(ctxt);
742 if (oldname != NULL) {
743#ifdef DEBUG
744 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
745#endif
746 xmlFree(oldname);
747 }
748 }
749}
750
751/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000752 * htmlAutoCloseOnEnd:
753 * @ctxt: an HTML parser context
754 *
755 * Close all remaining tags at the end of the stream
756 */
757static void
758htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
759 xmlChar *oldname;
760 int i;
761
762 if (ctxt->nameNr == 0)
763 return;
764#ifdef DEBUG
765 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
766#endif
767
768 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
769#ifdef DEBUG
770 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
771#endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776#ifdef DEBUG
777 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
778#endif
779 xmlFree(oldname);
780 }
781 }
782}
783
784/**
Owen Taylor3473f882001-02-23 17:55:21 +0000785 * htmlAutoClose:
786 * @ctxt: an HTML parser context
787 * @newtag: The new tag name or NULL
788 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000789 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000790 * The list is kept in htmlStartClose array. This function is
791 * called when a new tag has been detected and generates the
792 * appropriates closes if possible/needed.
793 * If newtag is NULL this mean we are at the end of the resource
794 * and we should check
795 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000796static void
Owen Taylor3473f882001-02-23 17:55:21 +0000797htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
798 xmlChar *oldname;
799 while ((newtag != NULL) && (ctxt->name != NULL) &&
800 (htmlCheckAutoClose(newtag, ctxt->name))) {
801#ifdef DEBUG
802 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
803#endif
804 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
805 ctxt->sax->endElement(ctxt->userData, ctxt->name);
806 oldname = htmlnamePop(ctxt);
807 if (oldname != NULL) {
808#ifdef DEBUG
809 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
810#endif
811 xmlFree(oldname);
812 }
813 }
814 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000815 htmlAutoCloseOnEnd(ctxt);
816 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000817 }
818 while ((newtag == NULL) && (ctxt->name != NULL) &&
819 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
820 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
821 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
822#ifdef DEBUG
823 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
824#endif
825 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
826 ctxt->sax->endElement(ctxt->userData, ctxt->name);
827 oldname = htmlnamePop(ctxt);
828 if (oldname != NULL) {
829#ifdef DEBUG
830 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
831#endif
832 xmlFree(oldname);
833 }
834 }
835
836}
837
838/**
839 * htmlAutoCloseTag:
840 * @doc: the HTML document
841 * @name: The tag name
842 * @elem: the HTML element
843 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000844 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000845 * The list is kept in htmlStartClose array. This function checks
846 * if the element or one of it's children would autoclose the
847 * given tag.
848 *
849 * Returns 1 if autoclose, 0 otherwise
850 */
851int
852htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
853 htmlNodePtr child;
854
855 if (elem == NULL) return(1);
856 if (xmlStrEqual(name, elem->name)) return(0);
857 if (htmlCheckAutoClose(elem->name, name)) return(1);
858 child = elem->children;
859 while (child != NULL) {
860 if (htmlAutoCloseTag(doc, name, child)) return(1);
861 child = child->next;
862 }
863 return(0);
864}
865
866/**
867 * htmlIsAutoClosed:
868 * @doc: the HTML document
869 * @elem: the HTML element
870 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000871 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000872 * The list is kept in htmlStartClose array. This function checks
873 * if a tag is autoclosed by one of it's child
874 *
875 * Returns 1 if autoclosed, 0 otherwise
876 */
877int
878htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
879 htmlNodePtr child;
880
881 if (elem == NULL) return(1);
882 child = elem->children;
883 while (child != NULL) {
884 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
885 child = child->next;
886 }
887 return(0);
888}
889
890/**
891 * htmlCheckImplied:
892 * @ctxt: an HTML parser context
893 * @newtag: The new tag name
894 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000895 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000896 * called when a new tag has been detected and generates the
897 * appropriates implicit tags if missing
898 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000899static void
Owen Taylor3473f882001-02-23 17:55:21 +0000900htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
901 if (!htmlOmittedDefaultValue)
902 return;
903 if (xmlStrEqual(newtag, BAD_CAST"html"))
904 return;
905 if (ctxt->nameNr <= 0) {
906#ifdef DEBUG
907 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
908#endif
909 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
910 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
911 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
912 }
913 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
914 return;
915 if ((ctxt->nameNr <= 1) &&
916 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
917 (xmlStrEqual(newtag, BAD_CAST"style")) ||
918 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
919 (xmlStrEqual(newtag, BAD_CAST"link")) ||
920 (xmlStrEqual(newtag, BAD_CAST"title")) ||
921 (xmlStrEqual(newtag, BAD_CAST"base")))) {
922 /*
923 * dropped OBJECT ... i you put it first BODY will be
924 * assumed !
925 */
926#ifdef DEBUG
927 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
928#endif
929 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
930 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
931 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
932 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
933 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
934 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
935 int i;
936 for (i = 0;i < ctxt->nameNr;i++) {
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
938 return;
939 }
940 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
941 return;
942 }
943 }
944
945#ifdef DEBUG
946 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
947#endif
948 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
949 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
950 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
951 }
952}
953
954/**
955 * htmlCheckParagraph
956 * @ctxt: an HTML parser context
957 *
958 * Check whether a p element need to be implied before inserting
959 * characters in the current element.
960 *
961 * Returns 1 if a paragraph has been inserted, 0 if not and -1
962 * in case of error.
963 */
964
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000965static int
Owen Taylor3473f882001-02-23 17:55:21 +0000966htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
967 const xmlChar *tag;
968 int i;
969
970 if (ctxt == NULL)
971 return(-1);
972 tag = ctxt->name;
973 if (tag == NULL) {
974 htmlAutoClose(ctxt, BAD_CAST"p");
975 htmlCheckImplied(ctxt, BAD_CAST"p");
976 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
977 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
978 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
979 return(1);
980 }
981 if (!htmlOmittedDefaultValue)
982 return(0);
983 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
984 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
985#ifdef DEBUG
986 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
987#endif
988 htmlAutoClose(ctxt, BAD_CAST"p");
989 htmlCheckImplied(ctxt, BAD_CAST"p");
990 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
991 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
992 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
993 return(1);
994 }
995 }
996 return(0);
997}
998
999/**
1000 * htmlIsScriptAttribute:
1001 * @name: an attribute name
1002 *
1003 * Check if an attribute is of content type Script
1004 *
1005 * Returns 1 is the attribute is a script 0 otherwise
1006 */
1007int
1008htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001009 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001010
1011 if (name == NULL)
1012 return(0);
1013 /*
1014 * all script attributes start with 'on'
1015 */
1016 if ((name[0] != 'o') || (name[1] != 'n'))
1017 return(0);
1018 for (i = 0;
1019 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1020 i++) {
1021 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1022 return(1);
1023 }
1024 return(0);
1025}
1026
1027/************************************************************************
1028 * *
1029 * The list of HTML predefined entities *
1030 * *
1031 ************************************************************************/
1032
1033
Daniel Veillard22090732001-07-16 00:06:07 +00001034static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001035/*
1036 * the 4 absolute ones, plus apostrophe.
1037 */
1038{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1039{ 38, "amp", "ampersand, U+0026 ISOnum" },
1040{ 39, "apos", "single quote" },
1041{ 60, "lt", "less-than sign, U+003C ISOnum" },
1042{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1043
1044/*
1045 * A bunch still in the 128-255 range
1046 * Replacing them depend really on the charset used.
1047 */
1048{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1049{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1050{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1051{ 163, "pound","pound sign, U+00A3 ISOnum" },
1052{ 164, "curren","currency sign, U+00A4 ISOnum" },
1053{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1054{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1055{ 167, "sect", "section sign, U+00A7 ISOnum" },
1056{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1057{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1058{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1059{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1060{ 172, "not", "not sign, U+00AC ISOnum" },
1061{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1062{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1063{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1064{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1065{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1066{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1067{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1068{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1069{ 181, "micro","micro sign, U+00B5 ISOnum" },
1070{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1071{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1072{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1073{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1074{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1075{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1076{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1077{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1078{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1079{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1080{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1081{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1082{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1083{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1084{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1085{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1086{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1087{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1088{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1089{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1090{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1091{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1092{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1093{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1094{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1095{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1096{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1097{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1098{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1099{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1100{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1101{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1102{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1103{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1104{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1105{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1106{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1107{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1108{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1109{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1110{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1111{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1112{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1113{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1114{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1115{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1116{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1117{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1118{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1119{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1120{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1121{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1122{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1123{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1124{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1125{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1126{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1127{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1128{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1129{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1130{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1131{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1132{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1133{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1134{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1135{ 247, "divide","division sign, U+00F7 ISOnum" },
1136{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1137{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1138{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1139{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1140{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1141{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1142{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1143{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1144
1145{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1146{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1147{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1148{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1149{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1150
1151/*
1152 * Anything below should really be kept as entities references
1153 */
1154{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1155
1156{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1157{ 732, "tilde","small tilde, U+02DC ISOdia" },
1158
1159{ 913, "Alpha","greek capital letter alpha, U+0391" },
1160{ 914, "Beta", "greek capital letter beta, U+0392" },
1161{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1162{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1163{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1164{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1165{ 919, "Eta", "greek capital letter eta, U+0397" },
1166{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1167{ 921, "Iota", "greek capital letter iota, U+0399" },
1168{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001169{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001170{ 924, "Mu", "greek capital letter mu, U+039C" },
1171{ 925, "Nu", "greek capital letter nu, U+039D" },
1172{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1173{ 927, "Omicron","greek capital letter omicron, U+039F" },
1174{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1175{ 929, "Rho", "greek capital letter rho, U+03A1" },
1176{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1177{ 932, "Tau", "greek capital letter tau, U+03A4" },
1178{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1179{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1180{ 935, "Chi", "greek capital letter chi, U+03A7" },
1181{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1182{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1183
1184{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1185{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1186{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1187{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1188{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1189{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1190{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1191{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1192{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1193{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1194{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1195{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1196{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1197{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1198{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1199{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1200{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1201{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1202{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1203{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1204{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1205{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1206{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1207{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1208{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1209{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1210{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1211{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1212
1213{ 8194, "ensp", "en space, U+2002 ISOpub" },
1214{ 8195, "emsp", "em space, U+2003 ISOpub" },
1215{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1216{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1217{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1218{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1219{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1220{ 8211, "ndash","en dash, U+2013 ISOpub" },
1221{ 8212, "mdash","em dash, U+2014 ISOpub" },
1222{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1223{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1224{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1225{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1226{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1227{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1228{ 8224, "dagger","dagger, U+2020 ISOpub" },
1229{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1230
1231{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1232{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1233
1234{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1235
1236{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1237{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1238
1239{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1240{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1241
1242{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1243{ 8260, "frasl","fraction slash, U+2044 NEW" },
1244
1245{ 8364, "euro", "euro sign, U+20AC NEW" },
1246
1247{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1248{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1249{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1250{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1251{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1252{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1253{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1254{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1255{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1256{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1257{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1258{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1259{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1260{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1261{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1262{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1263
1264{ 8704, "forall","for all, U+2200 ISOtech" },
1265{ 8706, "part", "partial differential, U+2202 ISOtech" },
1266{ 8707, "exist","there exists, U+2203 ISOtech" },
1267{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1268{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1269{ 8712, "isin", "element of, U+2208 ISOtech" },
1270{ 8713, "notin","not an element of, U+2209 ISOtech" },
1271{ 8715, "ni", "contains as member, U+220B ISOtech" },
1272{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001273{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001274{ 8722, "minus","minus sign, U+2212 ISOtech" },
1275{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1276{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1277{ 8733, "prop", "proportional to, U+221D ISOtech" },
1278{ 8734, "infin","infinity, U+221E ISOtech" },
1279{ 8736, "ang", "angle, U+2220 ISOamso" },
1280{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1281{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1282{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1283{ 8746, "cup", "union = cup, U+222A ISOtech" },
1284{ 8747, "int", "integral, U+222B ISOtech" },
1285{ 8756, "there4","therefore, U+2234 ISOtech" },
1286{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1287{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1288{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1289{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1290{ 8801, "equiv","identical to, U+2261 ISOtech" },
1291{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1292{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1293{ 8834, "sub", "subset of, U+2282 ISOtech" },
1294{ 8835, "sup", "superset of, U+2283 ISOtech" },
1295{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1296{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1297{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1298{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1299{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1300{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1301{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1302{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1303{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1304{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1305{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1306{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1307{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1308{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1309
1310{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1311{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1312{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1313{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1314
1315};
1316
/************************************************************************
 *									*
 *		Convenience functions to handle entities		*
 *									*
 ************************************************************************/
1322
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The result of xmlRealloc() goes through a temporary so that, when the
 * reallocation fails, the previous block is still reachable and can be
 * released instead of being leaked.  On failure an error is reported
 * and the *enclosing function* returns NULL, so this macro may only be
 * used inside functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                buffer##_size * sizeof(xmlChar));	\
    if (growBuffer_tmp == NULL) {					\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1334
1335/**
1336 * htmlEntityLookup:
1337 * @name: the entity name
1338 *
1339 * Lookup the given entity in EntitiesTable
1340 *
1341 * TODO: the linear scan is really ugly, an hash table is really needed.
1342 *
1343 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1344 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001345const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001346htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001347 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001348
1349 for (i = 0;i < (sizeof(html40EntitiesTable)/
1350 sizeof(html40EntitiesTable[0]));i++) {
1351 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1352#ifdef DEBUG
1353 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1354#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001355 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001356 }
1357 }
1358 return(NULL);
1359}
1360
1361/**
1362 * htmlEntityValueLookup:
1363 * @value: the entity's unicode value
1364 *
1365 * Lookup the given entity in EntitiesTable
1366 *
1367 * TODO: the linear scan is really ugly, an hash table is really needed.
1368 *
1369 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1370 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001371const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001372htmlEntityValueLookup(unsigned int value) {
1373 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001374#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001375 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001376#endif
1377
1378 for (i = 0;i < (sizeof(html40EntitiesTable)/
1379 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001380 if (html40EntitiesTable[i].value >= value) {
1381 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001382 break;
1383#ifdef DEBUG
1384 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1385#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001386 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001387 }
1388#ifdef DEBUG
1389 if (lv > html40EntitiesTable[i].value) {
1390 xmlGenericError(xmlGenericErrorContext,
1391 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1392 lv, html40EntitiesTable[i].value);
1393 }
1394 lv = html40EntitiesTable[i].value;
1395#endif
1396 }
1397 return(NULL);
1398}
1399
1400/**
1401 * UTF8ToHtml:
1402 * @out: a pointer to an array of bytes to store the result
1403 * @outlen: the length of @out
1404 * @in: a pointer to an array of UTF-8 chars
1405 * @inlen: the length of @in
1406 *
1407 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1408 * plus HTML entities block of chars out.
1409 *
1410 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1411 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001412 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001413 * The value of @outlen after return is the number of octets consumed.
1414 */
1415int
1416UTF8ToHtml(unsigned char* out, int *outlen,
1417 const unsigned char* in, int *inlen) {
1418 const unsigned char* processed = in;
1419 const unsigned char* outend;
1420 const unsigned char* outstart = out;
1421 const unsigned char* instart = in;
1422 const unsigned char* inend;
1423 unsigned int c, d;
1424 int trailing;
1425
1426 if (in == NULL) {
1427 /*
1428 * initialization nothing to do
1429 */
1430 *outlen = 0;
1431 *inlen = 0;
1432 return(0);
1433 }
1434 inend = in + (*inlen);
1435 outend = out + (*outlen);
1436 while (in < inend) {
1437 d = *in++;
1438 if (d < 0x80) { c= d; trailing= 0; }
1439 else if (d < 0xC0) {
1440 /* trailing byte in leading position */
1441 *outlen = out - outstart;
1442 *inlen = processed - instart;
1443 return(-2);
1444 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1445 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1446 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1447 else {
1448 /* no chance for this in Ascii */
1449 *outlen = out - outstart;
1450 *inlen = processed - instart;
1451 return(-2);
1452 }
1453
1454 if (inend - in < trailing) {
1455 break;
1456 }
1457
1458 for ( ; trailing; trailing--) {
1459 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1460 break;
1461 c <<= 6;
1462 c |= d & 0x3F;
1463 }
1464
1465 /* assertion: c is a single UTF-4 value */
1466 if (c < 0x80) {
1467 if (out + 1 >= outend)
1468 break;
1469 *out++ = c;
1470 } else {
1471 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001472 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001473
1474 /*
1475 * Try to lookup a predefined HTML entity for it
1476 */
1477
1478 ent = htmlEntityValueLookup(c);
1479 if (ent == NULL) {
1480 /* no chance for this in Ascii */
1481 *outlen = out - outstart;
1482 *inlen = processed - instart;
1483 return(-2);
1484 }
1485 len = strlen(ent->name);
1486 if (out + 2 + len >= outend)
1487 break;
1488 *out++ = '&';
1489 memcpy(out, ent->name, len);
1490 out += len;
1491 *out++ = ';';
1492 }
1493 processed = in;
1494 }
1495 *outlen = out - outstart;
1496 *inlen = processed - instart;
1497 return(0);
1498}
1499
1500/**
1501 * htmlEncodeEntities:
1502 * @out: a pointer to an array of bytes to store the result
1503 * @outlen: the length of @out
1504 * @in: a pointer to an array of UTF-8 chars
1505 * @inlen: the length of @in
1506 * @quoteChar: the quote character to escape (' or ") or zero.
1507 *
1508 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1509 * plus HTML entities block of chars out.
1510 *
1511 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1512 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001513 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001514 * The value of @outlen after return is the number of octets consumed.
1515 */
1516int
1517htmlEncodeEntities(unsigned char* out, int *outlen,
1518 const unsigned char* in, int *inlen, int quoteChar) {
1519 const unsigned char* processed = in;
1520 const unsigned char* outend = out + (*outlen);
1521 const unsigned char* outstart = out;
1522 const unsigned char* instart = in;
1523 const unsigned char* inend = in + (*inlen);
1524 unsigned int c, d;
1525 int trailing;
1526
1527 while (in < inend) {
1528 d = *in++;
1529 if (d < 0x80) { c= d; trailing= 0; }
1530 else if (d < 0xC0) {
1531 /* trailing byte in leading position */
1532 *outlen = out - outstart;
1533 *inlen = processed - instart;
1534 return(-2);
1535 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1536 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1537 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1538 else {
1539 /* no chance for this in Ascii */
1540 *outlen = out - outstart;
1541 *inlen = processed - instart;
1542 return(-2);
1543 }
1544
1545 if (inend - in < trailing)
1546 break;
1547
1548 while (trailing--) {
1549 if (((d= *in++) & 0xC0) != 0x80) {
1550 *outlen = out - outstart;
1551 *inlen = processed - instart;
1552 return(-2);
1553 }
1554 c <<= 6;
1555 c |= d & 0x3F;
1556 }
1557
1558 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001559 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1560 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001561 if (out >= outend)
1562 break;
1563 *out++ = c;
1564 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001565 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001566 const char *cp;
1567 char nbuf[16];
1568 int len;
1569
1570 /*
1571 * Try to lookup a predefined HTML entity for it
1572 */
1573 ent = htmlEntityValueLookup(c);
1574 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001575 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001576 cp = nbuf;
1577 }
1578 else
1579 cp = ent->name;
1580 len = strlen(cp);
1581 if (out + 2 + len > outend)
1582 break;
1583 *out++ = '&';
1584 memcpy(out, cp, len);
1585 out += len;
1586 *out++ = ';';
1587 }
1588 processed = in;
1589 }
1590 *outlen = out - outstart;
1591 *inlen = processed - instart;
1592 return(0);
1593}
1594
1595/**
1596 * htmlDecodeEntities:
1597 * @ctxt: the parser context
1598 * @len: the len to decode (in bytes !), -1 for no size limit
1599 * @end: an end marker xmlChar, 0 if none
1600 * @end2: an end marker xmlChar, 0 if none
1601 * @end3: an end marker xmlChar, 0 if none
1602 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001603 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001604 *
1605 * DEPRECATED !!!!
1606 *
1607 * Returns A newly allocated string with the substitution done. The caller
1608 * must deallocate it !
1609 */
1610xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001611htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1612 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001613 static int deprecated = 0;
1614 if (!deprecated) {
1615 xmlGenericError(xmlGenericErrorContext,
1616 "htmlDecodeEntities() deprecated function reached\n");
1617 deprecated = 1;
1618 }
1619 return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001620}
1621
1622/************************************************************************
1623 * *
1624 * Commodity functions to handle streams *
1625 * *
1626 ************************************************************************/
1627
1628/**
Owen Taylor3473f882001-02-23 17:55:21 +00001629 * htmlNewInputStream:
1630 * @ctxt: an HTML parser context
1631 *
1632 * Create a new input stream structure
1633 * Returns the new input stream or NULL
1634 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001635static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001636htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1637 htmlParserInputPtr input;
1638
1639 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1640 if (input == NULL) {
1641 ctxt->errNo = XML_ERR_NO_MEMORY;
1642 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1643 ctxt->sax->error(ctxt->userData,
1644 "malloc: couldn't allocate a new input stream\n");
1645 return(NULL);
1646 }
1647 memset(input, 0, sizeof(htmlParserInput));
1648 input->filename = NULL;
1649 input->directory = NULL;
1650 input->base = NULL;
1651 input->cur = NULL;
1652 input->buf = NULL;
1653 input->line = 1;
1654 input->col = 1;
1655 input->buf = NULL;
1656 input->free = NULL;
1657 input->version = NULL;
1658 input->consumed = 0;
1659 input->length = 0;
1660 return(input);
1661}
1662
1663
1664/************************************************************************
1665 * *
1666 * Commodity functions, cleanup needed ? *
1667 * *
1668 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but I don't want to risk
 *       any binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001683
1684/**
1685 * areBlanks:
1686 * @ctxt: an HTML parser context
1687 * @str: a xmlChar *
1688 * @len: the size of @str
1689 *
1690 * Is this a sequence of blank chars that one can ignore ?
1691 *
1692 * Returns 1 if ignorable 0 otherwise.
1693 */
1694
1695static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001696 unsigned int i;
1697 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00001698 xmlNodePtr lastChild;
1699
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001700 for (j = 0;j < len;j++)
1701 if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001702
1703 if (CUR == 0) return(1);
1704 if (CUR != '<') return(0);
1705 if (ctxt->name == NULL)
1706 return(1);
1707 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1708 return(1);
1709 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1710 return(1);
1711 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1712 return(1);
1713 if (ctxt->node == NULL) return(0);
1714 lastChild = xmlGetLastChild(ctxt->node);
1715 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001716 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1717 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001718 /* keep ws in constructs like ...<b> </b>...
1719 for all tags "b" allowing PCDATA */
1720 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1721 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1722 return(0);
1723 }
1724 }
Owen Taylor3473f882001-02-23 17:55:21 +00001725 } else if (xmlNodeIsText(lastChild)) {
1726 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00001727 } else {
1728 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1729 for all tags "p" allowing PCDATA */
1730 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1731 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1732 return(0);
1733 }
1734 }
Owen Taylor3473f882001-02-23 17:55:21 +00001735 }
1736 return(1);
1737}
1738
1739/**
Owen Taylor3473f882001-02-23 17:55:21 +00001740 * htmlNewDocNoDtD:
1741 * @URI: URI for the dtd, or NULL
1742 * @ExternalID: the external ID of the DTD, or NULL
1743 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001744 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1745 * are NULL
1746 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001747 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001748 */
1749htmlDocPtr
1750htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1751 xmlDocPtr cur;
1752
1753 /*
1754 * Allocate a new document and fill the fields.
1755 */
1756 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1757 if (cur == NULL) {
1758 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001759 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001760 return(NULL);
1761 }
1762 memset(cur, 0, sizeof(xmlDoc));
1763
1764 cur->type = XML_HTML_DOCUMENT_NODE;
1765 cur->version = NULL;
1766 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001767 cur->doc = cur;
1768 cur->name = NULL;
1769 cur->children = NULL;
1770 cur->extSubset = NULL;
1771 cur->oldNs = NULL;
1772 cur->encoding = NULL;
1773 cur->standalone = 1;
1774 cur->compression = 0;
1775 cur->ids = NULL;
1776 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001777 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001778 if ((ExternalID != NULL) ||
1779 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001780 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001781 return(cur);
1782}
1783
1784/**
1785 * htmlNewDoc:
1786 * @URI: URI for the dtd, or NULL
1787 * @ExternalID: the external ID of the DTD, or NULL
1788 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001789 * Creates a new HTML document
1790 *
Owen Taylor3473f882001-02-23 17:55:21 +00001791 * Returns a new document
1792 */
1793htmlDocPtr
1794htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1795 if ((URI == NULL) && (ExternalID == NULL))
1796 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001797 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1798 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001799
1800 return(htmlNewDocNoDtD(URI, ExternalID));
1801}
1802
1803
1804/************************************************************************
1805 * *
1806 * The parser itself *
1807 * Relates to http://www.w3.org/TR/html40 *
1808 * *
1809 ************************************************************************/
1810
1811/************************************************************************
1812 * *
1813 * The parser itself *
1814 * *
1815 ************************************************************************/
1816
1817/**
1818 * htmlParseHTMLName:
1819 * @ctxt: an HTML parser context
1820 *
1821 * parse an HTML tag or attribute name, note that we convert it to lowercase
1822 * since HTML names are not case-sensitive.
1823 *
1824 * Returns the Tag Name parsed or NULL
1825 */
1826
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001827static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001828htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1829 xmlChar *ret = NULL;
1830 int i = 0;
1831 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1832
1833 if (!IS_LETTER(CUR) && (CUR != '_') &&
1834 (CUR != ':')) return(NULL);
1835
1836 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1837 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1838 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1839 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1840 else loc[i] = CUR;
1841 i++;
1842
1843 NEXT;
1844 }
1845
1846 ret = xmlStrndup(loc, i);
1847
1848 return(ret);
1849}
1850
1851/**
1852 * htmlParseName:
1853 * @ctxt: an HTML parser context
1854 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001855 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001856 *
1857 * Returns the Name parsed or NULL
1858 */
1859
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001860static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001861htmlParseName(htmlParserCtxtPtr ctxt) {
1862 xmlChar buf[HTML_MAX_NAMELEN];
1863 int len = 0;
1864
1865 GROW;
1866 if (!IS_LETTER(CUR) && (CUR != '_')) {
1867 return(NULL);
1868 }
1869
1870 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1871 (CUR == '.') || (CUR == '-') ||
1872 (CUR == '_') || (CUR == ':') ||
1873 (IS_COMBINING(CUR)) ||
1874 (IS_EXTENDER(CUR))) {
1875 buf[len++] = CUR;
1876 NEXT;
1877 if (len >= HTML_MAX_NAMELEN) {
1878 xmlGenericError(xmlGenericErrorContext,
1879 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1880 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1881 (CUR == '.') || (CUR == '-') ||
1882 (CUR == '_') || (CUR == ':') ||
1883 (IS_COMBINING(CUR)) ||
1884 (IS_EXTENDER(CUR)))
1885 NEXT;
1886 break;
1887 }
1888 }
1889 return(xmlStrndup(buf, len));
1890}
1891
1892/**
1893 * htmlParseHTMLAttribute:
1894 * @ctxt: an HTML parser context
1895 * @stop: a char stop value
1896 *
1897 * parse an HTML attribute value till the stop (quote), if
1898 * stop is 0 then it stops at the first space
1899 *
1900 * Returns the attribute parsed or NULL
1901 */
1902
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001903static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001904htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1905 xmlChar *buffer = NULL;
1906 int buffer_size = 0;
1907 xmlChar *out = NULL;
1908 xmlChar *name = NULL;
1909
1910 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001911 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001912
1913 /*
1914 * allocate a translation buffer.
1915 */
1916 buffer_size = HTML_PARSER_BUFFER_SIZE;
1917 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1918 if (buffer == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00001919 xmlGenericError(xmlGenericErrorContext,
1920 "htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001921 return(NULL);
1922 }
1923 out = buffer;
1924
1925 /*
1926 * Ok loop until we reach one of the ending chars
1927 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00001928 while ((CUR != 0) && (CUR != stop)) {
1929 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001930 if ((stop == 0) && (IS_BLANK(CUR))) break;
1931 if (CUR == '&') {
1932 if (NXT(1) == '#') {
1933 unsigned int c;
1934 int bits;
1935
1936 c = htmlParseCharRef(ctxt);
1937 if (c < 0x80)
1938 { *out++ = c; bits= -6; }
1939 else if (c < 0x800)
1940 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1941 else if (c < 0x10000)
1942 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1943 else
1944 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1945
1946 for ( ; bits >= 0; bits-= 6) {
1947 *out++ = ((c >> bits) & 0x3F) | 0x80;
1948 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00001949
1950 if (out - buffer > buffer_size - 100) {
1951 int indx = out - buffer;
1952
1953 growBuffer(buffer);
1954 out = &buffer[indx];
1955 }
Owen Taylor3473f882001-02-23 17:55:21 +00001956 } else {
1957 ent = htmlParseEntityRef(ctxt, &name);
1958 if (name == NULL) {
1959 *out++ = '&';
1960 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001961 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001962
1963 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001964 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001965 }
1966 } else if (ent == NULL) {
1967 *out++ = '&';
1968 cur = name;
1969 while (*cur != 0) {
1970 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001971 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001972
1973 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001974 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001975 }
1976 *out++ = *cur++;
1977 }
1978 xmlFree(name);
1979 } else {
1980 unsigned int c;
1981 int bits;
1982
1983 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001985
1986 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001987 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001988 }
1989 c = (xmlChar)ent->value;
1990 if (c < 0x80)
1991 { *out++ = c; bits= -6; }
1992 else if (c < 0x800)
1993 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1994 else if (c < 0x10000)
1995 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1996 else
1997 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1998
1999 for ( ; bits >= 0; bits-= 6) {
2000 *out++ = ((c >> bits) & 0x3F) | 0x80;
2001 }
2002 xmlFree(name);
2003 }
2004 }
2005 } else {
2006 unsigned int c;
2007 int bits, l;
2008
2009 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002010 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002011
2012 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002013 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002014 }
2015 c = CUR_CHAR(l);
2016 if (c < 0x80)
2017 { *out++ = c; bits= -6; }
2018 else if (c < 0x800)
2019 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2020 else if (c < 0x10000)
2021 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2022 else
2023 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2024
2025 for ( ; bits >= 0; bits-= 6) {
2026 *out++ = ((c >> bits) & 0x3F) | 0x80;
2027 }
2028 NEXT;
2029 }
2030 }
2031 *out++ = 0;
2032 return(buffer);
2033}
2034
2035/**
Owen Taylor3473f882001-02-23 17:55:21 +00002036 * htmlParseEntityRef:
2037 * @ctxt: an HTML parser context
2038 * @str: location to store the entity name
2039 *
2040 * parse an HTML ENTITY references
2041 *
2042 * [68] EntityRef ::= '&' Name ';'
2043 *
2044 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2045 * if non-NULL *str will have to be freed by the caller.
2046 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002047const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002048htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2049 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002050 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002051 *str = NULL;
2052
2053 if (CUR == '&') {
2054 NEXT;
2055 name = htmlParseName(ctxt);
2056 if (name == NULL) {
2057 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2058 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2059 ctxt->wellFormed = 0;
2060 } else {
2061 GROW;
2062 if (CUR == ';') {
2063 *str = name;
2064
2065 /*
2066 * Lookup the entity in the table.
2067 */
2068 ent = htmlEntityLookup(name);
2069 if (ent != NULL) /* OK that's ugly !!! */
2070 NEXT;
2071 } else {
2072 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2073 ctxt->sax->error(ctxt->userData,
2074 "htmlParseEntityRef: expecting ';'\n");
2075 *str = name;
2076 }
2077 }
2078 }
2079 return(ent);
2080}
2081
2082/**
2083 * htmlParseAttValue:
2084 * @ctxt: an HTML parser context
2085 *
2086 * parse a value for an attribute
2087 * Note: the parser won't do substitution of entities here, this
2088 * will be handled later in xmlStringGetNodeList, unless it was
2089 * asked for ctxt->replaceEntities != 0
2090 *
2091 * Returns the AttValue parsed or NULL.
2092 */
2093
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002094static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002095htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2096 xmlChar *ret = NULL;
2097
2098 if (CUR == '"') {
2099 NEXT;
2100 ret = htmlParseHTMLAttribute(ctxt, '"');
2101 if (CUR != '"') {
2102 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2103 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2104 ctxt->wellFormed = 0;
2105 } else
2106 NEXT;
2107 } else if (CUR == '\'') {
2108 NEXT;
2109 ret = htmlParseHTMLAttribute(ctxt, '\'');
2110 if (CUR != '\'') {
2111 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2112 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2113 ctxt->wellFormed = 0;
2114 } else
2115 NEXT;
2116 } else {
2117 /*
2118 * That's an HTMLism, the attribute value may not be quoted
2119 */
2120 ret = htmlParseHTMLAttribute(ctxt, 0);
2121 if (ret == NULL) {
2122 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2123 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2124 ctxt->wellFormed = 0;
2125 }
2126 }
2127 return(ret);
2128}
2129
2130/**
2131 * htmlParseSystemLiteral:
2132 * @ctxt: an HTML parser context
2133 *
2134 * parse an HTML Literal
2135 *
2136 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2137 *
2138 * Returns the SystemLiteral parsed or NULL
2139 */
2140
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002141static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002142htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2143 const xmlChar *q;
2144 xmlChar *ret = NULL;
2145
2146 if (CUR == '"') {
2147 NEXT;
2148 q = CUR_PTR;
2149 while ((IS_CHAR(CUR)) && (CUR != '"'))
2150 NEXT;
2151 if (!IS_CHAR(CUR)) {
2152 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2153 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2154 ctxt->wellFormed = 0;
2155 } else {
2156 ret = xmlStrndup(q, CUR_PTR - q);
2157 NEXT;
2158 }
2159 } else if (CUR == '\'') {
2160 NEXT;
2161 q = CUR_PTR;
2162 while ((IS_CHAR(CUR)) && (CUR != '\''))
2163 NEXT;
2164 if (!IS_CHAR(CUR)) {
2165 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2166 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2167 ctxt->wellFormed = 0;
2168 } else {
2169 ret = xmlStrndup(q, CUR_PTR - q);
2170 NEXT;
2171 }
2172 } else {
2173 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2174 ctxt->sax->error(ctxt->userData,
2175 "SystemLiteral \" or ' expected\n");
2176 ctxt->wellFormed = 0;
2177 }
2178
2179 return(ret);
2180}
2181
2182/**
2183 * htmlParsePubidLiteral:
2184 * @ctxt: an HTML parser context
2185 *
2186 * parse an HTML public literal
2187 *
2188 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2189 *
2190 * Returns the PubidLiteral parsed or NULL.
2191 */
2192
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002193static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002194htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2195 const xmlChar *q;
2196 xmlChar *ret = NULL;
2197 /*
2198 * Name ::= (Letter | '_') (NameChar)*
2199 */
2200 if (CUR == '"') {
2201 NEXT;
2202 q = CUR_PTR;
2203 while (IS_PUBIDCHAR(CUR)) NEXT;
2204 if (CUR != '"') {
2205 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2206 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2207 ctxt->wellFormed = 0;
2208 } else {
2209 ret = xmlStrndup(q, CUR_PTR - q);
2210 NEXT;
2211 }
2212 } else if (CUR == '\'') {
2213 NEXT;
2214 q = CUR_PTR;
2215 while ((IS_LETTER(CUR)) && (CUR != '\''))
2216 NEXT;
2217 if (!IS_LETTER(CUR)) {
2218 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2219 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2220 ctxt->wellFormed = 0;
2221 } else {
2222 ret = xmlStrndup(q, CUR_PTR - q);
2223 NEXT;
2224 }
2225 } else {
2226 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2227 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2228 ctxt->wellFormed = 0;
2229 }
2230
2231 return(ret);
2232}
2233
2234/**
2235 * htmlParseScript:
2236 * @ctxt: an HTML parser context
2237 *
2238 * parse the content of an HTML SCRIPT or STYLE element
2239 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2240 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2241 * http://www.w3.org/TR/html4/types.html#type-script
2242 * http://www.w3.org/TR/html4/types.html#h-6.15
2243 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2244 *
2245 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2246 * element and the value of intrinsic event attributes. User agents must
2247 * not evaluate script data as HTML markup but instead must pass it on as
2248 * data to a script engine.
2249 * NOTES:
2250 * - The content is passed like CDATA
2251 * - the attributes for style and scripting "onXXX" are also described
2252 * as CDATA but SGML allows entities references in attributes so their
2253 * processing is identical as other attributes
2254 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002255static void
Owen Taylor3473f882001-02-23 17:55:21 +00002256htmlParseScript(htmlParserCtxtPtr ctxt) {
2257 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2258 int nbchar = 0;
2259 xmlChar cur;
2260
2261 SHRINK;
2262 cur = CUR;
2263 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002264 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2265 (NXT(3) == '-')) {
2266 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2267 if (ctxt->sax->cdataBlock!= NULL) {
2268 /*
2269 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2270 */
2271 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2272 }
2273 }
2274 nbchar = 0;
2275 htmlParseComment(ctxt);
2276 cur = CUR;
2277 continue;
2278 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002279 /*
2280 * One should break here, the specification is clear:
2281 * Authors should therefore escape "</" within the content.
2282 * Escape mechanisms are specific to each scripting or
2283 * style sheet language.
2284 */
2285 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2286 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2287 break; /* while */
2288 }
2289 buf[nbchar++] = cur;
2290 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2291 if (ctxt->sax->cdataBlock!= NULL) {
2292 /*
2293 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2294 */
2295 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2296 }
2297 nbchar = 0;
2298 }
2299 NEXT;
2300 cur = CUR;
2301 }
2302 if (!(IS_CHAR(cur))) {
2303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2304 ctxt->sax->error(ctxt->userData,
2305 "Invalid char in CDATA 0x%X\n", cur);
2306 ctxt->wellFormed = 0;
2307 NEXT;
2308 }
2309
2310 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2311 if (ctxt->sax->cdataBlock!= NULL) {
2312 /*
2313 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2314 */
2315 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2316 }
2317 }
2318}
2319
2320
2321/**
2322 * htmlParseCharData:
2323 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002324 *
2325 * parse a CharData section.
2326 * if we are within a CDATA section ']]>' marks an end of section.
2327 *
2328 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2329 */
2330
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002331static void
2332htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002333 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2334 int nbchar = 0;
2335 int cur, l;
2336
2337 SHRINK;
2338 cur = CUR_CHAR(l);
2339 while (((cur != '<') || (ctxt->token == '<')) &&
2340 ((cur != '&') || (ctxt->token == '&')) &&
2341 (IS_CHAR(cur))) {
2342 COPY_BUF(l,buf,nbchar,cur);
2343 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2344 /*
2345 * Ok the segment is to be consumed as chars.
2346 */
2347 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2348 if (areBlanks(ctxt, buf, nbchar)) {
2349 if (ctxt->sax->ignorableWhitespace != NULL)
2350 ctxt->sax->ignorableWhitespace(ctxt->userData,
2351 buf, nbchar);
2352 } else {
2353 htmlCheckParagraph(ctxt);
2354 if (ctxt->sax->characters != NULL)
2355 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2356 }
2357 }
2358 nbchar = 0;
2359 }
2360 NEXTL(l);
2361 cur = CUR_CHAR(l);
2362 }
2363 if (nbchar != 0) {
2364 /*
2365 * Ok the segment is to be consumed as chars.
2366 */
2367 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2368 if (areBlanks(ctxt, buf, nbchar)) {
2369 if (ctxt->sax->ignorableWhitespace != NULL)
2370 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2371 } else {
2372 htmlCheckParagraph(ctxt);
2373 if (ctxt->sax->characters != NULL)
2374 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2375 }
2376 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002377 } else {
2378 /*
2379 * Loop detection
2380 */
2381 if (cur == 0)
2382 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002383 }
2384}
2385
2386/**
2387 * htmlParseExternalID:
2388 * @ctxt: an HTML parser context
2389 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002390 *
2391 * Parse an External ID or a Public ID
2392 *
Owen Taylor3473f882001-02-23 17:55:21 +00002393 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2394 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2395 *
2396 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2397 *
2398 * Returns the function returns SystemLiteral and in the second
2399 * case publicID receives PubidLiteral, is strict is off
2400 * it is possible to return NULL and have publicID set.
2401 */
2402
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002403static xmlChar *
2404htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002405 xmlChar *URI = NULL;
2406
2407 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2408 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2409 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2410 SKIP(6);
2411 if (!IS_BLANK(CUR)) {
2412 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2413 ctxt->sax->error(ctxt->userData,
2414 "Space required after 'SYSTEM'\n");
2415 ctxt->wellFormed = 0;
2416 }
2417 SKIP_BLANKS;
2418 URI = htmlParseSystemLiteral(ctxt);
2419 if (URI == NULL) {
2420 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2421 ctxt->sax->error(ctxt->userData,
2422 "htmlParseExternalID: SYSTEM, no URI\n");
2423 ctxt->wellFormed = 0;
2424 }
2425 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2426 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2427 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2428 SKIP(6);
2429 if (!IS_BLANK(CUR)) {
2430 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2431 ctxt->sax->error(ctxt->userData,
2432 "Space required after 'PUBLIC'\n");
2433 ctxt->wellFormed = 0;
2434 }
2435 SKIP_BLANKS;
2436 *publicID = htmlParsePubidLiteral(ctxt);
2437 if (*publicID == NULL) {
2438 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2439 ctxt->sax->error(ctxt->userData,
2440 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2441 ctxt->wellFormed = 0;
2442 }
2443 SKIP_BLANKS;
2444 if ((CUR == '"') || (CUR == '\'')) {
2445 URI = htmlParseSystemLiteral(ctxt);
2446 }
2447 }
2448 return(URI);
2449}
2450
2451/**
2452 * htmlParseComment:
2453 * @ctxt: an HTML parser context
2454 *
2455 * Parse an XML (SGML) comment <!-- .... -->
2456 *
2457 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2458 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002459static void
Owen Taylor3473f882001-02-23 17:55:21 +00002460htmlParseComment(htmlParserCtxtPtr ctxt) {
2461 xmlChar *buf = NULL;
2462 int len;
2463 int size = HTML_PARSER_BUFFER_SIZE;
2464 int q, ql;
2465 int r, rl;
2466 int cur, l;
2467 xmlParserInputState state;
2468
2469 /*
2470 * Check that there is a comment right here.
2471 */
2472 if ((RAW != '<') || (NXT(1) != '!') ||
2473 (NXT(2) != '-') || (NXT(3) != '-')) return;
2474
2475 state = ctxt->instate;
2476 ctxt->instate = XML_PARSER_COMMENT;
2477 SHRINK;
2478 SKIP(4);
2479 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2480 if (buf == NULL) {
2481 xmlGenericError(xmlGenericErrorContext,
2482 "malloc of %d byte failed\n", size);
2483 ctxt->instate = state;
2484 return;
2485 }
2486 q = CUR_CHAR(ql);
2487 NEXTL(ql);
2488 r = CUR_CHAR(rl);
2489 NEXTL(rl);
2490 cur = CUR_CHAR(l);
2491 len = 0;
2492 while (IS_CHAR(cur) &&
2493 ((cur != '>') ||
2494 (r != '-') || (q != '-'))) {
2495 if (len + 5 >= size) {
2496 size *= 2;
2497 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2498 if (buf == NULL) {
2499 xmlGenericError(xmlGenericErrorContext,
2500 "realloc of %d byte failed\n", size);
2501 ctxt->instate = state;
2502 return;
2503 }
2504 }
2505 COPY_BUF(ql,buf,len,q);
2506 q = r;
2507 ql = rl;
2508 r = cur;
2509 rl = l;
2510 NEXTL(l);
2511 cur = CUR_CHAR(l);
2512 if (cur == 0) {
2513 SHRINK;
2514 GROW;
2515 cur = CUR_CHAR(l);
2516 }
2517 }
2518 buf[len] = 0;
2519 if (!IS_CHAR(cur)) {
2520 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2521 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2522 ctxt->sax->error(ctxt->userData,
2523 "Comment not terminated \n<!--%.50s\n", buf);
2524 ctxt->wellFormed = 0;
2525 xmlFree(buf);
2526 } else {
2527 NEXT;
2528 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2529 (!ctxt->disableSAX))
2530 ctxt->sax->comment(ctxt->userData, buf);
2531 xmlFree(buf);
2532 }
2533 ctxt->instate = state;
2534}
2535
2536/**
2537 * htmlParseCharRef:
2538 * @ctxt: an HTML parser context
2539 *
2540 * parse Reference declarations
2541 *
2542 * [66] CharRef ::= '&#' [0-9]+ ';' |
2543 * '&#x' [0-9a-fA-F]+ ';'
2544 *
2545 * Returns the value parsed (as an int)
2546 */
2547int
2548htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2549 int val = 0;
2550
2551 if ((CUR == '&') && (NXT(1) == '#') &&
2552 (NXT(2) == 'x')) {
2553 SKIP(3);
2554 while (CUR != ';') {
2555 if ((CUR >= '0') && (CUR <= '9'))
2556 val = val * 16 + (CUR - '0');
2557 else if ((CUR >= 'a') && (CUR <= 'f'))
2558 val = val * 16 + (CUR - 'a') + 10;
2559 else if ((CUR >= 'A') && (CUR <= 'F'))
2560 val = val * 16 + (CUR - 'A') + 10;
2561 else {
2562 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2563 ctxt->sax->error(ctxt->userData,
2564 "htmlParseCharRef: invalid hexadecimal value\n");
2565 ctxt->wellFormed = 0;
2566 return(0);
2567 }
2568 NEXT;
2569 }
2570 if (CUR == ';')
2571 NEXT;
2572 } else if ((CUR == '&') && (NXT(1) == '#')) {
2573 SKIP(2);
2574 while (CUR != ';') {
2575 if ((CUR >= '0') && (CUR <= '9'))
2576 val = val * 10 + (CUR - '0');
2577 else {
2578 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2579 ctxt->sax->error(ctxt->userData,
2580 "htmlParseCharRef: invalid decimal value\n");
2581 ctxt->wellFormed = 0;
2582 return(0);
2583 }
2584 NEXT;
2585 }
2586 if (CUR == ';')
2587 NEXT;
2588 } else {
2589 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2590 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2591 ctxt->wellFormed = 0;
2592 }
2593 /*
2594 * Check the value IS_CHAR ...
2595 */
2596 if (IS_CHAR(val)) {
2597 return(val);
2598 } else {
2599 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2600 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2601 val);
2602 ctxt->wellFormed = 0;
2603 }
2604 return(0);
2605}
2606
2607
2608/**
2609 * htmlParseDocTypeDecl :
2610 * @ctxt: an HTML parser context
2611 *
2612 * parse a DOCTYPE declaration
2613 *
2614 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2615 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2616 */
2617
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002618static void
Owen Taylor3473f882001-02-23 17:55:21 +00002619htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2620 xmlChar *name;
2621 xmlChar *ExternalID = NULL;
2622 xmlChar *URI = NULL;
2623
2624 /*
2625 * We know that '<!DOCTYPE' has been detected.
2626 */
2627 SKIP(9);
2628
2629 SKIP_BLANKS;
2630
2631 /*
2632 * Parse the DOCTYPE name.
2633 */
2634 name = htmlParseName(ctxt);
2635 if (name == NULL) {
2636 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2637 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2638 ctxt->wellFormed = 0;
2639 }
2640 /*
2641 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2642 */
2643
2644 SKIP_BLANKS;
2645
2646 /*
2647 * Check for SystemID and ExternalID
2648 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002649 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002650 SKIP_BLANKS;
2651
2652 /*
2653 * We should be at the end of the DOCTYPE declaration.
2654 */
2655 if (CUR != '>') {
2656 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002657 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002658 ctxt->wellFormed = 0;
2659 /* We shouldn't try to resynchronize ... */
2660 }
2661 NEXT;
2662
2663 /*
2664 * Create or update the document accordingly to the DOCTYPE
2665 */
2666 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2667 (!ctxt->disableSAX))
2668 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2669
2670 /*
2671 * Cleanup, since we don't use all those identifiers
2672 */
2673 if (URI != NULL) xmlFree(URI);
2674 if (ExternalID != NULL) xmlFree(ExternalID);
2675 if (name != NULL) xmlFree(name);
2676}
2677
2678/**
2679 * htmlParseAttribute:
2680 * @ctxt: an HTML parser context
2681 * @value: a xmlChar ** used to store the value of the attribute
2682 *
2683 * parse an attribute
2684 *
2685 * [41] Attribute ::= Name Eq AttValue
2686 *
2687 * [25] Eq ::= S? '=' S?
2688 *
2689 * With namespace:
2690 *
2691 * [NS 11] Attribute ::= QName Eq AttValue
2692 *
2693 * Also the case QName == xmlns:??? is handled independently as a namespace
2694 * definition.
2695 *
2696 * Returns the attribute name, and the value in *value.
2697 */
2698
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002699static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002700htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2701 xmlChar *name, *val = NULL;
2702
2703 *value = NULL;
2704 name = htmlParseHTMLName(ctxt);
2705 if (name == NULL) {
2706 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2707 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2708 ctxt->wellFormed = 0;
2709 return(NULL);
2710 }
2711
2712 /*
2713 * read the value
2714 */
2715 SKIP_BLANKS;
2716 if (CUR == '=') {
2717 NEXT;
2718 SKIP_BLANKS;
2719 val = htmlParseAttValue(ctxt);
2720 /******
2721 } else {
2722 * TODO : some attribute must have values, some may not
2723 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2724 ctxt->sax->warning(ctxt->userData,
2725 "No value for attribute %s\n", name); */
2726 }
2727
2728 *value = val;
2729 return(name);
2730}
2731
2732/**
2733 * htmlCheckEncoding:
2734 * @ctxt: an HTML parser context
2735 * @attvalue: the attribute value
2736 *
2737 * Checks an http-equiv attribute from a Meta tag to detect
2738 * the encoding
2739 * If a new encoding is detected the parser is switched to decode
2740 * it and pass UTF8
2741 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002742static void
Owen Taylor3473f882001-02-23 17:55:21 +00002743htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2744 const xmlChar *encoding;
2745
2746 if ((ctxt == NULL) || (attvalue == NULL))
2747 return;
2748
2749 /* do not change encoding */
2750 if (ctxt->input->encoding != NULL)
2751 return;
2752
2753 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2754 if (encoding != NULL) {
2755 encoding += 8;
2756 } else {
2757 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2758 if (encoding != NULL)
2759 encoding += 9;
2760 }
2761 if (encoding != NULL) {
2762 xmlCharEncoding enc;
2763 xmlCharEncodingHandlerPtr handler;
2764
2765 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2766
2767 if (ctxt->input->encoding != NULL)
2768 xmlFree((xmlChar *) ctxt->input->encoding);
2769 ctxt->input->encoding = xmlStrdup(encoding);
2770
2771 enc = xmlParseCharEncoding((const char *) encoding);
2772 /*
2773 * registered set of known encodings
2774 */
2775 if (enc != XML_CHAR_ENCODING_ERROR) {
2776 xmlSwitchEncoding(ctxt, enc);
2777 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2778 } else {
2779 /*
2780 * fallback for unknown encodings
2781 */
2782 handler = xmlFindCharEncodingHandler((const char *) encoding);
2783 if (handler != NULL) {
2784 xmlSwitchToEncoding(ctxt, handler);
2785 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2786 } else {
2787 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2788 }
2789 }
2790
2791 if ((ctxt->input->buf != NULL) &&
2792 (ctxt->input->buf->encoder != NULL) &&
2793 (ctxt->input->buf->raw != NULL) &&
2794 (ctxt->input->buf->buffer != NULL)) {
2795 int nbchars;
2796 int processed;
2797
2798 /*
2799 * convert as much as possible to the parser reading buffer.
2800 */
2801 processed = ctxt->input->cur - ctxt->input->base;
2802 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2803 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2804 ctxt->input->buf->buffer,
2805 ctxt->input->buf->raw);
2806 if (nbchars < 0) {
2807 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2808 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2809 ctxt->sax->error(ctxt->userData,
2810 "htmlCheckEncoding: encoder error\n");
2811 }
2812 ctxt->input->base =
2813 ctxt->input->cur = ctxt->input->buf->buffer->content;
2814 }
2815 }
2816}
2817
2818/**
2819 * htmlCheckMeta:
2820 * @ctxt: an HTML parser context
2821 * @atts: the attributes values
2822 *
2823 * Checks an attributes from a Meta tag
2824 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002825static void
Owen Taylor3473f882001-02-23 17:55:21 +00002826htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2827 int i;
2828 const xmlChar *att, *value;
2829 int http = 0;
2830 const xmlChar *content = NULL;
2831
2832 if ((ctxt == NULL) || (atts == NULL))
2833 return;
2834
2835 i = 0;
2836 att = atts[i++];
2837 while (att != NULL) {
2838 value = atts[i++];
2839 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2840 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2841 http = 1;
2842 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2843 content = value;
2844 att = atts[i++];
2845 }
2846 if ((http) && (content != NULL))
2847 htmlCheckEncoding(ctxt, content);
2848
2849}
2850
2851/**
2852 * htmlParseStartTag:
2853 * @ctxt: an HTML parser context
2854 *
2855 * parse a start of tag either for rule element or
2856 * EmptyElement. In both case we don't parse the tag closing chars.
2857 *
2858 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2859 *
2860 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2861 *
2862 * With namespace:
2863 *
2864 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2865 *
2866 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2867 *
2868 */
2869
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002870static void
Owen Taylor3473f882001-02-23 17:55:21 +00002871htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2872 xmlChar *name;
2873 xmlChar *attname;
2874 xmlChar *attvalue;
2875 const xmlChar **atts = NULL;
2876 int nbatts = 0;
2877 int maxatts = 0;
2878 int meta = 0;
2879 int i;
2880
2881 if (CUR != '<') return;
2882 NEXT;
2883
2884 GROW;
2885 name = htmlParseHTMLName(ctxt);
2886 if (name == NULL) {
2887 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2888 ctxt->sax->error(ctxt->userData,
2889 "htmlParseStartTag: invalid element name\n");
2890 ctxt->wellFormed = 0;
2891 /* Dump the bogus tag like browsers do */
2892 while ((IS_CHAR(CUR)) && (CUR != '>'))
2893 NEXT;
2894 return;
2895 }
2896 if (xmlStrEqual(name, BAD_CAST"meta"))
2897 meta = 1;
2898
2899 /*
2900 * Check for auto-closure of HTML elements.
2901 */
2902 htmlAutoClose(ctxt, name);
2903
2904 /*
2905 * Check for implied HTML elements.
2906 */
2907 htmlCheckImplied(ctxt, name);
2908
2909 /*
2910 * Avoid html at any level > 0, head at any level != 1
2911 * or any attempt to recurse body
2912 */
2913 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2914 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2915 ctxt->sax->error(ctxt->userData,
2916 "htmlParseStartTag: misplaced <html> tag\n");
2917 ctxt->wellFormed = 0;
2918 xmlFree(name);
2919 return;
2920 }
2921 if ((ctxt->nameNr != 1) &&
2922 (xmlStrEqual(name, BAD_CAST"head"))) {
2923 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2924 ctxt->sax->error(ctxt->userData,
2925 "htmlParseStartTag: misplaced <head> tag\n");
2926 ctxt->wellFormed = 0;
2927 xmlFree(name);
2928 return;
2929 }
2930 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002931 int indx;
2932 for (indx = 0;indx < ctxt->nameNr;indx++) {
2933 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002934 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2935 ctxt->sax->error(ctxt->userData,
2936 "htmlParseStartTag: misplaced <body> tag\n");
2937 ctxt->wellFormed = 0;
2938 xmlFree(name);
2939 return;
2940 }
2941 }
2942 }
2943
2944 /*
2945 * Now parse the attributes, it ends up with the ending
2946 *
2947 * (S Attribute)* S?
2948 */
2949 SKIP_BLANKS;
2950 while ((IS_CHAR(CUR)) &&
2951 (CUR != '>') &&
2952 ((CUR != '/') || (NXT(1) != '>'))) {
2953 long cons = ctxt->nbChars;
2954
2955 GROW;
2956 attname = htmlParseAttribute(ctxt, &attvalue);
2957 if (attname != NULL) {
2958
2959 /*
2960 * Well formedness requires at most one declaration of an attribute
2961 */
2962 for (i = 0; i < nbatts;i += 2) {
2963 if (xmlStrEqual(atts[i], attname)) {
2964 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2965 ctxt->sax->error(ctxt->userData,
2966 "Attribute %s redefined\n",
2967 attname);
2968 ctxt->wellFormed = 0;
2969 xmlFree(attname);
2970 if (attvalue != NULL)
2971 xmlFree(attvalue);
2972 goto failed;
2973 }
2974 }
2975
2976 /*
2977 * Add the pair to atts
2978 */
2979 if (atts == NULL) {
2980 maxatts = 10;
2981 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2982 if (atts == NULL) {
2983 xmlGenericError(xmlGenericErrorContext,
2984 "malloc of %ld byte failed\n",
2985 maxatts * (long)sizeof(xmlChar *));
2986 if (name != NULL) xmlFree(name);
2987 return;
2988 }
2989 } else if (nbatts + 4 > maxatts) {
2990 maxatts *= 2;
2991 atts = (const xmlChar **) xmlRealloc((void *) atts,
2992 maxatts * sizeof(xmlChar *));
2993 if (atts == NULL) {
2994 xmlGenericError(xmlGenericErrorContext,
2995 "realloc of %ld byte failed\n",
2996 maxatts * (long)sizeof(xmlChar *));
2997 if (name != NULL) xmlFree(name);
2998 return;
2999 }
3000 }
3001 atts[nbatts++] = attname;
3002 atts[nbatts++] = attvalue;
3003 atts[nbatts] = NULL;
3004 atts[nbatts + 1] = NULL;
3005 }
3006 else {
3007 /* Dump the bogus attribute string up to the next blank or
3008 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003009 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3010 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003011 NEXT;
3012 }
3013
3014failed:
3015 SKIP_BLANKS;
3016 if (cons == ctxt->nbChars) {
3017 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3018 ctxt->sax->error(ctxt->userData,
3019 "htmlParseStartTag: problem parsing attributes\n");
3020 ctxt->wellFormed = 0;
3021 break;
3022 }
3023 }
3024
3025 /*
3026 * Handle specific association to the META tag
3027 */
3028 if (meta)
3029 htmlCheckMeta(ctxt, atts);
3030
3031 /*
3032 * SAX: Start of Element !
3033 */
3034 htmlnamePush(ctxt, xmlStrdup(name));
3035#ifdef DEBUG
3036 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3037#endif
3038 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3039 ctxt->sax->startElement(ctxt->userData, name, atts);
3040
3041 if (atts != NULL) {
3042 for (i = 0;i < nbatts;i++) {
3043 if (atts[i] != NULL)
3044 xmlFree((xmlChar *) atts[i]);
3045 }
3046 xmlFree((void *) atts);
3047 }
3048 if (name != NULL) xmlFree(name);
3049}
3050
3051/**
3052 * htmlParseEndTag:
3053 * @ctxt: an HTML parser context
3054 *
3055 * parse an end of tag
3056 *
3057 * [42] ETag ::= '</' Name S? '>'
3058 *
3059 * With namespace
3060 *
3061 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003062 *
3063 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003064 */
3065
Daniel Veillardf420ac52001-07-04 16:04:09 +00003066static int
Owen Taylor3473f882001-02-23 17:55:21 +00003067htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3068 xmlChar *name;
3069 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003070 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003071
3072 if ((CUR != '<') || (NXT(1) != '/')) {
3073 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3074 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3075 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003076 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003077 }
3078 SKIP(2);
3079
3080 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003081 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003082
3083 /*
3084 * We should definitely be at the ending "S? '>'" part
3085 */
3086 SKIP_BLANKS;
3087 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3088 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3089 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3090 ctxt->wellFormed = 0;
3091 } else
3092 NEXT;
3093
3094 /*
3095 * If the name read is not one of the element in the parsing stack
3096 * then return, it's just an error.
3097 */
3098 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3099 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3100 }
3101 if (i < 0) {
3102 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3103 ctxt->sax->error(ctxt->userData,
3104 "Unexpected end tag : %s\n", name);
3105 xmlFree(name);
3106 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003107 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003108 }
3109
3110
3111 /*
3112 * Check for auto-closure of HTML elements.
3113 */
3114
3115 htmlAutoCloseOnClose(ctxt, name);
3116
3117 /*
3118 * Well formedness constraints, opening and closing must match.
3119 * With the exception that the autoclose may have popped stuff out
3120 * of the stack.
3121 */
3122 if (!xmlStrEqual(name, ctxt->name)) {
3123#ifdef DEBUG
3124 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3125#endif
3126 if ((ctxt->name != NULL) &&
3127 (!xmlStrEqual(ctxt->name, name))) {
3128 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3129 ctxt->sax->error(ctxt->userData,
3130 "Opening and ending tag mismatch: %s and %s\n",
3131 name, ctxt->name);
3132 ctxt->wellFormed = 0;
3133 }
3134 }
3135
3136 /*
3137 * SAX: End of Tag
3138 */
3139 oldname = ctxt->name;
3140 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3141 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3142 ctxt->sax->endElement(ctxt->userData, name);
3143 oldname = htmlnamePop(ctxt);
3144 if (oldname != NULL) {
3145#ifdef DEBUG
3146 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3147#endif
3148 xmlFree(oldname);
3149#ifdef DEBUG
3150 } else {
3151 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3152#endif
3153 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003154 ret = 1;
3155 } else {
3156 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003157 }
3158
3159 if (name != NULL)
3160 xmlFree(name);
3161
Daniel Veillardf420ac52001-07-04 16:04:09 +00003162 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003163}
3164
3165
3166/**
3167 * htmlParseReference:
3168 * @ctxt: an HTML parser context
3169 *
3170 * parse and handle entity references in content,
3171 * this will end-up in a call to character() since this is either a
3172 * CharRef, or a predefined entity.
3173 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003174static void
Owen Taylor3473f882001-02-23 17:55:21 +00003175htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003176 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003177 xmlChar out[6];
3178 xmlChar *name;
3179 if (CUR != '&') return;
3180
3181 if (NXT(1) == '#') {
3182 unsigned int c;
3183 int bits, i = 0;
3184
3185 c = htmlParseCharRef(ctxt);
3186 if (c == 0)
3187 return;
3188
3189 if (c < 0x80) { out[i++]= c; bits= -6; }
3190 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3191 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3192 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3193
3194 for ( ; bits >= 0; bits-= 6) {
3195 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3196 }
3197 out[i] = 0;
3198
3199 htmlCheckParagraph(ctxt);
3200 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3201 ctxt->sax->characters(ctxt->userData, out, i);
3202 } else {
3203 ent = htmlParseEntityRef(ctxt, &name);
3204 if (name == NULL) {
3205 htmlCheckParagraph(ctxt);
3206 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3207 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3208 return;
3209 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003210 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003211 htmlCheckParagraph(ctxt);
3212 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3213 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3214 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3215 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3216 }
3217 } else {
3218 unsigned int c;
3219 int bits, i = 0;
3220
3221 c = ent->value;
3222 if (c < 0x80)
3223 { out[i++]= c; bits= -6; }
3224 else if (c < 0x800)
3225 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3226 else if (c < 0x10000)
3227 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3228 else
3229 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3230
3231 for ( ; bits >= 0; bits-= 6) {
3232 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3233 }
3234 out[i] = 0;
3235
3236 htmlCheckParagraph(ctxt);
3237 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3238 ctxt->sax->characters(ctxt->userData, out, i);
3239 }
3240 xmlFree(name);
3241 }
3242}
3243
3244/**
3245 * htmlParseContent:
3246 * @ctxt: an HTML parser context
3247 * @name: the node name
3248 *
3249 * Parse a content: comment, sub-element, reference or text.
3250 *
3251 */
3252
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003253static void
Owen Taylor3473f882001-02-23 17:55:21 +00003254htmlParseContent(htmlParserCtxtPtr ctxt) {
3255 xmlChar *currentNode;
3256 int depth;
3257
3258 currentNode = xmlStrdup(ctxt->name);
3259 depth = ctxt->nameNr;
3260 while (1) {
3261 long cons = ctxt->nbChars;
3262
3263 GROW;
3264 /*
3265 * Our tag or one of it's parent or children is ending.
3266 */
3267 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003268 if (htmlParseEndTag(ctxt) &&
3269 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3270 if (currentNode != NULL)
3271 xmlFree(currentNode);
3272 return;
3273 }
3274 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003275 }
3276
3277 /*
3278 * Has this node been popped out during parsing of
3279 * the next element
3280 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003281 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3282 (!xmlStrEqual(currentNode, ctxt->name)))
3283 {
Owen Taylor3473f882001-02-23 17:55:21 +00003284 if (currentNode != NULL) xmlFree(currentNode);
3285 return;
3286 }
3287
Daniel Veillardf9533d12001-03-03 10:04:57 +00003288 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3289 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003290 /*
3291 * Handle SCRIPT/STYLE separately
3292 */
3293 htmlParseScript(ctxt);
3294 } else {
3295 /*
3296 * Sometimes DOCTYPE arrives in the middle of the document
3297 */
3298 if ((CUR == '<') && (NXT(1) == '!') &&
3299 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3300 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3301 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3302 (UPP(8) == 'E')) {
3303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3304 ctxt->sax->error(ctxt->userData,
3305 "Misplaced DOCTYPE declaration\n");
3306 ctxt->wellFormed = 0;
3307 htmlParseDocTypeDecl(ctxt);
3308 }
3309
3310 /*
3311 * First case : a comment
3312 */
3313 if ((CUR == '<') && (NXT(1) == '!') &&
3314 (NXT(2) == '-') && (NXT(3) == '-')) {
3315 htmlParseComment(ctxt);
3316 }
3317
3318 /*
3319 * Second case : a sub-element.
3320 */
3321 else if (CUR == '<') {
3322 htmlParseElement(ctxt);
3323 }
3324
3325 /*
3326 * Third case : a reference. If if has not been resolved,
3327 * parsing returns it's Name, create the node
3328 */
3329 else if (CUR == '&') {
3330 htmlParseReference(ctxt);
3331 }
3332
3333 /*
3334 * Fourth : end of the resource
3335 */
3336 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003337 htmlAutoCloseOnEnd(ctxt);
3338 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003339 }
3340
3341 /*
3342 * Last case, text. Note that References are handled directly.
3343 */
3344 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003345 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003346 }
3347
3348 if (cons == ctxt->nbChars) {
3349 if (ctxt->node != NULL) {
3350 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3351 ctxt->sax->error(ctxt->userData,
3352 "detected an error in element content\n");
3353 ctxt->wellFormed = 0;
3354 }
3355 break;
3356 }
3357 }
3358 GROW;
3359 }
3360 if (currentNode != NULL) xmlFree(currentNode);
3361}
3362
3363/**
3364 * htmlParseElement:
3365 * @ctxt: an HTML parser context
3366 *
3367 * parse an HTML element, this is highly recursive
3368 *
3369 * [39] element ::= EmptyElemTag | STag content ETag
3370 *
3371 * [41] Attribute ::= Name Eq AttValue
3372 */
3373
3374void
3375htmlParseElement(htmlParserCtxtPtr ctxt) {
3376 xmlChar *name;
3377 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003378 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003379 htmlParserNodeInfo node_info;
3380 xmlChar *oldname;
3381 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003382 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003383
3384 /* Capture start position */
3385 if (ctxt->record_info) {
3386 node_info.begin_pos = ctxt->input->consumed +
3387 (CUR_PTR - ctxt->input->base);
3388 node_info.begin_line = ctxt->input->line;
3389 }
3390
3391 oldname = xmlStrdup(ctxt->name);
3392 htmlParseStartTag(ctxt);
3393 name = ctxt->name;
3394#ifdef DEBUG
3395 if (oldname == NULL)
3396 xmlGenericError(xmlGenericErrorContext,
3397 "Start of element %s\n", name);
3398 else if (name == NULL)
3399 xmlGenericError(xmlGenericErrorContext,
3400 "Start of element failed, was %s\n", oldname);
3401 else
3402 xmlGenericError(xmlGenericErrorContext,
3403 "Start of element %s, was %s\n", name, oldname);
3404#endif
3405 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3406 (name == NULL)) {
3407 if (CUR == '>')
3408 NEXT;
3409 if (oldname != NULL)
3410 xmlFree(oldname);
3411 return;
3412 }
3413 if (oldname != NULL)
3414 xmlFree(oldname);
3415
3416 /*
3417 * Lookup the info for that element.
3418 */
3419 info = htmlTagLookup(name);
3420 if (info == NULL) {
3421 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3422 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3423 name);
3424 ctxt->wellFormed = 0;
3425 } else if (info->depr) {
3426/***************************
3427 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3428 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3429 name);
3430 ***************************/
3431 }
3432
3433 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003434 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003435 */
3436 if ((CUR == '/') && (NXT(1) == '>')) {
3437 SKIP(2);
3438 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3439 ctxt->sax->endElement(ctxt->userData, name);
3440 oldname = htmlnamePop(ctxt);
3441#ifdef DEBUG
3442 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3443#endif
3444 if (oldname != NULL)
3445 xmlFree(oldname);
3446 return;
3447 }
3448
3449 if (CUR == '>') {
3450 NEXT;
3451 } else {
3452 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3453 ctxt->sax->error(ctxt->userData,
3454 "Couldn't find end of Start Tag %s\n",
3455 name);
3456 ctxt->wellFormed = 0;
3457
3458 /*
3459 * end of parsing of this node.
3460 */
3461 if (xmlStrEqual(name, ctxt->name)) {
3462 nodePop(ctxt);
3463 oldname = htmlnamePop(ctxt);
3464#ifdef DEBUG
3465 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3466#endif
3467 if (oldname != NULL)
3468 xmlFree(oldname);
3469 }
3470
3471 /*
3472 * Capture end position and add node
3473 */
3474 if ( currentNode != NULL && ctxt->record_info ) {
3475 node_info.end_pos = ctxt->input->consumed +
3476 (CUR_PTR - ctxt->input->base);
3477 node_info.end_line = ctxt->input->line;
3478 node_info.node = ctxt->node;
3479 xmlParserAddNodeInfo(ctxt, &node_info);
3480 }
3481 return;
3482 }
3483
3484 /*
3485 * Check for an Empty Element from DTD definition
3486 */
3487 if ((info != NULL) && (info->empty)) {
3488 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3489 ctxt->sax->endElement(ctxt->userData, name);
3490 oldname = htmlnamePop(ctxt);
3491#ifdef DEBUG
3492 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3493#endif
3494 if (oldname != NULL)
3495 xmlFree(oldname);
3496 return;
3497 }
3498
3499 /*
3500 * Parse the content of the element:
3501 */
3502 currentNode = xmlStrdup(ctxt->name);
3503 depth = ctxt->nameNr;
3504 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003505 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003506 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003507 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003508 if (ctxt->nameNr < depth) break;
3509 }
3510
Owen Taylor3473f882001-02-23 17:55:21 +00003511 /*
3512 * Capture end position and add node
3513 */
3514 if ( currentNode != NULL && ctxt->record_info ) {
3515 node_info.end_pos = ctxt->input->consumed +
3516 (CUR_PTR - ctxt->input->base);
3517 node_info.end_line = ctxt->input->line;
3518 node_info.node = ctxt->node;
3519 xmlParserAddNodeInfo(ctxt, &node_info);
3520 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003521 if (!IS_CHAR(CUR)) {
3522 htmlAutoCloseOnEnd(ctxt);
3523 }
3524
Owen Taylor3473f882001-02-23 17:55:21 +00003525 if (currentNode != NULL)
3526 xmlFree(currentNode);
3527}
3528
3529/**
3530 * htmlParseDocument :
3531 * @ctxt: an HTML parser context
3532 *
3533 * parse an HTML document (and build a tree if using the standard SAX
3534 * interface).
3535 *
3536 * Returns 0, -1 in case of error. the parser context is augmented
3537 * as a result of the parsing.
3538 */
3539
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003540int
Owen Taylor3473f882001-02-23 17:55:21 +00003541htmlParseDocument(htmlParserCtxtPtr ctxt) {
3542 xmlDtdPtr dtd;
3543
Daniel Veillardd0463562001-10-13 09:15:48 +00003544 xmlInitParser();
3545
Owen Taylor3473f882001-02-23 17:55:21 +00003546 htmlDefaultSAXHandlerInit();
3547 ctxt->html = 1;
3548
3549 GROW;
3550 /*
3551 * SAX: beginning of the document processing.
3552 */
3553 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3554 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3555
3556 /*
3557 * Wipe out everything which is before the first '<'
3558 */
3559 SKIP_BLANKS;
3560 if (CUR == 0) {
3561 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3562 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3563 ctxt->wellFormed = 0;
3564 }
3565
3566 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3567 ctxt->sax->startDocument(ctxt->userData);
3568
3569
3570 /*
3571 * Parse possible comments before any content
3572 */
3573 while ((CUR == '<') && (NXT(1) == '!') &&
3574 (NXT(2) == '-') && (NXT(3) == '-')) {
3575 htmlParseComment(ctxt);
3576 SKIP_BLANKS;
3577 }
3578
3579
3580 /*
3581 * Then possibly doc type declaration(s) and more Misc
3582 * (doctypedecl Misc*)?
3583 */
3584 if ((CUR == '<') && (NXT(1) == '!') &&
3585 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3586 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3587 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3588 (UPP(8) == 'E')) {
3589 htmlParseDocTypeDecl(ctxt);
3590 }
3591 SKIP_BLANKS;
3592
3593 /*
3594 * Parse possible comments before any content
3595 */
3596 while ((CUR == '<') && (NXT(1) == '!') &&
3597 (NXT(2) == '-') && (NXT(3) == '-')) {
3598 htmlParseComment(ctxt);
3599 SKIP_BLANKS;
3600 }
3601
3602 /*
3603 * Time to start parsing the tree itself
3604 */
3605 htmlParseContent(ctxt);
3606
3607 /*
3608 * autoclose
3609 */
3610 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003611 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003612
3613
3614 /*
3615 * SAX: end of the document processing.
3616 */
3617 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3618 ctxt->sax->endDocument(ctxt->userData);
3619
3620 if (ctxt->myDoc != NULL) {
3621 dtd = xmlGetIntSubset(ctxt->myDoc);
3622 if (dtd == NULL)
3623 ctxt->myDoc->intSubset =
3624 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3625 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3626 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3627 }
3628 if (! ctxt->wellFormed) return(-1);
3629 return(0);
3630}
3631
3632
3633/************************************************************************
3634 * *
3635 * Parser contexts handling *
3636 * *
3637 ************************************************************************/
3638
3639/**
3640 * xmlInitParserCtxt:
3641 * @ctxt: an HTML parser context
3642 *
3643 * Initialize a parser context
3644 */
3645
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003646static void
Owen Taylor3473f882001-02-23 17:55:21 +00003647htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3648{
3649 htmlSAXHandler *sax;
3650
3651 if (ctxt == NULL) return;
3652 memset(ctxt, 0, sizeof(htmlParserCtxt));
3653
3654 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3655 if (sax == NULL) {
3656 xmlGenericError(xmlGenericErrorContext,
3657 "htmlInitParserCtxt: out of memory\n");
3658 }
3659 else
3660 memset(sax, 0, sizeof(htmlSAXHandler));
3661
3662 /* Allocate the Input stack */
3663 ctxt->inputTab = (htmlParserInputPtr *)
3664 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3665 if (ctxt->inputTab == NULL) {
3666 xmlGenericError(xmlGenericErrorContext,
3667 "htmlInitParserCtxt: out of memory\n");
3668 ctxt->inputNr = 0;
3669 ctxt->inputMax = 0;
3670 ctxt->input = NULL;
3671 return;
3672 }
3673 ctxt->inputNr = 0;
3674 ctxt->inputMax = 5;
3675 ctxt->input = NULL;
3676 ctxt->version = NULL;
3677 ctxt->encoding = NULL;
3678 ctxt->standalone = -1;
3679 ctxt->instate = XML_PARSER_START;
3680
3681 /* Allocate the Node stack */
3682 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3683 if (ctxt->nodeTab == NULL) {
3684 xmlGenericError(xmlGenericErrorContext,
3685 "htmlInitParserCtxt: out of memory\n");
3686 ctxt->nodeNr = 0;
3687 ctxt->nodeMax = 0;
3688 ctxt->node = NULL;
3689 ctxt->inputNr = 0;
3690 ctxt->inputMax = 0;
3691 ctxt->input = NULL;
3692 return;
3693 }
3694 ctxt->nodeNr = 0;
3695 ctxt->nodeMax = 10;
3696 ctxt->node = NULL;
3697
3698 /* Allocate the Name stack */
3699 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3700 if (ctxt->nameTab == NULL) {
3701 xmlGenericError(xmlGenericErrorContext,
3702 "htmlInitParserCtxt: out of memory\n");
3703 ctxt->nameNr = 0;
3704 ctxt->nameMax = 10;
3705 ctxt->name = NULL;
3706 ctxt->nodeNr = 0;
3707 ctxt->nodeMax = 0;
3708 ctxt->node = NULL;
3709 ctxt->inputNr = 0;
3710 ctxt->inputMax = 0;
3711 ctxt->input = NULL;
3712 return;
3713 }
3714 ctxt->nameNr = 0;
3715 ctxt->nameMax = 10;
3716 ctxt->name = NULL;
3717
3718 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3719 else {
3720 ctxt->sax = sax;
3721 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3722 }
3723 ctxt->userData = ctxt;
3724 ctxt->myDoc = NULL;
3725 ctxt->wellFormed = 1;
3726 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003727 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003728 ctxt->html = 1;
3729 ctxt->record_info = 0;
3730 ctxt->validate = 0;
3731 ctxt->nbChars = 0;
3732 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003733 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003734 xmlInitNodeInfoSeq(&ctxt->node_seq);
3735}
3736
3737/**
3738 * htmlFreeParserCtxt:
3739 * @ctxt: an HTML parser context
3740 *
3741 * Free all the memory used by a parser context. However the parsed
3742 * document in ctxt->myDoc is not freed.
3743 */
3744
3745void
3746htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3747{
3748 xmlFreeParserCtxt(ctxt);
3749}
3750
3751/**
Daniel Veillard1d995272002-07-22 16:43:32 +00003752 * htmlNewParserCtxt:
3753 *
3754 * Allocate and initialize a new parser context.
3755 *
3756 * Returns the xmlParserCtxtPtr or NULL
3757 */
3758
3759static htmlParserCtxtPtr
3760htmlNewParserCtxt(void)
3761{
3762 xmlParserCtxtPtr ctxt;
3763
3764 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3765 if (ctxt == NULL) {
3766 xmlGenericError(xmlGenericErrorContext,
3767 "xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00003768 return(NULL);
3769 }
3770 memset(ctxt, 0, sizeof(xmlParserCtxt));
3771 htmlInitParserCtxt(ctxt);
3772 return(ctxt);
3773}
3774
3775/**
3776 * htmlCreateMemoryParserCtxt:
3777 * @buffer: a pointer to a char array
3778 * @size: the size of the array
3779 *
3780 * Create a parser context for an HTML in-memory document.
3781 *
3782 * Returns the new parser context or NULL
3783 */
3784static htmlParserCtxtPtr
3785htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3786 xmlParserCtxtPtr ctxt;
3787 xmlParserInputPtr input;
3788 xmlParserInputBufferPtr buf;
3789
3790 if (buffer == NULL)
3791 return(NULL);
3792 if (size <= 0)
3793 return(NULL);
3794
3795 ctxt = htmlNewParserCtxt();
3796 if (ctxt == NULL)
3797 return(NULL);
3798
3799 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3800 if (buf == NULL) return(NULL);
3801
3802 input = xmlNewInputStream(ctxt);
3803 if (input == NULL) {
3804 xmlFreeParserCtxt(ctxt);
3805 return(NULL);
3806 }
3807
3808 input->filename = NULL;
3809 input->buf = buf;
3810 input->base = input->buf->buffer->content;
3811 input->cur = input->buf->buffer->content;
3812 input->end = &input->buf->buffer->content[input->buf->buffer->use];
3813
3814 inputPush(ctxt, input);
3815 return(ctxt);
3816}
3817
3818/**
Owen Taylor3473f882001-02-23 17:55:21 +00003819 * htmlCreateDocParserCtxt :
3820 * @cur: a pointer to an array of xmlChar
3821 * @encoding: a free form C string describing the HTML document encoding, or NULL
3822 *
3823 * Create a parser context for an HTML document.
3824 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003825 * TODO: check the need to add encoding handling there
3826 *
Owen Taylor3473f882001-02-23 17:55:21 +00003827 * Returns the new parser context or NULL
3828 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003829static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003830htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00003831 int len;
Owen Taylor3473f882001-02-23 17:55:21 +00003832
Daniel Veillard1d995272002-07-22 16:43:32 +00003833 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00003834 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00003835 len = xmlStrlen(cur);
3836 return(htmlCreateMemoryParserCtxt((char *)cur, len));
Owen Taylor3473f882001-02-23 17:55:21 +00003837}
3838
3839/************************************************************************
3840 * *
3841 * Progressive parsing interfaces *
3842 * *
3843 ************************************************************************/
3844
3845/**
3846 * htmlParseLookupSequence:
3847 * @ctxt: an HTML parser context
3848 * @first: the first char to lookup
3849 * @next: the next char to lookup or zero
3850 * @third: the next char to lookup or zero
3851 *
3852 * Try to find if a sequence (first, next, third) or just (first next) or
3853 * (first) is available in the input stream.
3854 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3855 * to avoid rescanning sequences of bytes, it DOES change the state of the
3856 * parser, do not use liberally.
3857 * This is basically similar to xmlParseLookupSequence()
3858 *
3859 * Returns the index to the current parsing point if the full sequence
3860 * is available, -1 otherwise.
3861 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003862static int
Owen Taylor3473f882001-02-23 17:55:21 +00003863htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3864 xmlChar next, xmlChar third) {
3865 int base, len;
3866 htmlParserInputPtr in;
3867 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003868 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003869
3870 in = ctxt->input;
3871 if (in == NULL) return(-1);
3872 base = in->cur - in->base;
3873 if (base < 0) return(-1);
3874 if (ctxt->checkIndex > base)
3875 base = ctxt->checkIndex;
3876 if (in->buf == NULL) {
3877 buf = in->base;
3878 len = in->length;
3879 } else {
3880 buf = in->buf->buffer->content;
3881 len = in->buf->buffer->use;
3882 }
3883 /* take into account the sequence length */
3884 if (third) len -= 2;
3885 else if (next) len --;
3886 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003887 if (!incomment && (base + 4 < len)) {
3888 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3889 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3890 incomment = 1;
3891 }
3892 /* do not increment base, some people use <!--> */
3893 }
3894 if (incomment) {
3895 if (base + 3 < len)
3896 return(-1);
3897 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3898 (buf[base + 2] == '>')) {
3899 incomment = 0;
3900 base += 2;
3901 }
3902 continue;
3903 }
Owen Taylor3473f882001-02-23 17:55:21 +00003904 if (buf[base] == first) {
3905 if (third != 0) {
3906 if ((buf[base + 1] != next) ||
3907 (buf[base + 2] != third)) continue;
3908 } else if (next != 0) {
3909 if (buf[base + 1] != next) continue;
3910 }
3911 ctxt->checkIndex = 0;
3912#ifdef DEBUG_PUSH
3913 if (next == 0)
3914 xmlGenericError(xmlGenericErrorContext,
3915 "HPP: lookup '%c' found at %d\n",
3916 first, base);
3917 else if (third == 0)
3918 xmlGenericError(xmlGenericErrorContext,
3919 "HPP: lookup '%c%c' found at %d\n",
3920 first, next, base);
3921 else
3922 xmlGenericError(xmlGenericErrorContext,
3923 "HPP: lookup '%c%c%c' found at %d\n",
3924 first, next, third, base);
3925#endif
3926 return(base - (in->cur - in->base));
3927 }
3928 }
3929 ctxt->checkIndex = base;
3930#ifdef DEBUG_PUSH
3931 if (next == 0)
3932 xmlGenericError(xmlGenericErrorContext,
3933 "HPP: lookup '%c' failed\n", first);
3934 else if (third == 0)
3935 xmlGenericError(xmlGenericErrorContext,
3936 "HPP: lookup '%c%c' failed\n", first, next);
3937 else
3938 xmlGenericError(xmlGenericErrorContext,
3939 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3940#endif
3941 return(-1);
3942}
3943
3944/**
3945 * htmlParseTryOrFinish:
3946 * @ctxt: an HTML parser context
3947 * @terminate: last chunk indicator
3948 *
3949 * Try to progress on parsing
3950 *
3951 * Returns zero if no parsing was possible
3952 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003953static int
Owen Taylor3473f882001-02-23 17:55:21 +00003954htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3955 int ret = 0;
3956 htmlParserInputPtr in;
3957 int avail = 0;
3958 xmlChar cur, next;
3959
3960#ifdef DEBUG_PUSH
3961 switch (ctxt->instate) {
3962 case XML_PARSER_EOF:
3963 xmlGenericError(xmlGenericErrorContext,
3964 "HPP: try EOF\n"); break;
3965 case XML_PARSER_START:
3966 xmlGenericError(xmlGenericErrorContext,
3967 "HPP: try START\n"); break;
3968 case XML_PARSER_MISC:
3969 xmlGenericError(xmlGenericErrorContext,
3970 "HPP: try MISC\n");break;
3971 case XML_PARSER_COMMENT:
3972 xmlGenericError(xmlGenericErrorContext,
3973 "HPP: try COMMENT\n");break;
3974 case XML_PARSER_PROLOG:
3975 xmlGenericError(xmlGenericErrorContext,
3976 "HPP: try PROLOG\n");break;
3977 case XML_PARSER_START_TAG:
3978 xmlGenericError(xmlGenericErrorContext,
3979 "HPP: try START_TAG\n");break;
3980 case XML_PARSER_CONTENT:
3981 xmlGenericError(xmlGenericErrorContext,
3982 "HPP: try CONTENT\n");break;
3983 case XML_PARSER_CDATA_SECTION:
3984 xmlGenericError(xmlGenericErrorContext,
3985 "HPP: try CDATA_SECTION\n");break;
3986 case XML_PARSER_END_TAG:
3987 xmlGenericError(xmlGenericErrorContext,
3988 "HPP: try END_TAG\n");break;
3989 case XML_PARSER_ENTITY_DECL:
3990 xmlGenericError(xmlGenericErrorContext,
3991 "HPP: try ENTITY_DECL\n");break;
3992 case XML_PARSER_ENTITY_VALUE:
3993 xmlGenericError(xmlGenericErrorContext,
3994 "HPP: try ENTITY_VALUE\n");break;
3995 case XML_PARSER_ATTRIBUTE_VALUE:
3996 xmlGenericError(xmlGenericErrorContext,
3997 "HPP: try ATTRIBUTE_VALUE\n");break;
3998 case XML_PARSER_DTD:
3999 xmlGenericError(xmlGenericErrorContext,
4000 "HPP: try DTD\n");break;
4001 case XML_PARSER_EPILOG:
4002 xmlGenericError(xmlGenericErrorContext,
4003 "HPP: try EPILOG\n");break;
4004 case XML_PARSER_PI:
4005 xmlGenericError(xmlGenericErrorContext,
4006 "HPP: try PI\n");break;
4007 case XML_PARSER_SYSTEM_LITERAL:
4008 xmlGenericError(xmlGenericErrorContext,
4009 "HPP: try SYSTEM_LITERAL\n");break;
4010 }
4011#endif
4012
4013 while (1) {
4014
4015 in = ctxt->input;
4016 if (in == NULL) break;
4017 if (in->buf == NULL)
4018 avail = in->length - (in->cur - in->base);
4019 else
4020 avail = in->buf->buffer->use - (in->cur - in->base);
4021 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004022 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004023 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4024 /*
4025 * SAX: end of the document processing.
4026 */
4027 ctxt->instate = XML_PARSER_EOF;
4028 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4029 ctxt->sax->endDocument(ctxt->userData);
4030 }
4031 }
4032 if (avail < 1)
4033 goto done;
4034 switch (ctxt->instate) {
4035 case XML_PARSER_EOF:
4036 /*
4037 * Document parsing is done !
4038 */
4039 goto done;
4040 case XML_PARSER_START:
4041 /*
4042 * Very first chars read from the document flow.
4043 */
4044 cur = in->cur[0];
4045 if (IS_BLANK(cur)) {
4046 SKIP_BLANKS;
4047 if (in->buf == NULL)
4048 avail = in->length - (in->cur - in->base);
4049 else
4050 avail = in->buf->buffer->use - (in->cur - in->base);
4051 }
4052 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4053 ctxt->sax->setDocumentLocator(ctxt->userData,
4054 &xmlDefaultSAXLocator);
4055 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4056 (!ctxt->disableSAX))
4057 ctxt->sax->startDocument(ctxt->userData);
4058
4059 cur = in->cur[0];
4060 next = in->cur[1];
4061 if ((cur == '<') && (next == '!') &&
4062 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4063 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4064 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4065 (UPP(8) == 'E')) {
4066 if ((!terminate) &&
4067 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4068 goto done;
4069#ifdef DEBUG_PUSH
4070 xmlGenericError(xmlGenericErrorContext,
4071 "HPP: Parsing internal subset\n");
4072#endif
4073 htmlParseDocTypeDecl(ctxt);
4074 ctxt->instate = XML_PARSER_PROLOG;
4075#ifdef DEBUG_PUSH
4076 xmlGenericError(xmlGenericErrorContext,
4077 "HPP: entering PROLOG\n");
4078#endif
4079 } else {
4080 ctxt->instate = XML_PARSER_MISC;
4081 }
4082#ifdef DEBUG_PUSH
4083 xmlGenericError(xmlGenericErrorContext,
4084 "HPP: entering MISC\n");
4085#endif
4086 break;
4087 case XML_PARSER_MISC:
4088 SKIP_BLANKS;
4089 if (in->buf == NULL)
4090 avail = in->length - (in->cur - in->base);
4091 else
4092 avail = in->buf->buffer->use - (in->cur - in->base);
4093 if (avail < 2)
4094 goto done;
4095 cur = in->cur[0];
4096 next = in->cur[1];
4097 if ((cur == '<') && (next == '!') &&
4098 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4099 if ((!terminate) &&
4100 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4101 goto done;
4102#ifdef DEBUG_PUSH
4103 xmlGenericError(xmlGenericErrorContext,
4104 "HPP: Parsing Comment\n");
4105#endif
4106 htmlParseComment(ctxt);
4107 ctxt->instate = XML_PARSER_MISC;
4108 } else if ((cur == '<') && (next == '!') &&
4109 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4110 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4111 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4112 (UPP(8) == 'E')) {
4113 if ((!terminate) &&
4114 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4115 goto done;
4116#ifdef DEBUG_PUSH
4117 xmlGenericError(xmlGenericErrorContext,
4118 "HPP: Parsing internal subset\n");
4119#endif
4120 htmlParseDocTypeDecl(ctxt);
4121 ctxt->instate = XML_PARSER_PROLOG;
4122#ifdef DEBUG_PUSH
4123 xmlGenericError(xmlGenericErrorContext,
4124 "HPP: entering PROLOG\n");
4125#endif
4126 } else if ((cur == '<') && (next == '!') &&
4127 (avail < 9)) {
4128 goto done;
4129 } else {
4130 ctxt->instate = XML_PARSER_START_TAG;
4131#ifdef DEBUG_PUSH
4132 xmlGenericError(xmlGenericErrorContext,
4133 "HPP: entering START_TAG\n");
4134#endif
4135 }
4136 break;
4137 case XML_PARSER_PROLOG:
4138 SKIP_BLANKS;
4139 if (in->buf == NULL)
4140 avail = in->length - (in->cur - in->base);
4141 else
4142 avail = in->buf->buffer->use - (in->cur - in->base);
4143 if (avail < 2)
4144 goto done;
4145 cur = in->cur[0];
4146 next = in->cur[1];
4147 if ((cur == '<') && (next == '!') &&
4148 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4149 if ((!terminate) &&
4150 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4151 goto done;
4152#ifdef DEBUG_PUSH
4153 xmlGenericError(xmlGenericErrorContext,
4154 "HPP: Parsing Comment\n");
4155#endif
4156 htmlParseComment(ctxt);
4157 ctxt->instate = XML_PARSER_PROLOG;
4158 } else if ((cur == '<') && (next == '!') &&
4159 (avail < 4)) {
4160 goto done;
4161 } else {
4162 ctxt->instate = XML_PARSER_START_TAG;
4163#ifdef DEBUG_PUSH
4164 xmlGenericError(xmlGenericErrorContext,
4165 "HPP: entering START_TAG\n");
4166#endif
4167 }
4168 break;
4169 case XML_PARSER_EPILOG:
4170 if (in->buf == NULL)
4171 avail = in->length - (in->cur - in->base);
4172 else
4173 avail = in->buf->buffer->use - (in->cur - in->base);
4174 if (avail < 1)
4175 goto done;
4176 cur = in->cur[0];
4177 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004178 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004179 goto done;
4180 }
4181 if (avail < 2)
4182 goto done;
4183 next = in->cur[1];
4184 if ((cur == '<') && (next == '!') &&
4185 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4186 if ((!terminate) &&
4187 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4188 goto done;
4189#ifdef DEBUG_PUSH
4190 xmlGenericError(xmlGenericErrorContext,
4191 "HPP: Parsing Comment\n");
4192#endif
4193 htmlParseComment(ctxt);
4194 ctxt->instate = XML_PARSER_EPILOG;
4195 } else if ((cur == '<') && (next == '!') &&
4196 (avail < 4)) {
4197 goto done;
4198 } else {
4199 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004200 ctxt->wellFormed = 0;
4201 ctxt->instate = XML_PARSER_EOF;
4202#ifdef DEBUG_PUSH
4203 xmlGenericError(xmlGenericErrorContext,
4204 "HPP: entering EOF\n");
4205#endif
4206 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4207 ctxt->sax->endDocument(ctxt->userData);
4208 goto done;
4209 }
4210 break;
4211 case XML_PARSER_START_TAG: {
4212 xmlChar *name, *oldname;
4213 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004214 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004215
4216 if (avail < 2)
4217 goto done;
4218 cur = in->cur[0];
4219 if (cur != '<') {
4220 ctxt->instate = XML_PARSER_CONTENT;
4221#ifdef DEBUG_PUSH
4222 xmlGenericError(xmlGenericErrorContext,
4223 "HPP: entering CONTENT\n");
4224#endif
4225 break;
4226 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004227 if (in->cur[1] == '/') {
4228 ctxt->instate = XML_PARSER_END_TAG;
4229 ctxt->checkIndex = 0;
4230#ifdef DEBUG_PUSH
4231 xmlGenericError(xmlGenericErrorContext,
4232 "HPP: entering END_TAG\n");
4233#endif
4234 break;
4235 }
Owen Taylor3473f882001-02-23 17:55:21 +00004236 if ((!terminate) &&
4237 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4238 goto done;
4239
4240 oldname = xmlStrdup(ctxt->name);
4241 htmlParseStartTag(ctxt);
4242 name = ctxt->name;
4243#ifdef DEBUG
4244 if (oldname == NULL)
4245 xmlGenericError(xmlGenericErrorContext,
4246 "Start of element %s\n", name);
4247 else if (name == NULL)
4248 xmlGenericError(xmlGenericErrorContext,
4249 "Start of element failed, was %s\n",
4250 oldname);
4251 else
4252 xmlGenericError(xmlGenericErrorContext,
4253 "Start of element %s, was %s\n",
4254 name, oldname);
4255#endif
4256 if (((depth == ctxt->nameNr) &&
4257 (xmlStrEqual(oldname, ctxt->name))) ||
4258 (name == NULL)) {
4259 if (CUR == '>')
4260 NEXT;
4261 if (oldname != NULL)
4262 xmlFree(oldname);
4263 break;
4264 }
4265 if (oldname != NULL)
4266 xmlFree(oldname);
4267
4268 /*
4269 * Lookup the info for that element.
4270 */
4271 info = htmlTagLookup(name);
4272 if (info == NULL) {
4273 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4274 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4275 name);
4276 ctxt->wellFormed = 0;
4277 } else if (info->depr) {
4278 /***************************
4279 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4280 ctxt->sax->warning(ctxt->userData,
4281 "Tag %s is deprecated\n",
4282 name);
4283 ***************************/
4284 }
4285
4286 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004287 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004288 */
4289 if ((CUR == '/') && (NXT(1) == '>')) {
4290 SKIP(2);
4291 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4292 ctxt->sax->endElement(ctxt->userData, name);
4293 oldname = htmlnamePop(ctxt);
4294#ifdef DEBUG
4295 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4296 oldname);
4297#endif
4298 if (oldname != NULL)
4299 xmlFree(oldname);
4300 ctxt->instate = XML_PARSER_CONTENT;
4301#ifdef DEBUG_PUSH
4302 xmlGenericError(xmlGenericErrorContext,
4303 "HPP: entering CONTENT\n");
4304#endif
4305 break;
4306 }
4307
4308 if (CUR == '>') {
4309 NEXT;
4310 } else {
4311 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4312 ctxt->sax->error(ctxt->userData,
4313 "Couldn't find end of Start Tag %s\n",
4314 name);
4315 ctxt->wellFormed = 0;
4316
4317 /*
4318 * end of parsing of this node.
4319 */
4320 if (xmlStrEqual(name, ctxt->name)) {
4321 nodePop(ctxt);
4322 oldname = htmlnamePop(ctxt);
4323#ifdef DEBUG
4324 xmlGenericError(xmlGenericErrorContext,
4325 "End of start tag problem: popping out %s\n", oldname);
4326#endif
4327 if (oldname != NULL)
4328 xmlFree(oldname);
4329 }
4330
4331 ctxt->instate = XML_PARSER_CONTENT;
4332#ifdef DEBUG_PUSH
4333 xmlGenericError(xmlGenericErrorContext,
4334 "HPP: entering CONTENT\n");
4335#endif
4336 break;
4337 }
4338
4339 /*
4340 * Check for an Empty Element from DTD definition
4341 */
4342 if ((info != NULL) && (info->empty)) {
4343 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4344 ctxt->sax->endElement(ctxt->userData, name);
4345 oldname = htmlnamePop(ctxt);
4346#ifdef DEBUG
4347 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4348#endif
4349 if (oldname != NULL)
4350 xmlFree(oldname);
4351 }
4352 ctxt->instate = XML_PARSER_CONTENT;
4353#ifdef DEBUG_PUSH
4354 xmlGenericError(xmlGenericErrorContext,
4355 "HPP: entering CONTENT\n");
4356#endif
4357 break;
4358 }
4359 case XML_PARSER_CONTENT: {
4360 long cons;
4361 /*
4362 * Handle preparsed entities and charRef
4363 */
4364 if (ctxt->token != 0) {
4365 xmlChar chr[2] = { 0 , 0 } ;
4366
4367 chr[0] = (xmlChar) ctxt->token;
4368 htmlCheckParagraph(ctxt);
4369 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4370 ctxt->sax->characters(ctxt->userData, chr, 1);
4371 ctxt->token = 0;
4372 ctxt->checkIndex = 0;
4373 }
4374 if ((avail == 1) && (terminate)) {
4375 cur = in->cur[0];
4376 if ((cur != '<') && (cur != '&')) {
4377 if (ctxt->sax != NULL) {
4378 if (IS_BLANK(cur)) {
4379 if (ctxt->sax->ignorableWhitespace != NULL)
4380 ctxt->sax->ignorableWhitespace(
4381 ctxt->userData, &cur, 1);
4382 } else {
4383 htmlCheckParagraph(ctxt);
4384 if (ctxt->sax->characters != NULL)
4385 ctxt->sax->characters(
4386 ctxt->userData, &cur, 1);
4387 }
4388 }
4389 ctxt->token = 0;
4390 ctxt->checkIndex = 0;
4391 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004392 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004393 }
Owen Taylor3473f882001-02-23 17:55:21 +00004394 }
4395 if (avail < 2)
4396 goto done;
4397 cur = in->cur[0];
4398 next = in->cur[1];
4399 cons = ctxt->nbChars;
4400 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4401 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4402 /*
4403 * Handle SCRIPT/STYLE separately
4404 */
4405 if ((!terminate) &&
4406 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4407 goto done;
4408 htmlParseScript(ctxt);
4409 if ((cur == '<') && (next == '/')) {
4410 ctxt->instate = XML_PARSER_END_TAG;
4411 ctxt->checkIndex = 0;
4412#ifdef DEBUG_PUSH
4413 xmlGenericError(xmlGenericErrorContext,
4414 "HPP: entering END_TAG\n");
4415#endif
4416 break;
4417 }
4418 } else {
4419 /*
4420 * Sometimes DOCTYPE arrives in the middle of the document
4421 */
4422 if ((cur == '<') && (next == '!') &&
4423 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4424 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4425 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4426 (UPP(8) == 'E')) {
4427 if ((!terminate) &&
4428 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4429 goto done;
4430 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4431 ctxt->sax->error(ctxt->userData,
4432 "Misplaced DOCTYPE declaration\n");
4433 ctxt->wellFormed = 0;
4434 htmlParseDocTypeDecl(ctxt);
4435 } else if ((cur == '<') && (next == '!') &&
4436 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4437 if ((!terminate) &&
4438 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4439 goto done;
4440#ifdef DEBUG_PUSH
4441 xmlGenericError(xmlGenericErrorContext,
4442 "HPP: Parsing Comment\n");
4443#endif
4444 htmlParseComment(ctxt);
4445 ctxt->instate = XML_PARSER_CONTENT;
4446 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4447 goto done;
4448 } else if ((cur == '<') && (next == '/')) {
4449 ctxt->instate = XML_PARSER_END_TAG;
4450 ctxt->checkIndex = 0;
4451#ifdef DEBUG_PUSH
4452 xmlGenericError(xmlGenericErrorContext,
4453 "HPP: entering END_TAG\n");
4454#endif
4455 break;
4456 } else if (cur == '<') {
4457 ctxt->instate = XML_PARSER_START_TAG;
4458 ctxt->checkIndex = 0;
4459#ifdef DEBUG_PUSH
4460 xmlGenericError(xmlGenericErrorContext,
4461 "HPP: entering START_TAG\n");
4462#endif
4463 break;
4464 } else if (cur == '&') {
4465 if ((!terminate) &&
4466 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4467 goto done;
4468#ifdef DEBUG_PUSH
4469 xmlGenericError(xmlGenericErrorContext,
4470 "HPP: Parsing Reference\n");
4471#endif
4472 /* TODO: check generation of subtrees if noent !!! */
4473 htmlParseReference(ctxt);
4474 } else {
4475 /* TODO Avoid the extra copy, handle directly !!!!!! */
4476 /*
4477 * Goal of the following test is :
4478 * - minimize calls to the SAX 'character' callback
4479 * when they are mergeable
4480 */
4481 if ((ctxt->inputNr == 1) &&
4482 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4483 if ((!terminate) &&
4484 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4485 goto done;
4486 }
4487 ctxt->checkIndex = 0;
4488#ifdef DEBUG_PUSH
4489 xmlGenericError(xmlGenericErrorContext,
4490 "HPP: Parsing char data\n");
4491#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004492 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004493 }
4494 }
4495 if (cons == ctxt->nbChars) {
4496 if (ctxt->node != NULL) {
4497 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4498 ctxt->sax->error(ctxt->userData,
4499 "detected an error in element content\n");
4500 ctxt->wellFormed = 0;
4501 }
4502 NEXT;
4503 break;
4504 }
4505
4506 break;
4507 }
4508 case XML_PARSER_END_TAG:
4509 if (avail < 2)
4510 goto done;
4511 if ((!terminate) &&
4512 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4513 goto done;
4514 htmlParseEndTag(ctxt);
4515 if (ctxt->nameNr == 0) {
4516 ctxt->instate = XML_PARSER_EPILOG;
4517 } else {
4518 ctxt->instate = XML_PARSER_CONTENT;
4519 }
4520 ctxt->checkIndex = 0;
4521#ifdef DEBUG_PUSH
4522 xmlGenericError(xmlGenericErrorContext,
4523 "HPP: entering CONTENT\n");
4524#endif
4525 break;
4526 case XML_PARSER_CDATA_SECTION:
4527 xmlGenericError(xmlGenericErrorContext,
4528 "HPP: internal error, state == CDATA\n");
4529 ctxt->instate = XML_PARSER_CONTENT;
4530 ctxt->checkIndex = 0;
4531#ifdef DEBUG_PUSH
4532 xmlGenericError(xmlGenericErrorContext,
4533 "HPP: entering CONTENT\n");
4534#endif
4535 break;
4536 case XML_PARSER_DTD:
4537 xmlGenericError(xmlGenericErrorContext,
4538 "HPP: internal error, state == DTD\n");
4539 ctxt->instate = XML_PARSER_CONTENT;
4540 ctxt->checkIndex = 0;
4541#ifdef DEBUG_PUSH
4542 xmlGenericError(xmlGenericErrorContext,
4543 "HPP: entering CONTENT\n");
4544#endif
4545 break;
4546 case XML_PARSER_COMMENT:
4547 xmlGenericError(xmlGenericErrorContext,
4548 "HPP: internal error, state == COMMENT\n");
4549 ctxt->instate = XML_PARSER_CONTENT;
4550 ctxt->checkIndex = 0;
4551#ifdef DEBUG_PUSH
4552 xmlGenericError(xmlGenericErrorContext,
4553 "HPP: entering CONTENT\n");
4554#endif
4555 break;
4556 case XML_PARSER_PI:
4557 xmlGenericError(xmlGenericErrorContext,
4558 "HPP: internal error, state == PI\n");
4559 ctxt->instate = XML_PARSER_CONTENT;
4560 ctxt->checkIndex = 0;
4561#ifdef DEBUG_PUSH
4562 xmlGenericError(xmlGenericErrorContext,
4563 "HPP: entering CONTENT\n");
4564#endif
4565 break;
4566 case XML_PARSER_ENTITY_DECL:
4567 xmlGenericError(xmlGenericErrorContext,
4568 "HPP: internal error, state == ENTITY_DECL\n");
4569 ctxt->instate = XML_PARSER_CONTENT;
4570 ctxt->checkIndex = 0;
4571#ifdef DEBUG_PUSH
4572 xmlGenericError(xmlGenericErrorContext,
4573 "HPP: entering CONTENT\n");
4574#endif
4575 break;
4576 case XML_PARSER_ENTITY_VALUE:
4577 xmlGenericError(xmlGenericErrorContext,
4578 "HPP: internal error, state == ENTITY_VALUE\n");
4579 ctxt->instate = XML_PARSER_CONTENT;
4580 ctxt->checkIndex = 0;
4581#ifdef DEBUG_PUSH
4582 xmlGenericError(xmlGenericErrorContext,
4583 "HPP: entering DTD\n");
4584#endif
4585 break;
4586 case XML_PARSER_ATTRIBUTE_VALUE:
4587 xmlGenericError(xmlGenericErrorContext,
4588 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4589 ctxt->instate = XML_PARSER_START_TAG;
4590 ctxt->checkIndex = 0;
4591#ifdef DEBUG_PUSH
4592 xmlGenericError(xmlGenericErrorContext,
4593 "HPP: entering START_TAG\n");
4594#endif
4595 break;
4596 case XML_PARSER_SYSTEM_LITERAL:
4597 xmlGenericError(xmlGenericErrorContext,
4598 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4599 ctxt->instate = XML_PARSER_CONTENT;
4600 ctxt->checkIndex = 0;
4601#ifdef DEBUG_PUSH
4602 xmlGenericError(xmlGenericErrorContext,
4603 "HPP: entering CONTENT\n");
4604#endif
4605 break;
4606 case XML_PARSER_IGNORE:
4607 xmlGenericError(xmlGenericErrorContext,
4608 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4609 ctxt->instate = XML_PARSER_CONTENT;
4610 ctxt->checkIndex = 0;
4611#ifdef DEBUG_PUSH
4612 xmlGenericError(xmlGenericErrorContext,
4613 "HPP: entering CONTENT\n");
4614#endif
4615 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004616 case XML_PARSER_PUBLIC_LITERAL:
4617 xmlGenericError(xmlGenericErrorContext,
4618 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4619 ctxt->instate = XML_PARSER_CONTENT;
4620 ctxt->checkIndex = 0;
4621#ifdef DEBUG_PUSH
4622 xmlGenericError(xmlGenericErrorContext,
4623 "HPP: entering CONTENT\n");
4624#endif
4625 break;
4626
Owen Taylor3473f882001-02-23 17:55:21 +00004627 }
4628 }
4629done:
4630 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004631 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004632 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4633 /*
4634 * SAX: end of the document processing.
4635 */
4636 ctxt->instate = XML_PARSER_EOF;
4637 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4638 ctxt->sax->endDocument(ctxt->userData);
4639 }
4640 }
4641 if ((ctxt->myDoc != NULL) &&
4642 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4643 (ctxt->instate == XML_PARSER_EPILOG))) {
4644 xmlDtdPtr dtd;
4645 dtd = xmlGetIntSubset(ctxt->myDoc);
4646 if (dtd == NULL)
4647 ctxt->myDoc->intSubset =
4648 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4649 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4650 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4651 }
4652#ifdef DEBUG_PUSH
4653 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4654#endif
4655 return(ret);
4656}
4657
4658/**
Owen Taylor3473f882001-02-23 17:55:21 +00004659 * htmlParseChunk:
4660 * @ctxt: an XML parser context
4661 * @chunk: an char array
4662 * @size: the size in byte of the chunk
4663 * @terminate: last chunk indicator
4664 *
4665 * Parse a Chunk of memory
4666 *
4667 * Returns zero if no error, the xmlParserErrors otherwise.
4668 */
4669int
4670htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4671 int terminate) {
4672 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4673 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4674 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4675 int cur = ctxt->input->cur - ctxt->input->base;
4676
4677 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4678 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4679 ctxt->input->cur = ctxt->input->base + cur;
4680#ifdef DEBUG_PUSH
4681 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4682#endif
4683
4684 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4685 htmlParseTryOrFinish(ctxt, terminate);
4686 } else if (ctxt->instate != XML_PARSER_EOF) {
4687 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4688 htmlParseTryOrFinish(ctxt, terminate);
4689 }
4690 if (terminate) {
4691 if ((ctxt->instate != XML_PARSER_EOF) &&
4692 (ctxt->instate != XML_PARSER_EPILOG) &&
4693 (ctxt->instate != XML_PARSER_MISC)) {
4694 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004695 ctxt->wellFormed = 0;
4696 }
4697 if (ctxt->instate != XML_PARSER_EOF) {
4698 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4699 ctxt->sax->endDocument(ctxt->userData);
4700 }
4701 ctxt->instate = XML_PARSER_EOF;
4702 }
4703 return((xmlParserErrors) ctxt->errNo);
4704}
4705
4706/************************************************************************
4707 * *
4708 * User entry points *
4709 * *
4710 ************************************************************************/
4711
4712/**
4713 * htmlCreatePushParserCtxt :
4714 * @sax: a SAX handler
4715 * @user_data: The user data returned on SAX callbacks
4716 * @chunk: a pointer to an array of chars
4717 * @size: number of chars in the array
4718 * @filename: an optional file name or URI
4719 * @enc: an optional encoding
4720 *
4721 * Create a parser context for using the HTML parser in push mode
4722 * To allow content encoding detection, @size should be >= 4
4723 * The value of @filename is used for fetching external entities
4724 * and error/warning reports.
4725 *
4726 * Returns the new parser context or NULL
4727 */
4728htmlParserCtxtPtr
4729htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4730 const char *chunk, int size, const char *filename,
4731 xmlCharEncoding enc) {
4732 htmlParserCtxtPtr ctxt;
4733 htmlParserInputPtr inputStream;
4734 xmlParserInputBufferPtr buf;
4735
Daniel Veillardd0463562001-10-13 09:15:48 +00004736 xmlInitParser();
4737
Owen Taylor3473f882001-02-23 17:55:21 +00004738 buf = xmlAllocParserInputBuffer(enc);
4739 if (buf == NULL) return(NULL);
4740
4741 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4742 if (ctxt == NULL) {
4743 xmlFree(buf);
4744 return(NULL);
4745 }
4746 memset(ctxt, 0, sizeof(htmlParserCtxt));
4747 htmlInitParserCtxt(ctxt);
4748 if (sax != NULL) {
4749 if (ctxt->sax != &htmlDefaultSAXHandler)
4750 xmlFree(ctxt->sax);
4751 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4752 if (ctxt->sax == NULL) {
4753 xmlFree(buf);
4754 xmlFree(ctxt);
4755 return(NULL);
4756 }
4757 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4758 if (user_data != NULL)
4759 ctxt->userData = user_data;
4760 }
4761 if (filename == NULL) {
4762 ctxt->directory = NULL;
4763 } else {
4764 ctxt->directory = xmlParserGetDirectory(filename);
4765 }
4766
4767 inputStream = htmlNewInputStream(ctxt);
4768 if (inputStream == NULL) {
4769 xmlFreeParserCtxt(ctxt);
4770 return(NULL);
4771 }
4772
4773 if (filename == NULL)
4774 inputStream->filename = NULL;
4775 else
4776 inputStream->filename = xmlMemStrdup(filename);
4777 inputStream->buf = buf;
4778 inputStream->base = inputStream->buf->buffer->content;
4779 inputStream->cur = inputStream->buf->buffer->content;
4780
4781 inputPush(ctxt, inputStream);
4782
4783 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4784 (ctxt->input->buf != NULL)) {
4785 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4786#ifdef DEBUG_PUSH
4787 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4788#endif
4789 }
4790
4791 return(ctxt);
4792}
4793
4794/**
4795 * htmlSAXParseDoc :
4796 * @cur: a pointer to an array of xmlChar
4797 * @encoding: a free form C string describing the HTML document encoding, or NULL
4798 * @sax: the SAX handler block
4799 * @userData: if using SAX, this pointer will be provided on callbacks.
4800 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004801 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4802 * to handle parse events. If sax is NULL, fallback to the default DOM
4803 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004804 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004805 * Returns the resulting document tree unless SAX is NULL or the document is
4806 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004807 */
4808
4809htmlDocPtr
4810htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4811 htmlDocPtr ret;
4812 htmlParserCtxtPtr ctxt;
4813
Daniel Veillardd0463562001-10-13 09:15:48 +00004814 xmlInitParser();
4815
Owen Taylor3473f882001-02-23 17:55:21 +00004816 if (cur == NULL) return(NULL);
4817
4818
4819 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4820 if (ctxt == NULL) return(NULL);
4821 if (sax != NULL) {
4822 ctxt->sax = sax;
4823 ctxt->userData = userData;
4824 }
4825
4826 htmlParseDocument(ctxt);
4827 ret = ctxt->myDoc;
4828 if (sax != NULL) {
4829 ctxt->sax = NULL;
4830 ctxt->userData = NULL;
4831 }
4832 htmlFreeParserCtxt(ctxt);
4833
4834 return(ret);
4835}
4836
4837/**
4838 * htmlParseDoc :
4839 * @cur: a pointer to an array of xmlChar
4840 * @encoding: a free form C string describing the HTML document encoding, or NULL
4841 *
4842 * parse an HTML in-memory document and build a tree.
4843 *
4844 * Returns the resulting document tree
4845 */
4846
4847htmlDocPtr
4848htmlParseDoc(xmlChar *cur, const char *encoding) {
4849 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4850}
4851
4852
4853/**
4854 * htmlCreateFileParserCtxt :
4855 * @filename: the filename
4856 * @encoding: a free form C string describing the HTML document encoding, or NULL
4857 *
4858 * Create a parser context for a file content.
4859 * Automatic support for ZLIB/Compress compressed document is provided
4860 * by default if found at compile-time.
4861 *
4862 * Returns the new parser context or NULL
4863 */
4864htmlParserCtxtPtr
4865htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4866{
4867 htmlParserCtxtPtr ctxt;
4868 htmlParserInputPtr inputStream;
4869 xmlParserInputBufferPtr buf;
4870 /* htmlCharEncoding enc; */
4871 xmlChar *content, *content_line = (xmlChar *) "charset=";
4872
4873 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4874 if (buf == NULL) return(NULL);
4875
4876 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4877 if (ctxt == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00004878 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004879 return(NULL);
4880 }
4881 memset(ctxt, 0, sizeof(htmlParserCtxt));
4882 htmlInitParserCtxt(ctxt);
4883 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4884 if (inputStream == NULL) {
Daniel Veillard3487c8d2002-09-05 11:33:25 +00004885 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004886 xmlFree(ctxt);
4887 return(NULL);
4888 }
4889 memset(inputStream, 0, sizeof(htmlParserInput));
4890
Daniel Veillarda646cfd2002-09-17 21:50:03 +00004891 inputStream->filename = (char *)
4892 xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor3473f882001-02-23 17:55:21 +00004893 inputStream->line = 1;
4894 inputStream->col = 1;
4895 inputStream->buf = buf;
4896 inputStream->directory = NULL;
4897
4898 inputStream->base = inputStream->buf->buffer->content;
4899 inputStream->cur = inputStream->buf->buffer->content;
4900 inputStream->free = NULL;
4901
4902 inputPush(ctxt, inputStream);
4903
4904 /* set encoding */
4905 if (encoding) {
4906 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4907 if (content) {
4908 strcpy ((char *)content, (char *)content_line);
4909 strcat ((char *)content, (char *)encoding);
4910 htmlCheckEncoding (ctxt, content);
4911 xmlFree (content);
4912 }
4913 }
4914
4915 return(ctxt);
4916}
4917
4918/**
4919 * htmlSAXParseFile :
4920 * @filename: the filename
4921 * @encoding: a free form C string describing the HTML document encoding, or NULL
4922 * @sax: the SAX handler block
4923 * @userData: if using SAX, this pointer will be provided on callbacks.
4924 *
4925 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4926 * compressed document is provided by default if found at compile-time.
4927 * It use the given SAX function block to handle the parsing callback.
4928 * If sax is NULL, fallback to the default DOM tree building routines.
4929 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004930 * Returns the resulting document tree unless SAX is NULL or the document is
4931 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004932 */
4933
4934htmlDocPtr
4935htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4936 void *userData) {
4937 htmlDocPtr ret;
4938 htmlParserCtxtPtr ctxt;
4939 htmlSAXHandlerPtr oldsax = NULL;
4940
Daniel Veillardd0463562001-10-13 09:15:48 +00004941 xmlInitParser();
4942
Owen Taylor3473f882001-02-23 17:55:21 +00004943 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4944 if (ctxt == NULL) return(NULL);
4945 if (sax != NULL) {
4946 oldsax = ctxt->sax;
4947 ctxt->sax = sax;
4948 ctxt->userData = userData;
4949 }
4950
4951 htmlParseDocument(ctxt);
4952
4953 ret = ctxt->myDoc;
4954 if (sax != NULL) {
4955 ctxt->sax = oldsax;
4956 ctxt->userData = NULL;
4957 }
4958 htmlFreeParserCtxt(ctxt);
4959
4960 return(ret);
4961}
4962
4963/**
4964 * htmlParseFile :
4965 * @filename: the filename
4966 * @encoding: a free form C string describing the HTML document encoding, or NULL
4967 *
4968 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4969 * compressed document is provided by default if found at compile-time.
4970 *
4971 * Returns the resulting document tree
4972 */
4973
4974htmlDocPtr
4975htmlParseFile(const char *filename, const char *encoding) {
4976 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4977}
4978
4979/**
4980 * htmlHandleOmittedElem:
4981 * @val: int 0 or 1
4982 *
4983 * Set and return the previous value for handling HTML omitted tags.
4984 *
4985 * Returns the last value for 0 for no handling, 1 for auto insertion.
4986 */
4987
4988int
4989htmlHandleOmittedElem(int val) {
4990 int old = htmlOmittedDefaultValue;
4991
4992 htmlOmittedDefaultValue = val;
4993 return(old);
4994}
4995
4996#endif /* LIBXML_HTML_ENABLED */