blob: 014acc979256ba9feab57fffa883f6d06506a829 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000043#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000044
45#define HTML_MAX_NAMELEN 1000
46#define HTML_PARSER_BIG_BUFFER_SIZE 1000
47#define HTML_PARSER_BUFFER_SIZE 100
48
49/* #define DEBUG */
50/* #define DEBUG_PUSH */
51
Daniel Veillard22090732001-07-16 00:06:07 +000052static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000053
Daniel Veillard56a4cb82001-03-24 17:00:36 +000054xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
55 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000056static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000057
58/************************************************************************
59 * *
Owen Taylor3473f882001-02-23 17:55:21 +000060 * Parser stacks related functions and macros *
61 * *
62 ************************************************************************/
63
64/*
65 * Generic function for accessing stacks in the Parser Context
66 */
67
68#define PUSH_AND_POP(scope, type, name) \
69scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
70 if (ctxt->name##Nr >= ctxt->name##Max) { \
71 ctxt->name##Max *= 2; \
72 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
73 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74 if (ctxt->name##Tab == NULL) { \
75 xmlGenericError(xmlGenericErrorContext, \
76 "realloc failed !\n"); \
77 return(0); \
78 } \
79 } \
80 ctxt->name##Tab[ctxt->name##Nr] = value; \
81 ctxt->name = value; \
82 return(ctxt->name##Nr++); \
83} \
84scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
85 type ret; \
86 if (ctxt->name##Nr < 0) return(0); \
87 ctxt->name##Nr--; \
88 if (ctxt->name##Nr < 0) return(0); \
89 if (ctxt->name##Nr > 0) \
90 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
91 else \
92 ctxt->name = NULL; \
93 ret = ctxt->name##Tab[ctxt->name##Nr]; \
94 ctxt->name##Tab[ctxt->name##Nr] = 0; \
95 return(ret); \
96} \
97
Daniel Veillard56a4cb82001-03-24 17:00:36 +000098/* PUSH_AND_POP(static, xmlNodePtr, node) */
99PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000100
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to
 * use them:
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if
 *           compiled in ISO-Latin or UTF-8, and the current 16 bit value
 *           if compiled in UNICODE mode. This should be used internally by
 *           the parser only to compare to ASCII values, otherwise it would
 *           break when running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as
 *           CUR, it should be used only to compare on ASCII based
 *           substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII
 *           defined strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding:
 *
 *   CURRENT Returns the current char value as an int.
 *   NEXT    Skip to the next character through xmlNextChar() and count it
 *           in ctxt->nbChars.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *           (NOTE(review): no COPY macro is defined in this section;
 *           COPY_BUF below is the visible equivalent - confirm).
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * NEXTL(l): advance the input by l bytes (one character), keeping the
 * line/column counters up to date and clearing any pending token.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/************
 * Disabled leftover from the XML parser version of NEXTL:
 *
 *  if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);
 *  if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * COPY_BUF(l,b,i,v): append character v, whose encoded length is l, to
 * buffer b at index i and advance i accordingly. Wrapped in do/while so
 * it behaves as a single statement.
 */
#define COPY_BUF(l,b,i,v)                                               \
    do {                                                                \
        if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                       \
        else (i) += xmlCopyChar((l), &(b)[(i)], (v));                   \
    } while (0)
177
178/**
179 * htmlCurrentChar:
180 * @ctxt: the HTML parser context
181 * @len: pointer to the length of the char read
182 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000183 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000184 * bytes in the input buffer. Implement the end of line normalization:
185 * 2.11 End-of-Line Handling
186 * If the encoding is unspecified, in the case we find an ISO-Latin-1
187 * char, then the encoding converter is plugged in automatically.
188 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000189 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000190 */
191
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000192static int
Owen Taylor3473f882001-02-23 17:55:21 +0000193htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
194 if (ctxt->instate == XML_PARSER_EOF)
195 return(0);
196
197 if (ctxt->token != 0) {
198 *len = 0;
199 return(ctxt->token);
200 }
201 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
202 /*
203 * We are supposed to handle UTF8, check it's valid
204 * From rfc2044: encoding of the Unicode values on UTF-8:
205 *
206 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
207 * 0000 0000-0000 007F 0xxxxxxx
208 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
209 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
210 *
211 * Check for the 0x110000 limit too
212 */
213 const unsigned char *cur = ctxt->input->cur;
214 unsigned char c;
215 unsigned int val;
216
217 c = *cur;
218 if (c & 0x80) {
219 if (cur[1] == 0)
220 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
221 if ((cur[1] & 0xc0) != 0x80)
222 goto encoding_error;
223 if ((c & 0xe0) == 0xe0) {
224
225 if (cur[2] == 0)
226 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
227 if ((cur[2] & 0xc0) != 0x80)
228 goto encoding_error;
229 if ((c & 0xf0) == 0xf0) {
230 if (cur[3] == 0)
231 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
232 if (((c & 0xf8) != 0xf0) ||
233 ((cur[3] & 0xc0) != 0x80))
234 goto encoding_error;
235 /* 4-byte code */
236 *len = 4;
237 val = (cur[0] & 0x7) << 18;
238 val |= (cur[1] & 0x3f) << 12;
239 val |= (cur[2] & 0x3f) << 6;
240 val |= cur[3] & 0x3f;
241 } else {
242 /* 3-byte code */
243 *len = 3;
244 val = (cur[0] & 0xf) << 12;
245 val |= (cur[1] & 0x3f) << 6;
246 val |= cur[2] & 0x3f;
247 }
248 } else {
249 /* 2-byte code */
250 *len = 2;
251 val = (cur[0] & 0x1f) << 6;
252 val |= cur[1] & 0x3f;
253 }
254 if (!IS_CHAR(val)) {
255 ctxt->errNo = XML_ERR_INVALID_ENCODING;
256 if ((ctxt->sax != NULL) &&
257 (ctxt->sax->error != NULL))
258 ctxt->sax->error(ctxt->userData,
259 "Char 0x%X out of allowed range\n", val);
260 ctxt->wellFormed = 0;
261 ctxt->disableSAX = 1;
262 }
263 return(val);
264 } else {
265 /* 1-byte code */
266 *len = 1;
267 return((int) *ctxt->input->cur);
268 }
269 }
270 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000271 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000272 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000273 * XML constructs only use < 128 chars
274 */
275 *len = 1;
276 if ((int) *ctxt->input->cur < 0x80)
277 return((int) *ctxt->input->cur);
278
279 /*
280 * Humm this is bad, do an automatic flow conversion
281 */
282 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
283 ctxt->charset = XML_CHAR_ENCODING_UTF8;
284 return(xmlCurrentChar(ctxt, len));
285
286encoding_error:
287 /*
288 * If we detect an UTF8 error that probably mean that the
289 * input encoding didn't get properly advertized in the
290 * declaration header. Report the error and switch the encoding
291 * to ISO-Latin-1 (if you don't like this policy, just declare the
292 * encoding !)
293 */
294 ctxt->errNo = XML_ERR_INVALID_ENCODING;
295 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
296 ctxt->sax->error(ctxt->userData,
297 "Input is not proper UTF-8, indicate encoding !\n");
298 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
299 ctxt->input->cur[0], ctxt->input->cur[1],
300 ctxt->input->cur[2], ctxt->input->cur[3]);
301 }
302
303 ctxt->charset = XML_CHAR_ENCODING_8859_1;
304 *len = 1;
305 return((int) *ctxt->input->cur);
306}
307
308/**
Owen Taylor3473f882001-02-23 17:55:21 +0000309 * htmlSkipBlankChars:
310 * @ctxt: the HTML parser context
311 *
312 * skip all blanks character found at that point in the input streams.
313 *
314 * Returns the number of space chars skipped
315 */
316
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000317static int
Owen Taylor3473f882001-02-23 17:55:21 +0000318htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
319 int res = 0;
320
321 while (IS_BLANK(*(ctxt->input->cur))) {
322 if ((*ctxt->input->cur == 0) &&
323 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
324 xmlPopInput(ctxt);
325 } else {
326 if (*(ctxt->input->cur) == '\n') {
327 ctxt->input->line++; ctxt->input->col = 1;
328 } else ctxt->input->col++;
329 ctxt->input->cur++;
330 ctxt->nbChars++;
331 if (*ctxt->input->cur == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 }
334 res++;
335 }
336 return(res);
337}
338
339
340
341/************************************************************************
342 * *
343 * The list of HTML elements and their properties *
344 * *
345 ************************************************************************/
346
347/*
348 * Start Tag: 1 means the start tag can be ommited
349 * End Tag: 1 means the end tag can be ommited
350 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000351 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000352 * Depr: this element is deprecated
353 * DTD: 1 means that this element is valid only in the Loose DTD
354 * 2 means that this element is valid only in the Frameset DTD
355 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000356 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000357 */
Daniel Veillard22090732001-07-16 00:06:07 +0000358static const htmlElemDesc
359html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000360{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
361{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
362{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
363{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
364{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
365{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
366{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
367{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
368{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
369{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
370{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
371{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
372{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
373{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
374{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
375{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
376{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
377{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
378{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
379{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
380{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
381{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
382{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
383{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
384{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
385{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
386{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
387{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
388{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
389{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
390{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
391{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
392{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
393{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
394{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
400{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
401{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
402{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
403{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
404{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
405{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
406{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
407{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
408{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
409{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
410{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
411{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
412{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
413{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
414{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
415{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
416{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
417{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
418{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
419{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
420{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
421{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
422{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
423{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
424{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
425{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
426{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
427{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
428{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
429{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
430{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
431{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
432{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
433{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
434{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
435{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
436{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
437{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
438{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
439{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
440{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
441{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
442{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
443{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
444{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
445{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
446{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
447{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
448{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
449{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
450{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000451};
452
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups. The first string of
 * each group is a start tag; the strings that follow it, up to the NULL,
 * are the open elements that this start tag implicitly closes. An extra
 * NULL after the last group terminates the whole list.
 */
static const char *htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",          "p", "head", NULL,
"noscript",     "p", "head", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"option",       "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
NULL
};
512
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The list is terminated by a NULL entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html",
    "head",
    "body",
    NULL
};
526
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no terminating NULL entry; its length has to be derived
 * with sizeof.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",          /* was misspelled "onrest" */
    "onchange",
    "onselect"
};
552
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

/*
 * The entry with a NULL name terminates the table and carries the
 * priority used for every element that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000580
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000581static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000582static int htmlStartCloseIndexinitialized = 0;
583
584/************************************************************************
585 * *
586 * functions to handle HTML specific data *
587 * *
588 ************************************************************************/
589
590/**
591 * htmlInitAutoClose:
592 *
593 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
594 * This is not reentrant. Call xmlInitParser() once before processing in
595 * case of use in multithreaded programs.
596 */
597void
598htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000599 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000600
601 if (htmlStartCloseIndexinitialized) return;
602
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000603 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
604 indx = 0;
605 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
606 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000607 while (htmlStartClose[i] != NULL) i++;
608 i++;
609 }
610 htmlStartCloseIndexinitialized = 1;
611}
612
613/**
614 * htmlTagLookup:
615 * @tag: The tag name in lowercase
616 *
617 * Lookup the HTML tag in the ElementTable
618 *
619 * Returns the related htmlElemDescPtr or NULL if not found.
620 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000621const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000622htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000623 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000624
625 for (i = 0; i < (sizeof(html40ElementTable) /
626 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000627 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000628 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000629 }
630 return(NULL);
631}
632
633/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000634 * htmlGetEndPriority:
635 * @name: The name of the element to look up the priority for.
636 *
637 * Return value: The "endtag" priority.
638 **/
639static int
640htmlGetEndPriority (const xmlChar *name) {
641 int i = 0;
642
643 while ((htmlEndPriority[i].name != NULL) &&
644 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
645 i++;
646
647 return(htmlEndPriority[i].priority);
648}
649
650/**
Owen Taylor3473f882001-02-23 17:55:21 +0000651 * htmlCheckAutoClose:
652 * @newtag: The new tag name
653 * @oldtag: The old tag name
654 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000655 * Checks whether the new tag is one of the registered valid tags for
656 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000657 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
658 *
659 * Returns 0 if no, 1 if yes.
660 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000661static int
Owen Taylor3473f882001-02-23 17:55:21 +0000662htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000663 int i, indx;
664 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000665
666 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
667
668 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000669 for (indx = 0; indx < 100;indx++) {
670 closed = htmlStartCloseIndex[indx];
671 if (closed == NULL) return(0);
672 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000673 }
674
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000675 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000676 i++;
677 while (htmlStartClose[i] != NULL) {
678 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
679 return(1);
680 }
681 i++;
682 }
683 return(0);
684}
685
686/**
687 * htmlAutoCloseOnClose:
688 * @ctxt: an HTML parser context
689 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000690 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000691 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000692 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000693 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000694static void
Owen Taylor3473f882001-02-23 17:55:21 +0000695htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000696 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000697 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000698 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000699
700#ifdef DEBUG
701 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
702 for (i = 0;i < ctxt->nameNr;i++)
703 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
704#endif
705
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000706 priority = htmlGetEndPriority (newtag);
707
Owen Taylor3473f882001-02-23 17:55:21 +0000708 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000709
Owen Taylor3473f882001-02-23 17:55:21 +0000710 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000711 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000712 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000713 * or equal priority, so if we find an element with higher
714 * priority before we find an element with
715 * matching name, we just ignore this endtag
716 */
717 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000718 }
719 if (i < 0) return;
720
721 while (!xmlStrEqual(newtag, ctxt->name)) {
722 info = htmlTagLookup(ctxt->name);
723 if ((info == NULL) || (info->endTag == 1)) {
724#ifdef DEBUG
725 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
726#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000727 } else if (info->endTag == 3) {
728#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000729 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000730
Daniel Veillard56098d42001-04-24 12:51:09 +0000731#endif
732 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
733 ctxt->sax->error(ctxt->userData,
734 "Opening and ending tag mismatch: %s and %s\n",
735 newtag, ctxt->name);
736 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000737 }
738 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
739 ctxt->sax->endElement(ctxt->userData, ctxt->name);
740 oldname = htmlnamePop(ctxt);
741 if (oldname != NULL) {
742#ifdef DEBUG
743 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
744#endif
745 xmlFree(oldname);
746 }
747 }
748}
749
750/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000751 * htmlAutoCloseOnEnd:
752 * @ctxt: an HTML parser context
753 *
754 * Close all remaining tags at the end of the stream
755 */
756static void
757htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
758 xmlChar *oldname;
759 int i;
760
761 if (ctxt->nameNr == 0)
762 return;
763#ifdef DEBUG
764 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
765#endif
766
767 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
768#ifdef DEBUG
769 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
770#endif
771 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
772 ctxt->sax->endElement(ctxt->userData, ctxt->name);
773 oldname = htmlnamePop(ctxt);
774 if (oldname != NULL) {
775#ifdef DEBUG
776 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
777#endif
778 xmlFree(oldname);
779 }
780 }
781}
782
783/**
Owen Taylor3473f882001-02-23 17:55:21 +0000784 * htmlAutoClose:
785 * @ctxt: an HTML parser context
786 * @newtag: The new tag name or NULL
787 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000788 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000789 * The list is kept in htmlStartClose array. This function is
790 * called when a new tag has been detected and generates the
791 * appropriates closes if possible/needed.
792 * If newtag is NULL this mean we are at the end of the resource
793 * and we should check
794 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000795static void
Owen Taylor3473f882001-02-23 17:55:21 +0000796htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
797 xmlChar *oldname;
798 while ((newtag != NULL) && (ctxt->name != NULL) &&
799 (htmlCheckAutoClose(newtag, ctxt->name))) {
800#ifdef DEBUG
801 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
802#endif
803 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
804 ctxt->sax->endElement(ctxt->userData, ctxt->name);
805 oldname = htmlnamePop(ctxt);
806 if (oldname != NULL) {
807#ifdef DEBUG
808 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
809#endif
810 xmlFree(oldname);
811 }
812 }
813 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000814 htmlAutoCloseOnEnd(ctxt);
815 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000816 }
817 while ((newtag == NULL) && (ctxt->name != NULL) &&
818 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
819 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
820 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
821#ifdef DEBUG
822 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
823#endif
824 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
825 ctxt->sax->endElement(ctxt->userData, ctxt->name);
826 oldname = htmlnamePop(ctxt);
827 if (oldname != NULL) {
828#ifdef DEBUG
829 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
830#endif
831 xmlFree(oldname);
832 }
833 }
834
835}
836
837/**
838 * htmlAutoCloseTag:
839 * @doc: the HTML document
840 * @name: The tag name
841 * @elem: the HTML element
842 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000843 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000844 * The list is kept in htmlStartClose array. This function checks
845 * if the element or one of it's children would autoclose the
846 * given tag.
847 *
848 * Returns 1 if autoclose, 0 otherwise
849 */
850int
851htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
852 htmlNodePtr child;
853
854 if (elem == NULL) return(1);
855 if (xmlStrEqual(name, elem->name)) return(0);
856 if (htmlCheckAutoClose(elem->name, name)) return(1);
857 child = elem->children;
858 while (child != NULL) {
859 if (htmlAutoCloseTag(doc, name, child)) return(1);
860 child = child->next;
861 }
862 return(0);
863}
864
865/**
866 * htmlIsAutoClosed:
867 * @doc: the HTML document
868 * @elem: the HTML element
869 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000870 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000871 * The list is kept in htmlStartClose array. This function checks
872 * if a tag is autoclosed by one of it's child
873 *
874 * Returns 1 if autoclosed, 0 otherwise
875 */
876int
877htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
878 htmlNodePtr child;
879
880 if (elem == NULL) return(1);
881 child = elem->children;
882 while (child != NULL) {
883 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
884 child = child->next;
885 }
886 return(0);
887}
888
889/**
890 * htmlCheckImplied:
891 * @ctxt: an HTML parser context
892 * @newtag: The new tag name
893 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000894 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000895 * called when a new tag has been detected and generates the
896 * appropriates implicit tags if missing
897 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000898static void
Owen Taylor3473f882001-02-23 17:55:21 +0000899htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
900 if (!htmlOmittedDefaultValue)
901 return;
902 if (xmlStrEqual(newtag, BAD_CAST"html"))
903 return;
904 if (ctxt->nameNr <= 0) {
905#ifdef DEBUG
906 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
907#endif
908 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
909 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
910 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
911 }
912 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
913 return;
914 if ((ctxt->nameNr <= 1) &&
915 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
916 (xmlStrEqual(newtag, BAD_CAST"style")) ||
917 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
918 (xmlStrEqual(newtag, BAD_CAST"link")) ||
919 (xmlStrEqual(newtag, BAD_CAST"title")) ||
920 (xmlStrEqual(newtag, BAD_CAST"base")))) {
921 /*
922 * dropped OBJECT ... i you put it first BODY will be
923 * assumed !
924 */
925#ifdef DEBUG
926 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
927#endif
928 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
929 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
930 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
931 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
932 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
933 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
934 int i;
935 for (i = 0;i < ctxt->nameNr;i++) {
936 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
937 return;
938 }
939 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
940 return;
941 }
942 }
943
944#ifdef DEBUG
945 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
946#endif
947 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
948 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
949 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
950 }
951}
952
953/**
954 * htmlCheckParagraph
955 * @ctxt: an HTML parser context
956 *
957 * Check whether a p element need to be implied before inserting
958 * characters in the current element.
959 *
960 * Returns 1 if a paragraph has been inserted, 0 if not and -1
961 * in case of error.
962 */
963
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000964static int
Owen Taylor3473f882001-02-23 17:55:21 +0000965htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
966 const xmlChar *tag;
967 int i;
968
969 if (ctxt == NULL)
970 return(-1);
971 tag = ctxt->name;
972 if (tag == NULL) {
973 htmlAutoClose(ctxt, BAD_CAST"p");
974 htmlCheckImplied(ctxt, BAD_CAST"p");
975 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
976 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
977 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
978 return(1);
979 }
980 if (!htmlOmittedDefaultValue)
981 return(0);
982 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
983 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
984#ifdef DEBUG
985 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
986#endif
987 htmlAutoClose(ctxt, BAD_CAST"p");
988 htmlCheckImplied(ctxt, BAD_CAST"p");
989 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
990 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
991 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
992 return(1);
993 }
994 }
995 return(0);
996}
997
998/**
999 * htmlIsScriptAttribute:
1000 * @name: an attribute name
1001 *
1002 * Check if an attribute is of content type Script
1003 *
1004 * Returns 1 is the attribute is a script 0 otherwise
1005 */
1006int
1007htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001008 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001009
1010 if (name == NULL)
1011 return(0);
1012 /*
1013 * all script attributes start with 'on'
1014 */
1015 if ((name[0] != 'o') || (name[1] != 'n'))
1016 return(0);
1017 for (i = 0;
1018 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1019 i++) {
1020 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1021 return(1);
1022 }
1023 return(0);
1024}
1025
1026/************************************************************************
1027 * *
1028 * The list of HTML predefined entities *
1029 * *
1030 ************************************************************************/
1031
1032
Daniel Veillard22090732001-07-16 00:06:07 +00001033static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001034/*
1035 * the 4 absolute ones, plus apostrophe.
1036 */
1037{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1038{ 38, "amp", "ampersand, U+0026 ISOnum" },
1039{ 39, "apos", "single quote" },
1040{ 60, "lt", "less-than sign, U+003C ISOnum" },
1041{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1042
1043/*
1044 * A bunch still in the 128-255 range
1045 * Replacing them depend really on the charset used.
1046 */
1047{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1048{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1049{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1050{ 163, "pound","pound sign, U+00A3 ISOnum" },
1051{ 164, "curren","currency sign, U+00A4 ISOnum" },
1052{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1053{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1054{ 167, "sect", "section sign, U+00A7 ISOnum" },
1055{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1056{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1057{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1058{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1059{ 172, "not", "not sign, U+00AC ISOnum" },
1060{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1061{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1062{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1063{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1064{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1065{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1066{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1067{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1068{ 181, "micro","micro sign, U+00B5 ISOnum" },
1069{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1070{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1071{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1072{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1073{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1074{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1075{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1076{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1077{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1078{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1079{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1080{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1081{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1082{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1083{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1084{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1085{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1086{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1087{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1088{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1089{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1090{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1091{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1092{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1093{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1094{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1095{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1096{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1097{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1098{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1099{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1100{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1101{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1102{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1103{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1104{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1105{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1106{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1107{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1108{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1109{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1110{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1111{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1112{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1113{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1114{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1115{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1116{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1117{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1118{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1119{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1120{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1121{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1122{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1123{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1124{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1125{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1126{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1127{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1128{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1129{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1130{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1131{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1132{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1133{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1134{ 247, "divide","division sign, U+00F7 ISOnum" },
1135{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1136{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1137{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1138{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1139{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1140{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1141{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1142{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1143
1144{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1145{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1146{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1147{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1148{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1149
1150/*
1151 * Anything below should really be kept as entities references
1152 */
1153{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1154
1155{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1156{ 732, "tilde","small tilde, U+02DC ISOdia" },
1157
1158{ 913, "Alpha","greek capital letter alpha, U+0391" },
1159{ 914, "Beta", "greek capital letter beta, U+0392" },
1160{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1161{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1162{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1163{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1164{ 919, "Eta", "greek capital letter eta, U+0397" },
1165{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1166{ 921, "Iota", "greek capital letter iota, U+0399" },
1167{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001168{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001169{ 924, "Mu", "greek capital letter mu, U+039C" },
1170{ 925, "Nu", "greek capital letter nu, U+039D" },
1171{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1172{ 927, "Omicron","greek capital letter omicron, U+039F" },
1173{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1174{ 929, "Rho", "greek capital letter rho, U+03A1" },
1175{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1176{ 932, "Tau", "greek capital letter tau, U+03A4" },
1177{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1178{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1179{ 935, "Chi", "greek capital letter chi, U+03A7" },
1180{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1181{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1182
1183{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1184{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1185{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1186{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1187{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1188{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1189{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1190{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1191{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1192{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1193{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1194{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1195{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1196{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1197{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1198{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1199{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1200{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1201{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1202{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1203{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1204{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1205{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1206{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1207{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1208{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1209{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1210{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1211
1212{ 8194, "ensp", "en space, U+2002 ISOpub" },
1213{ 8195, "emsp", "em space, U+2003 ISOpub" },
1214{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1215{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1216{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1217{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1218{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1219{ 8211, "ndash","en dash, U+2013 ISOpub" },
1220{ 8212, "mdash","em dash, U+2014 ISOpub" },
1221{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1222{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1223{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1224{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1225{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1226{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1227{ 8224, "dagger","dagger, U+2020 ISOpub" },
1228{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1229
1230{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1231{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1232
1233{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1234
1235{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1236{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1237
1238{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1239{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1240
1241{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1242{ 8260, "frasl","fraction slash, U+2044 NEW" },
1243
1244{ 8364, "euro", "euro sign, U+20AC NEW" },
1245
1246{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1247{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1248{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1249{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1250{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1251{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1252{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1253{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1254{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1255{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1256{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1257{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1258{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1259{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1260{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1261{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1262
1263{ 8704, "forall","for all, U+2200 ISOtech" },
1264{ 8706, "part", "partial differential, U+2202 ISOtech" },
1265{ 8707, "exist","there exists, U+2203 ISOtech" },
1266{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1267{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1268{ 8712, "isin", "element of, U+2208 ISOtech" },
1269{ 8713, "notin","not an element of, U+2209 ISOtech" },
1270{ 8715, "ni", "contains as member, U+220B ISOtech" },
1271{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001272{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001273{ 8722, "minus","minus sign, U+2212 ISOtech" },
1274{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1275{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1276{ 8733, "prop", "proportional to, U+221D ISOtech" },
1277{ 8734, "infin","infinity, U+221E ISOtech" },
1278{ 8736, "ang", "angle, U+2220 ISOamso" },
1279{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1280{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1281{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1282{ 8746, "cup", "union = cup, U+222A ISOtech" },
1283{ 8747, "int", "integral, U+222B ISOtech" },
1284{ 8756, "there4","therefore, U+2234 ISOtech" },
1285{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1286{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1287{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1288{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1289{ 8801, "equiv","identical to, U+2261 ISOtech" },
1290{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1291{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1292{ 8834, "sub", "subset of, U+2282 ISOtech" },
1293{ 8835, "sup", "superset of, U+2283 ISOtech" },
1294{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1295{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1296{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1297{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1298{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1299{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1300{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1301{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1302{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1303{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1304{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1305{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1306{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1307{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1308
1309{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1310{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1311{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1312{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1313
1314};
1315
1316/************************************************************************
1317 * *
1318 * Commodity functions to handle entities *
1319 * *
1320 ************************************************************************/
1321
/*
 * Macro used to grow the current buffer.
 *
 * @buffer must be a plain identifier naming an xmlChar pointer, and a
 * companion variable called <buffer>_size must hold its current size;
 * that size is doubled before the reallocation.
 *
 * If the reallocation fails, the error is reported with perror(), the
 * previous block is released (reallocating straight into @buffer would
 * lose the only pointer to it) and the *enclosing function* returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                                  buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1333
1334/**
1335 * htmlEntityLookup:
1336 * @name: the entity name
1337 *
1338 * Lookup the given entity in EntitiesTable
1339 *
1340 * TODO: the linear scan is really ugly, an hash table is really needed.
1341 *
1342 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1343 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001344const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001345htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001346 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001347
1348 for (i = 0;i < (sizeof(html40EntitiesTable)/
1349 sizeof(html40EntitiesTable[0]));i++) {
1350 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1351#ifdef DEBUG
1352 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1353#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001354 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001355 }
1356 }
1357 return(NULL);
1358}
1359
1360/**
1361 * htmlEntityValueLookup:
1362 * @value: the entity's unicode value
1363 *
1364 * Lookup the given entity in EntitiesTable
1365 *
1366 * TODO: the linear scan is really ugly, an hash table is really needed.
1367 *
1368 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1369 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001370const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001371htmlEntityValueLookup(unsigned int value) {
1372 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001373#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001374 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001375#endif
1376
1377 for (i = 0;i < (sizeof(html40EntitiesTable)/
1378 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001379 if (html40EntitiesTable[i].value >= value) {
1380 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001381 break;
1382#ifdef DEBUG
1383 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1384#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001385 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001386 }
1387#ifdef DEBUG
1388 if (lv > html40EntitiesTable[i].value) {
1389 xmlGenericError(xmlGenericErrorContext,
1390 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1391 lv, html40EntitiesTable[i].value);
1392 }
1393 lv = html40EntitiesTable[i].value;
1394#endif
1395 }
1396 return(NULL);
1397}
1398
1399/**
1400 * UTF8ToHtml:
1401 * @out: a pointer to an array of bytes to store the result
1402 * @outlen: the length of @out
1403 * @in: a pointer to an array of UTF-8 chars
1404 * @inlen: the length of @in
1405 *
1406 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1407 * plus HTML entities block of chars out.
1408 *
1409 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1410 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001411 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001412 * The value of @outlen after return is the number of octets consumed.
1413 */
1414int
1415UTF8ToHtml(unsigned char* out, int *outlen,
1416 const unsigned char* in, int *inlen) {
1417 const unsigned char* processed = in;
1418 const unsigned char* outend;
1419 const unsigned char* outstart = out;
1420 const unsigned char* instart = in;
1421 const unsigned char* inend;
1422 unsigned int c, d;
1423 int trailing;
1424
1425 if (in == NULL) {
1426 /*
1427 * initialization nothing to do
1428 */
1429 *outlen = 0;
1430 *inlen = 0;
1431 return(0);
1432 }
1433 inend = in + (*inlen);
1434 outend = out + (*outlen);
1435 while (in < inend) {
1436 d = *in++;
1437 if (d < 0x80) { c= d; trailing= 0; }
1438 else if (d < 0xC0) {
1439 /* trailing byte in leading position */
1440 *outlen = out - outstart;
1441 *inlen = processed - instart;
1442 return(-2);
1443 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1444 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1445 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1446 else {
1447 /* no chance for this in Ascii */
1448 *outlen = out - outstart;
1449 *inlen = processed - instart;
1450 return(-2);
1451 }
1452
1453 if (inend - in < trailing) {
1454 break;
1455 }
1456
1457 for ( ; trailing; trailing--) {
1458 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1459 break;
1460 c <<= 6;
1461 c |= d & 0x3F;
1462 }
1463
1464 /* assertion: c is a single UTF-4 value */
1465 if (c < 0x80) {
1466 if (out + 1 >= outend)
1467 break;
1468 *out++ = c;
1469 } else {
1470 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001471 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001472
1473 /*
1474 * Try to lookup a predefined HTML entity for it
1475 */
1476
1477 ent = htmlEntityValueLookup(c);
1478 if (ent == NULL) {
1479 /* no chance for this in Ascii */
1480 *outlen = out - outstart;
1481 *inlen = processed - instart;
1482 return(-2);
1483 }
1484 len = strlen(ent->name);
1485 if (out + 2 + len >= outend)
1486 break;
1487 *out++ = '&';
1488 memcpy(out, ent->name, len);
1489 out += len;
1490 *out++ = ';';
1491 }
1492 processed = in;
1493 }
1494 *outlen = out - outstart;
1495 *inlen = processed - instart;
1496 return(0);
1497}
1498
1499/**
1500 * htmlEncodeEntities:
1501 * @out: a pointer to an array of bytes to store the result
1502 * @outlen: the length of @out
1503 * @in: a pointer to an array of UTF-8 chars
1504 * @inlen: the length of @in
1505 * @quoteChar: the quote character to escape (' or ") or zero.
1506 *
1507 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1508 * plus HTML entities block of chars out.
1509 *
1510 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1511 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001512 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001513 * The value of @outlen after return is the number of octets consumed.
1514 */
1515int
1516htmlEncodeEntities(unsigned char* out, int *outlen,
1517 const unsigned char* in, int *inlen, int quoteChar) {
1518 const unsigned char* processed = in;
1519 const unsigned char* outend = out + (*outlen);
1520 const unsigned char* outstart = out;
1521 const unsigned char* instart = in;
1522 const unsigned char* inend = in + (*inlen);
1523 unsigned int c, d;
1524 int trailing;
1525
1526 while (in < inend) {
1527 d = *in++;
1528 if (d < 0x80) { c= d; trailing= 0; }
1529 else if (d < 0xC0) {
1530 /* trailing byte in leading position */
1531 *outlen = out - outstart;
1532 *inlen = processed - instart;
1533 return(-2);
1534 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1535 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1536 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1537 else {
1538 /* no chance for this in Ascii */
1539 *outlen = out - outstart;
1540 *inlen = processed - instart;
1541 return(-2);
1542 }
1543
1544 if (inend - in < trailing)
1545 break;
1546
1547 while (trailing--) {
1548 if (((d= *in++) & 0xC0) != 0x80) {
1549 *outlen = out - outstart;
1550 *inlen = processed - instart;
1551 return(-2);
1552 }
1553 c <<= 6;
1554 c |= d & 0x3F;
1555 }
1556
1557 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001558 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1559 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001560 if (out >= outend)
1561 break;
1562 *out++ = c;
1563 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001564 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001565 const char *cp;
1566 char nbuf[16];
1567 int len;
1568
1569 /*
1570 * Try to lookup a predefined HTML entity for it
1571 */
1572 ent = htmlEntityValueLookup(c);
1573 if (ent == NULL) {
1574 sprintf(nbuf, "#%u", c);
1575 cp = nbuf;
1576 }
1577 else
1578 cp = ent->name;
1579 len = strlen(cp);
1580 if (out + 2 + len > outend)
1581 break;
1582 *out++ = '&';
1583 memcpy(out, cp, len);
1584 out += len;
1585 *out++ = ';';
1586 }
1587 processed = in;
1588 }
1589 *outlen = out - outstart;
1590 *inlen = processed - instart;
1591 return(0);
1592}
1593
1594/**
1595 * htmlDecodeEntities:
1596 * @ctxt: the parser context
1597 * @len: the len to decode (in bytes !), -1 for no size limit
1598 * @end: an end marker xmlChar, 0 if none
1599 * @end2: an end marker xmlChar, 0 if none
1600 * @end3: an end marker xmlChar, 0 if none
1601 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001602 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001603 *
1604 * DEPRECATED !!!!
1605 *
1606 * Returns A newly allocated string with the substitution done. The caller
1607 * must deallocate it !
1608 */
1609xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001610htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1611 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001612 static int deprecated = 0;
1613 if (!deprecated) {
1614 xmlGenericError(xmlGenericErrorContext,
1615 "htmlDecodeEntities() deprecated function reached\n");
1616 deprecated = 1;
1617 }
1618 return(NULL);
1619#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001620 xmlChar *name = NULL;
1621 xmlChar *buffer = NULL;
1622 unsigned int buffer_size = 0;
1623 unsigned int nbchars = 0;
1624 htmlEntityDescPtr ent;
1625 unsigned int max = (unsigned int) len;
1626 int c,l;
1627
1628 if (ctxt->depth > 40) {
1629 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1630 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1631 ctxt->sax->error(ctxt->userData,
1632 "Detected entity reference loop\n");
1633 ctxt->wellFormed = 0;
1634 ctxt->disableSAX = 1;
1635 return(NULL);
1636 }
1637
1638 /*
1639 * allocate a translation buffer.
1640 */
1641 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1642 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1643 if (buffer == NULL) {
1644 perror("xmlDecodeEntities: malloc failed");
1645 return(NULL);
1646 }
1647
1648 /*
1649 * Ok loop until we reach one of the ending char or a size limit.
1650 */
1651 c = CUR_CHAR(l);
1652 while ((nbchars < max) && (c != end) &&
1653 (c != end2) && (c != end3)) {
1654
1655 if (c == 0) break;
1656 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1657 int val = htmlParseCharRef(ctxt);
1658 COPY_BUF(0,buffer,nbchars,val);
1659 NEXTL(l);
1660 } else if ((c == '&') && (ctxt->token != '&')) {
1661 ent = htmlParseEntityRef(ctxt, &name);
1662 if (name != NULL) {
1663 if (ent != NULL) {
1664 int val = ent->value;
1665 COPY_BUF(0,buffer,nbchars,val);
1666 NEXTL(l);
1667 } else {
1668 const xmlChar *cur = name;
1669
1670 buffer[nbchars++] = '&';
1671 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1672 growBuffer(buffer);
1673 }
1674 while (*cur != 0) {
1675 buffer[nbchars++] = *cur++;
1676 }
1677 buffer[nbchars++] = ';';
1678 }
1679 }
1680 } else {
1681 COPY_BUF(l,buffer,nbchars,c);
1682 NEXTL(l);
1683 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1684 growBuffer(buffer);
1685 }
1686 }
1687 c = CUR_CHAR(l);
1688 }
1689 buffer[nbchars++] = 0;
1690 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001691#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001692}
1693
1694/************************************************************************
1695 * *
1696 * Commodity functions to handle streams *
1697 * *
1698 ************************************************************************/
1699
1700/**
Owen Taylor3473f882001-02-23 17:55:21 +00001701 * htmlNewInputStream:
1702 * @ctxt: an HTML parser context
1703 *
1704 * Create a new input stream structure
1705 * Returns the new input stream or NULL
1706 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001707static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001708htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1709 htmlParserInputPtr input;
1710
1711 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1712 if (input == NULL) {
1713 ctxt->errNo = XML_ERR_NO_MEMORY;
1714 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1715 ctxt->sax->error(ctxt->userData,
1716 "malloc: couldn't allocate a new input stream\n");
1717 return(NULL);
1718 }
1719 memset(input, 0, sizeof(htmlParserInput));
1720 input->filename = NULL;
1721 input->directory = NULL;
1722 input->base = NULL;
1723 input->cur = NULL;
1724 input->buf = NULL;
1725 input->line = 1;
1726 input->col = 1;
1727 input->buf = NULL;
1728 input->free = NULL;
1729 input->version = NULL;
1730 input->consumed = 0;
1731 input->length = 0;
1732 return(input);
1733}
1734
1735
1736/************************************************************************
1737 * *
1738 * Commodity functions, cleanup needed ? *
1739 * *
1740 ************************************************************************/
1741
1742/**
1743 * areBlanks:
1744 * @ctxt: an HTML parser context
1745 * @str: a xmlChar *
1746 * @len: the size of @str
1747 *
1748 * Is this a sequence of blank chars that one can ignore ?
1749 *
1750 * Returns 1 if ignorable 0 otherwise.
1751 */
1752
1753static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1754 int i;
1755 xmlNodePtr lastChild;
1756
1757 for (i = 0;i < len;i++)
1758 if (!(IS_BLANK(str[i]))) return(0);
1759
1760 if (CUR == 0) return(1);
1761 if (CUR != '<') return(0);
1762 if (ctxt->name == NULL)
1763 return(1);
1764 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1765 return(1);
1766 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1767 return(1);
1768 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1769 return(1);
1770 if (ctxt->node == NULL) return(0);
1771 lastChild = xmlGetLastChild(ctxt->node);
1772 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001773 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1774 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001775 } else if (xmlNodeIsText(lastChild)) {
1776 return(0);
1777 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1778 return(0);
1779 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1780 return(0);
1781 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1782 return(0);
1783 }
1784 return(1);
1785}
1786
1787/**
Owen Taylor3473f882001-02-23 17:55:21 +00001788 * htmlNewDocNoDtD:
1789 * @URI: URI for the dtd, or NULL
1790 * @ExternalID: the external ID of the DTD, or NULL
1791 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001792 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1793 * are NULL
1794 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001795 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001796 */
1797htmlDocPtr
1798htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1799 xmlDocPtr cur;
1800
1801 /*
1802 * Allocate a new document and fill the fields.
1803 */
1804 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1805 if (cur == NULL) {
1806 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001807 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001808 return(NULL);
1809 }
1810 memset(cur, 0, sizeof(xmlDoc));
1811
1812 cur->type = XML_HTML_DOCUMENT_NODE;
1813 cur->version = NULL;
1814 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001815 cur->doc = cur;
1816 cur->name = NULL;
1817 cur->children = NULL;
1818 cur->extSubset = NULL;
1819 cur->oldNs = NULL;
1820 cur->encoding = NULL;
1821 cur->standalone = 1;
1822 cur->compression = 0;
1823 cur->ids = NULL;
1824 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001825 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001826 if ((ExternalID != NULL) ||
1827 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001828 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001829 return(cur);
1830}
1831
1832/**
1833 * htmlNewDoc:
1834 * @URI: URI for the dtd, or NULL
1835 * @ExternalID: the external ID of the DTD, or NULL
1836 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001837 * Creates a new HTML document
1838 *
Owen Taylor3473f882001-02-23 17:55:21 +00001839 * Returns a new document
1840 */
1841htmlDocPtr
1842htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1843 if ((URI == NULL) && (ExternalID == NULL))
1844 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001845 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1846 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001847
1848 return(htmlNewDocNoDtD(URI, ExternalID));
1849}
1850
1851
1852/************************************************************************
1853 * *
1854 * The parser itself *
1855 * Relates to http://www.w3.org/TR/html40 *
1856 * *
1857 ************************************************************************/
1858
1859/************************************************************************
1860 * *
1861 * The parser itself *
1862 * *
1863 ************************************************************************/
1864
1865/**
1866 * htmlParseHTMLName:
1867 * @ctxt: an HTML parser context
1868 *
1869 * parse an HTML tag or attribute name, note that we convert it to lowercase
1870 * since HTML names are not case-sensitive.
1871 *
1872 * Returns the Tag Name parsed or NULL
1873 */
1874
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001875static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001876htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1877 xmlChar *ret = NULL;
1878 int i = 0;
1879 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1880
1881 if (!IS_LETTER(CUR) && (CUR != '_') &&
1882 (CUR != ':')) return(NULL);
1883
1884 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1885 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1886 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1887 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1888 else loc[i] = CUR;
1889 i++;
1890
1891 NEXT;
1892 }
1893
1894 ret = xmlStrndup(loc, i);
1895
1896 return(ret);
1897}
1898
1899/**
1900 * htmlParseName:
1901 * @ctxt: an HTML parser context
1902 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001903 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001904 *
1905 * Returns the Name parsed or NULL
1906 */
1907
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001908static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001909htmlParseName(htmlParserCtxtPtr ctxt) {
1910 xmlChar buf[HTML_MAX_NAMELEN];
1911 int len = 0;
1912
1913 GROW;
1914 if (!IS_LETTER(CUR) && (CUR != '_')) {
1915 return(NULL);
1916 }
1917
1918 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1919 (CUR == '.') || (CUR == '-') ||
1920 (CUR == '_') || (CUR == ':') ||
1921 (IS_COMBINING(CUR)) ||
1922 (IS_EXTENDER(CUR))) {
1923 buf[len++] = CUR;
1924 NEXT;
1925 if (len >= HTML_MAX_NAMELEN) {
1926 xmlGenericError(xmlGenericErrorContext,
1927 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1928 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1929 (CUR == '.') || (CUR == '-') ||
1930 (CUR == '_') || (CUR == ':') ||
1931 (IS_COMBINING(CUR)) ||
1932 (IS_EXTENDER(CUR)))
1933 NEXT;
1934 break;
1935 }
1936 }
1937 return(xmlStrndup(buf, len));
1938}
1939
1940/**
1941 * htmlParseHTMLAttribute:
1942 * @ctxt: an HTML parser context
1943 * @stop: a char stop value
1944 *
1945 * parse an HTML attribute value till the stop (quote), if
1946 * stop is 0 then it stops at the first space
1947 *
1948 * Returns the attribute parsed or NULL
1949 */
1950
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001951static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001952htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1953 xmlChar *buffer = NULL;
1954 int buffer_size = 0;
1955 xmlChar *out = NULL;
1956 xmlChar *name = NULL;
1957
1958 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001959 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001960
1961 /*
1962 * allocate a translation buffer.
1963 */
1964 buffer_size = HTML_PARSER_BUFFER_SIZE;
1965 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1966 if (buffer == NULL) {
1967 perror("htmlParseHTMLAttribute: malloc failed");
1968 return(NULL);
1969 }
1970 out = buffer;
1971
1972 /*
1973 * Ok loop until we reach one of the ending chars
1974 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00001975 while ((CUR != 0) && (CUR != stop)) {
1976 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001977 if ((stop == 0) && (IS_BLANK(CUR))) break;
1978 if (CUR == '&') {
1979 if (NXT(1) == '#') {
1980 unsigned int c;
1981 int bits;
1982
1983 c = htmlParseCharRef(ctxt);
1984 if (c < 0x80)
1985 { *out++ = c; bits= -6; }
1986 else if (c < 0x800)
1987 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1988 else if (c < 0x10000)
1989 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1990 else
1991 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1992
1993 for ( ; bits >= 0; bits-= 6) {
1994 *out++ = ((c >> bits) & 0x3F) | 0x80;
1995 }
1996 } else {
1997 ent = htmlParseEntityRef(ctxt, &name);
1998 if (name == NULL) {
1999 *out++ = '&';
2000 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002001 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002002
2003 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002004 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002005 }
2006 } else if (ent == NULL) {
2007 *out++ = '&';
2008 cur = name;
2009 while (*cur != 0) {
2010 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002011 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002012
2013 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002014 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002015 }
2016 *out++ = *cur++;
2017 }
2018 xmlFree(name);
2019 } else {
2020 unsigned int c;
2021 int bits;
2022
2023 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002024 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002025
2026 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002027 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002028 }
2029 c = (xmlChar)ent->value;
2030 if (c < 0x80)
2031 { *out++ = c; bits= -6; }
2032 else if (c < 0x800)
2033 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2034 else if (c < 0x10000)
2035 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2036 else
2037 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2038
2039 for ( ; bits >= 0; bits-= 6) {
2040 *out++ = ((c >> bits) & 0x3F) | 0x80;
2041 }
2042 xmlFree(name);
2043 }
2044 }
2045 } else {
2046 unsigned int c;
2047 int bits, l;
2048
2049 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002050 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002051
2052 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002053 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002054 }
2055 c = CUR_CHAR(l);
2056 if (c < 0x80)
2057 { *out++ = c; bits= -6; }
2058 else if (c < 0x800)
2059 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2060 else if (c < 0x10000)
2061 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2062 else
2063 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2064
2065 for ( ; bits >= 0; bits-= 6) {
2066 *out++ = ((c >> bits) & 0x3F) | 0x80;
2067 }
2068 NEXT;
2069 }
2070 }
2071 *out++ = 0;
2072 return(buffer);
2073}
2074
2075/**
Owen Taylor3473f882001-02-23 17:55:21 +00002076 * htmlParseEntityRef:
2077 * @ctxt: an HTML parser context
2078 * @str: location to store the entity name
2079 *
2080 * parse an HTML ENTITY references
2081 *
2082 * [68] EntityRef ::= '&' Name ';'
2083 *
2084 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2085 * if non-NULL *str will have to be freed by the caller.
2086 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002087const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002088htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2089 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002090 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002091 *str = NULL;
2092
2093 if (CUR == '&') {
2094 NEXT;
2095 name = htmlParseName(ctxt);
2096 if (name == NULL) {
2097 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2098 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2099 ctxt->wellFormed = 0;
2100 } else {
2101 GROW;
2102 if (CUR == ';') {
2103 *str = name;
2104
2105 /*
2106 * Lookup the entity in the table.
2107 */
2108 ent = htmlEntityLookup(name);
2109 if (ent != NULL) /* OK that's ugly !!! */
2110 NEXT;
2111 } else {
2112 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2113 ctxt->sax->error(ctxt->userData,
2114 "htmlParseEntityRef: expecting ';'\n");
2115 *str = name;
2116 }
2117 }
2118 }
2119 return(ent);
2120}
2121
2122/**
2123 * htmlParseAttValue:
2124 * @ctxt: an HTML parser context
2125 *
2126 * parse a value for an attribute
2127 * Note: the parser won't do substitution of entities here, this
2128 * will be handled later in xmlStringGetNodeList, unless it was
2129 * asked for ctxt->replaceEntities != 0
2130 *
2131 * Returns the AttValue parsed or NULL.
2132 */
2133
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002134static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002135htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2136 xmlChar *ret = NULL;
2137
2138 if (CUR == '"') {
2139 NEXT;
2140 ret = htmlParseHTMLAttribute(ctxt, '"');
2141 if (CUR != '"') {
2142 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2143 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2144 ctxt->wellFormed = 0;
2145 } else
2146 NEXT;
2147 } else if (CUR == '\'') {
2148 NEXT;
2149 ret = htmlParseHTMLAttribute(ctxt, '\'');
2150 if (CUR != '\'') {
2151 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2152 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2153 ctxt->wellFormed = 0;
2154 } else
2155 NEXT;
2156 } else {
2157 /*
2158 * That's an HTMLism, the attribute value may not be quoted
2159 */
2160 ret = htmlParseHTMLAttribute(ctxt, 0);
2161 if (ret == NULL) {
2162 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2163 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2164 ctxt->wellFormed = 0;
2165 }
2166 }
2167 return(ret);
2168}
2169
2170/**
2171 * htmlParseSystemLiteral:
2172 * @ctxt: an HTML parser context
2173 *
2174 * parse an HTML Literal
2175 *
2176 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2177 *
2178 * Returns the SystemLiteral parsed or NULL
2179 */
2180
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002181static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002182htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2183 const xmlChar *q;
2184 xmlChar *ret = NULL;
2185
2186 if (CUR == '"') {
2187 NEXT;
2188 q = CUR_PTR;
2189 while ((IS_CHAR(CUR)) && (CUR != '"'))
2190 NEXT;
2191 if (!IS_CHAR(CUR)) {
2192 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2193 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2194 ctxt->wellFormed = 0;
2195 } else {
2196 ret = xmlStrndup(q, CUR_PTR - q);
2197 NEXT;
2198 }
2199 } else if (CUR == '\'') {
2200 NEXT;
2201 q = CUR_PTR;
2202 while ((IS_CHAR(CUR)) && (CUR != '\''))
2203 NEXT;
2204 if (!IS_CHAR(CUR)) {
2205 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2206 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2207 ctxt->wellFormed = 0;
2208 } else {
2209 ret = xmlStrndup(q, CUR_PTR - q);
2210 NEXT;
2211 }
2212 } else {
2213 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2214 ctxt->sax->error(ctxt->userData,
2215 "SystemLiteral \" or ' expected\n");
2216 ctxt->wellFormed = 0;
2217 }
2218
2219 return(ret);
2220}
2221
2222/**
2223 * htmlParsePubidLiteral:
2224 * @ctxt: an HTML parser context
2225 *
2226 * parse an HTML public literal
2227 *
2228 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2229 *
2230 * Returns the PubidLiteral parsed or NULL.
2231 */
2232
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002233static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002234htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2235 const xmlChar *q;
2236 xmlChar *ret = NULL;
2237 /*
2238 * Name ::= (Letter | '_') (NameChar)*
2239 */
2240 if (CUR == '"') {
2241 NEXT;
2242 q = CUR_PTR;
2243 while (IS_PUBIDCHAR(CUR)) NEXT;
2244 if (CUR != '"') {
2245 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2246 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2247 ctxt->wellFormed = 0;
2248 } else {
2249 ret = xmlStrndup(q, CUR_PTR - q);
2250 NEXT;
2251 }
2252 } else if (CUR == '\'') {
2253 NEXT;
2254 q = CUR_PTR;
2255 while ((IS_LETTER(CUR)) && (CUR != '\''))
2256 NEXT;
2257 if (!IS_LETTER(CUR)) {
2258 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2259 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2260 ctxt->wellFormed = 0;
2261 } else {
2262 ret = xmlStrndup(q, CUR_PTR - q);
2263 NEXT;
2264 }
2265 } else {
2266 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2267 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2268 ctxt->wellFormed = 0;
2269 }
2270
2271 return(ret);
2272}
2273
2274/**
2275 * htmlParseScript:
2276 * @ctxt: an HTML parser context
2277 *
2278 * parse the content of an HTML SCRIPT or STYLE element
2279 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2280 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2281 * http://www.w3.org/TR/html4/types.html#type-script
2282 * http://www.w3.org/TR/html4/types.html#h-6.15
2283 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2284 *
2285 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2286 * element and the value of intrinsic event attributes. User agents must
2287 * not evaluate script data as HTML markup but instead must pass it on as
2288 * data to a script engine.
2289 * NOTES:
2290 * - The content is passed like CDATA
2291 * - the attributes for style and scripting "onXXX" are also described
2292 * as CDATA but SGML allows entities references in attributes so their
2293 * processing is identical as other attributes
2294 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002295static void
Owen Taylor3473f882001-02-23 17:55:21 +00002296htmlParseScript(htmlParserCtxtPtr ctxt) {
2297 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2298 int nbchar = 0;
2299 xmlChar cur;
2300
2301 SHRINK;
2302 cur = CUR;
2303 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002304 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2305 (NXT(3) == '-')) {
2306 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2307 if (ctxt->sax->cdataBlock!= NULL) {
2308 /*
2309 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2310 */
2311 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2312 }
2313 }
2314 nbchar = 0;
2315 htmlParseComment(ctxt);
2316 cur = CUR;
2317 continue;
2318 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002319 /*
2320 * One should break here, the specification is clear:
2321 * Authors should therefore escape "</" within the content.
2322 * Escape mechanisms are specific to each scripting or
2323 * style sheet language.
2324 */
2325 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2326 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2327 break; /* while */
2328 }
2329 buf[nbchar++] = cur;
2330 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2331 if (ctxt->sax->cdataBlock!= NULL) {
2332 /*
2333 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2334 */
2335 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2336 }
2337 nbchar = 0;
2338 }
2339 NEXT;
2340 cur = CUR;
2341 }
2342 if (!(IS_CHAR(cur))) {
2343 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2344 ctxt->sax->error(ctxt->userData,
2345 "Invalid char in CDATA 0x%X\n", cur);
2346 ctxt->wellFormed = 0;
2347 NEXT;
2348 }
2349
2350 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2351 if (ctxt->sax->cdataBlock!= NULL) {
2352 /*
2353 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2354 */
2355 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2356 }
2357 }
2358}
2359
2360
2361/**
2362 * htmlParseCharData:
2363 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002364 *
2365 * parse a CharData section.
2366 * if we are within a CDATA section ']]>' marks an end of section.
2367 *
2368 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2369 */
2370
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002371static void
2372htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002373 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2374 int nbchar = 0;
2375 int cur, l;
2376
2377 SHRINK;
2378 cur = CUR_CHAR(l);
2379 while (((cur != '<') || (ctxt->token == '<')) &&
2380 ((cur != '&') || (ctxt->token == '&')) &&
2381 (IS_CHAR(cur))) {
2382 COPY_BUF(l,buf,nbchar,cur);
2383 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2384 /*
2385 * Ok the segment is to be consumed as chars.
2386 */
2387 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2388 if (areBlanks(ctxt, buf, nbchar)) {
2389 if (ctxt->sax->ignorableWhitespace != NULL)
2390 ctxt->sax->ignorableWhitespace(ctxt->userData,
2391 buf, nbchar);
2392 } else {
2393 htmlCheckParagraph(ctxt);
2394 if (ctxt->sax->characters != NULL)
2395 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2396 }
2397 }
2398 nbchar = 0;
2399 }
2400 NEXTL(l);
2401 cur = CUR_CHAR(l);
2402 }
2403 if (nbchar != 0) {
2404 /*
2405 * Ok the segment is to be consumed as chars.
2406 */
2407 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2408 if (areBlanks(ctxt, buf, nbchar)) {
2409 if (ctxt->sax->ignorableWhitespace != NULL)
2410 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2411 } else {
2412 htmlCheckParagraph(ctxt);
2413 if (ctxt->sax->characters != NULL)
2414 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2415 }
2416 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002417 } else {
2418 /*
2419 * Loop detection
2420 */
2421 if (cur == 0)
2422 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002423 }
2424}
2425
2426/**
2427 * htmlParseExternalID:
2428 * @ctxt: an HTML parser context
2429 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002430 *
2431 * Parse an External ID or a Public ID
2432 *
Owen Taylor3473f882001-02-23 17:55:21 +00002433 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2434 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2435 *
2436 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2437 *
2438 * Returns the function returns SystemLiteral and in the second
2439 * case publicID receives PubidLiteral, is strict is off
2440 * it is possible to return NULL and have publicID set.
2441 */
2442
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002443static xmlChar *
2444htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002445 xmlChar *URI = NULL;
2446
2447 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2448 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2449 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2450 SKIP(6);
2451 if (!IS_BLANK(CUR)) {
2452 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2453 ctxt->sax->error(ctxt->userData,
2454 "Space required after 'SYSTEM'\n");
2455 ctxt->wellFormed = 0;
2456 }
2457 SKIP_BLANKS;
2458 URI = htmlParseSystemLiteral(ctxt);
2459 if (URI == NULL) {
2460 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2461 ctxt->sax->error(ctxt->userData,
2462 "htmlParseExternalID: SYSTEM, no URI\n");
2463 ctxt->wellFormed = 0;
2464 }
2465 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2466 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2467 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2468 SKIP(6);
2469 if (!IS_BLANK(CUR)) {
2470 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2471 ctxt->sax->error(ctxt->userData,
2472 "Space required after 'PUBLIC'\n");
2473 ctxt->wellFormed = 0;
2474 }
2475 SKIP_BLANKS;
2476 *publicID = htmlParsePubidLiteral(ctxt);
2477 if (*publicID == NULL) {
2478 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2479 ctxt->sax->error(ctxt->userData,
2480 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2481 ctxt->wellFormed = 0;
2482 }
2483 SKIP_BLANKS;
2484 if ((CUR == '"') || (CUR == '\'')) {
2485 URI = htmlParseSystemLiteral(ctxt);
2486 }
2487 }
2488 return(URI);
2489}
2490
2491/**
2492 * htmlParseComment:
2493 * @ctxt: an HTML parser context
2494 *
2495 * Parse an XML (SGML) comment <!-- .... -->
2496 *
2497 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2498 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002499static void
Owen Taylor3473f882001-02-23 17:55:21 +00002500htmlParseComment(htmlParserCtxtPtr ctxt) {
2501 xmlChar *buf = NULL;
2502 int len;
2503 int size = HTML_PARSER_BUFFER_SIZE;
2504 int q, ql;
2505 int r, rl;
2506 int cur, l;
2507 xmlParserInputState state;
2508
2509 /*
2510 * Check that there is a comment right here.
2511 */
2512 if ((RAW != '<') || (NXT(1) != '!') ||
2513 (NXT(2) != '-') || (NXT(3) != '-')) return;
2514
2515 state = ctxt->instate;
2516 ctxt->instate = XML_PARSER_COMMENT;
2517 SHRINK;
2518 SKIP(4);
2519 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2520 if (buf == NULL) {
2521 xmlGenericError(xmlGenericErrorContext,
2522 "malloc of %d byte failed\n", size);
2523 ctxt->instate = state;
2524 return;
2525 }
2526 q = CUR_CHAR(ql);
2527 NEXTL(ql);
2528 r = CUR_CHAR(rl);
2529 NEXTL(rl);
2530 cur = CUR_CHAR(l);
2531 len = 0;
2532 while (IS_CHAR(cur) &&
2533 ((cur != '>') ||
2534 (r != '-') || (q != '-'))) {
2535 if (len + 5 >= size) {
2536 size *= 2;
2537 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2538 if (buf == NULL) {
2539 xmlGenericError(xmlGenericErrorContext,
2540 "realloc of %d byte failed\n", size);
2541 ctxt->instate = state;
2542 return;
2543 }
2544 }
2545 COPY_BUF(ql,buf,len,q);
2546 q = r;
2547 ql = rl;
2548 r = cur;
2549 rl = l;
2550 NEXTL(l);
2551 cur = CUR_CHAR(l);
2552 if (cur == 0) {
2553 SHRINK;
2554 GROW;
2555 cur = CUR_CHAR(l);
2556 }
2557 }
2558 buf[len] = 0;
2559 if (!IS_CHAR(cur)) {
2560 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2561 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2562 ctxt->sax->error(ctxt->userData,
2563 "Comment not terminated \n<!--%.50s\n", buf);
2564 ctxt->wellFormed = 0;
2565 xmlFree(buf);
2566 } else {
2567 NEXT;
2568 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2569 (!ctxt->disableSAX))
2570 ctxt->sax->comment(ctxt->userData, buf);
2571 xmlFree(buf);
2572 }
2573 ctxt->instate = state;
2574}
2575
2576/**
2577 * htmlParseCharRef:
2578 * @ctxt: an HTML parser context
2579 *
2580 * parse Reference declarations
2581 *
2582 * [66] CharRef ::= '&#' [0-9]+ ';' |
2583 * '&#x' [0-9a-fA-F]+ ';'
2584 *
2585 * Returns the value parsed (as an int)
2586 */
2587int
2588htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2589 int val = 0;
2590
2591 if ((CUR == '&') && (NXT(1) == '#') &&
2592 (NXT(2) == 'x')) {
2593 SKIP(3);
2594 while (CUR != ';') {
2595 if ((CUR >= '0') && (CUR <= '9'))
2596 val = val * 16 + (CUR - '0');
2597 else if ((CUR >= 'a') && (CUR <= 'f'))
2598 val = val * 16 + (CUR - 'a') + 10;
2599 else if ((CUR >= 'A') && (CUR <= 'F'))
2600 val = val * 16 + (CUR - 'A') + 10;
2601 else {
2602 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2603 ctxt->sax->error(ctxt->userData,
2604 "htmlParseCharRef: invalid hexadecimal value\n");
2605 ctxt->wellFormed = 0;
2606 return(0);
2607 }
2608 NEXT;
2609 }
2610 if (CUR == ';')
2611 NEXT;
2612 } else if ((CUR == '&') && (NXT(1) == '#')) {
2613 SKIP(2);
2614 while (CUR != ';') {
2615 if ((CUR >= '0') && (CUR <= '9'))
2616 val = val * 10 + (CUR - '0');
2617 else {
2618 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2619 ctxt->sax->error(ctxt->userData,
2620 "htmlParseCharRef: invalid decimal value\n");
2621 ctxt->wellFormed = 0;
2622 return(0);
2623 }
2624 NEXT;
2625 }
2626 if (CUR == ';')
2627 NEXT;
2628 } else {
2629 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2630 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2631 ctxt->wellFormed = 0;
2632 }
2633 /*
2634 * Check the value IS_CHAR ...
2635 */
2636 if (IS_CHAR(val)) {
2637 return(val);
2638 } else {
2639 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2640 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2641 val);
2642 ctxt->wellFormed = 0;
2643 }
2644 return(0);
2645}
2646
2647
2648/**
2649 * htmlParseDocTypeDecl :
2650 * @ctxt: an HTML parser context
2651 *
2652 * parse a DOCTYPE declaration
2653 *
2654 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2655 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2656 */
2657
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002658static void
Owen Taylor3473f882001-02-23 17:55:21 +00002659htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2660 xmlChar *name;
2661 xmlChar *ExternalID = NULL;
2662 xmlChar *URI = NULL;
2663
2664 /*
2665 * We know that '<!DOCTYPE' has been detected.
2666 */
2667 SKIP(9);
2668
2669 SKIP_BLANKS;
2670
2671 /*
2672 * Parse the DOCTYPE name.
2673 */
2674 name = htmlParseName(ctxt);
2675 if (name == NULL) {
2676 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2677 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2678 ctxt->wellFormed = 0;
2679 }
2680 /*
2681 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2682 */
2683
2684 SKIP_BLANKS;
2685
2686 /*
2687 * Check for SystemID and ExternalID
2688 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002689 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002690 SKIP_BLANKS;
2691
2692 /*
2693 * We should be at the end of the DOCTYPE declaration.
2694 */
2695 if (CUR != '>') {
2696 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002697 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002698 ctxt->wellFormed = 0;
2699 /* We shouldn't try to resynchronize ... */
2700 }
2701 NEXT;
2702
2703 /*
2704 * Create or update the document accordingly to the DOCTYPE
2705 */
2706 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2707 (!ctxt->disableSAX))
2708 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2709
2710 /*
2711 * Cleanup, since we don't use all those identifiers
2712 */
2713 if (URI != NULL) xmlFree(URI);
2714 if (ExternalID != NULL) xmlFree(ExternalID);
2715 if (name != NULL) xmlFree(name);
2716}
2717
2718/**
2719 * htmlParseAttribute:
2720 * @ctxt: an HTML parser context
2721 * @value: a xmlChar ** used to store the value of the attribute
2722 *
2723 * parse an attribute
2724 *
2725 * [41] Attribute ::= Name Eq AttValue
2726 *
2727 * [25] Eq ::= S? '=' S?
2728 *
2729 * With namespace:
2730 *
2731 * [NS 11] Attribute ::= QName Eq AttValue
2732 *
2733 * Also the case QName == xmlns:??? is handled independently as a namespace
2734 * definition.
2735 *
2736 * Returns the attribute name, and the value in *value.
2737 */
2738
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002739static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002740htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2741 xmlChar *name, *val = NULL;
2742
2743 *value = NULL;
2744 name = htmlParseHTMLName(ctxt);
2745 if (name == NULL) {
2746 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2747 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2748 ctxt->wellFormed = 0;
2749 return(NULL);
2750 }
2751
2752 /*
2753 * read the value
2754 */
2755 SKIP_BLANKS;
2756 if (CUR == '=') {
2757 NEXT;
2758 SKIP_BLANKS;
2759 val = htmlParseAttValue(ctxt);
2760 /******
2761 } else {
2762 * TODO : some attribute must have values, some may not
2763 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2764 ctxt->sax->warning(ctxt->userData,
2765 "No value for attribute %s\n", name); */
2766 }
2767
2768 *value = val;
2769 return(name);
2770}
2771
2772/**
2773 * htmlCheckEncoding:
2774 * @ctxt: an HTML parser context
2775 * @attvalue: the attribute value
2776 *
2777 * Checks an http-equiv attribute from a Meta tag to detect
2778 * the encoding
2779 * If a new encoding is detected the parser is switched to decode
2780 * it and pass UTF8
2781 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002782static void
Owen Taylor3473f882001-02-23 17:55:21 +00002783htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2784 const xmlChar *encoding;
2785
2786 if ((ctxt == NULL) || (attvalue == NULL))
2787 return;
2788
2789 /* do not change encoding */
2790 if (ctxt->input->encoding != NULL)
2791 return;
2792
2793 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2794 if (encoding != NULL) {
2795 encoding += 8;
2796 } else {
2797 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2798 if (encoding != NULL)
2799 encoding += 9;
2800 }
2801 if (encoding != NULL) {
2802 xmlCharEncoding enc;
2803 xmlCharEncodingHandlerPtr handler;
2804
2805 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2806
2807 if (ctxt->input->encoding != NULL)
2808 xmlFree((xmlChar *) ctxt->input->encoding);
2809 ctxt->input->encoding = xmlStrdup(encoding);
2810
2811 enc = xmlParseCharEncoding((const char *) encoding);
2812 /*
2813 * registered set of known encodings
2814 */
2815 if (enc != XML_CHAR_ENCODING_ERROR) {
2816 xmlSwitchEncoding(ctxt, enc);
2817 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2818 } else {
2819 /*
2820 * fallback for unknown encodings
2821 */
2822 handler = xmlFindCharEncodingHandler((const char *) encoding);
2823 if (handler != NULL) {
2824 xmlSwitchToEncoding(ctxt, handler);
2825 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2826 } else {
2827 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2828 }
2829 }
2830
2831 if ((ctxt->input->buf != NULL) &&
2832 (ctxt->input->buf->encoder != NULL) &&
2833 (ctxt->input->buf->raw != NULL) &&
2834 (ctxt->input->buf->buffer != NULL)) {
2835 int nbchars;
2836 int processed;
2837
2838 /*
2839 * convert as much as possible to the parser reading buffer.
2840 */
2841 processed = ctxt->input->cur - ctxt->input->base;
2842 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2843 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2844 ctxt->input->buf->buffer,
2845 ctxt->input->buf->raw);
2846 if (nbchars < 0) {
2847 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2848 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2849 ctxt->sax->error(ctxt->userData,
2850 "htmlCheckEncoding: encoder error\n");
2851 }
2852 ctxt->input->base =
2853 ctxt->input->cur = ctxt->input->buf->buffer->content;
2854 }
2855 }
2856}
2857
2858/**
2859 * htmlCheckMeta:
2860 * @ctxt: an HTML parser context
2861 * @atts: the attributes values
2862 *
2863 * Checks an attributes from a Meta tag
2864 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002865static void
Owen Taylor3473f882001-02-23 17:55:21 +00002866htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2867 int i;
2868 const xmlChar *att, *value;
2869 int http = 0;
2870 const xmlChar *content = NULL;
2871
2872 if ((ctxt == NULL) || (atts == NULL))
2873 return;
2874
2875 i = 0;
2876 att = atts[i++];
2877 while (att != NULL) {
2878 value = atts[i++];
2879 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2880 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2881 http = 1;
2882 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2883 content = value;
2884 att = atts[i++];
2885 }
2886 if ((http) && (content != NULL))
2887 htmlCheckEncoding(ctxt, content);
2888
2889}
2890
2891/**
2892 * htmlParseStartTag:
2893 * @ctxt: an HTML parser context
2894 *
2895 * parse a start of tag either for rule element or
2896 * EmptyElement. In both case we don't parse the tag closing chars.
2897 *
2898 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2899 *
2900 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2901 *
2902 * With namespace:
2903 *
2904 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2905 *
2906 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2907 *
2908 */
2909
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002910static void
Owen Taylor3473f882001-02-23 17:55:21 +00002911htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2912 xmlChar *name;
2913 xmlChar *attname;
2914 xmlChar *attvalue;
2915 const xmlChar **atts = NULL;
2916 int nbatts = 0;
2917 int maxatts = 0;
2918 int meta = 0;
2919 int i;
2920
2921 if (CUR != '<') return;
2922 NEXT;
2923
2924 GROW;
2925 name = htmlParseHTMLName(ctxt);
2926 if (name == NULL) {
2927 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2928 ctxt->sax->error(ctxt->userData,
2929 "htmlParseStartTag: invalid element name\n");
2930 ctxt->wellFormed = 0;
2931 /* Dump the bogus tag like browsers do */
2932 while ((IS_CHAR(CUR)) && (CUR != '>'))
2933 NEXT;
2934 return;
2935 }
2936 if (xmlStrEqual(name, BAD_CAST"meta"))
2937 meta = 1;
2938
2939 /*
2940 * Check for auto-closure of HTML elements.
2941 */
2942 htmlAutoClose(ctxt, name);
2943
2944 /*
2945 * Check for implied HTML elements.
2946 */
2947 htmlCheckImplied(ctxt, name);
2948
2949 /*
2950 * Avoid html at any level > 0, head at any level != 1
2951 * or any attempt to recurse body
2952 */
2953 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2954 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2955 ctxt->sax->error(ctxt->userData,
2956 "htmlParseStartTag: misplaced <html> tag\n");
2957 ctxt->wellFormed = 0;
2958 xmlFree(name);
2959 return;
2960 }
2961 if ((ctxt->nameNr != 1) &&
2962 (xmlStrEqual(name, BAD_CAST"head"))) {
2963 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2964 ctxt->sax->error(ctxt->userData,
2965 "htmlParseStartTag: misplaced <head> tag\n");
2966 ctxt->wellFormed = 0;
2967 xmlFree(name);
2968 return;
2969 }
2970 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002971 int indx;
2972 for (indx = 0;indx < ctxt->nameNr;indx++) {
2973 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002974 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2975 ctxt->sax->error(ctxt->userData,
2976 "htmlParseStartTag: misplaced <body> tag\n");
2977 ctxt->wellFormed = 0;
2978 xmlFree(name);
2979 return;
2980 }
2981 }
2982 }
2983
2984 /*
2985 * Now parse the attributes, it ends up with the ending
2986 *
2987 * (S Attribute)* S?
2988 */
2989 SKIP_BLANKS;
2990 while ((IS_CHAR(CUR)) &&
2991 (CUR != '>') &&
2992 ((CUR != '/') || (NXT(1) != '>'))) {
2993 long cons = ctxt->nbChars;
2994
2995 GROW;
2996 attname = htmlParseAttribute(ctxt, &attvalue);
2997 if (attname != NULL) {
2998
2999 /*
3000 * Well formedness requires at most one declaration of an attribute
3001 */
3002 for (i = 0; i < nbatts;i += 2) {
3003 if (xmlStrEqual(atts[i], attname)) {
3004 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3005 ctxt->sax->error(ctxt->userData,
3006 "Attribute %s redefined\n",
3007 attname);
3008 ctxt->wellFormed = 0;
3009 xmlFree(attname);
3010 if (attvalue != NULL)
3011 xmlFree(attvalue);
3012 goto failed;
3013 }
3014 }
3015
3016 /*
3017 * Add the pair to atts
3018 */
3019 if (atts == NULL) {
3020 maxatts = 10;
3021 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3022 if (atts == NULL) {
3023 xmlGenericError(xmlGenericErrorContext,
3024 "malloc of %ld byte failed\n",
3025 maxatts * (long)sizeof(xmlChar *));
3026 if (name != NULL) xmlFree(name);
3027 return;
3028 }
3029 } else if (nbatts + 4 > maxatts) {
3030 maxatts *= 2;
3031 atts = (const xmlChar **) xmlRealloc((void *) atts,
3032 maxatts * sizeof(xmlChar *));
3033 if (atts == NULL) {
3034 xmlGenericError(xmlGenericErrorContext,
3035 "realloc of %ld byte failed\n",
3036 maxatts * (long)sizeof(xmlChar *));
3037 if (name != NULL) xmlFree(name);
3038 return;
3039 }
3040 }
3041 atts[nbatts++] = attname;
3042 atts[nbatts++] = attvalue;
3043 atts[nbatts] = NULL;
3044 atts[nbatts + 1] = NULL;
3045 }
3046 else {
3047 /* Dump the bogus attribute string up to the next blank or
3048 * the end of the tag. */
3049 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3050 && ((CUR != '/') || (NXT(1) != '>')))
3051 NEXT;
3052 }
3053
3054failed:
3055 SKIP_BLANKS;
3056 if (cons == ctxt->nbChars) {
3057 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3058 ctxt->sax->error(ctxt->userData,
3059 "htmlParseStartTag: problem parsing attributes\n");
3060 ctxt->wellFormed = 0;
3061 break;
3062 }
3063 }
3064
3065 /*
3066 * Handle specific association to the META tag
3067 */
3068 if (meta)
3069 htmlCheckMeta(ctxt, atts);
3070
3071 /*
3072 * SAX: Start of Element !
3073 */
3074 htmlnamePush(ctxt, xmlStrdup(name));
3075#ifdef DEBUG
3076 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3077#endif
3078 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3079 ctxt->sax->startElement(ctxt->userData, name, atts);
3080
3081 if (atts != NULL) {
3082 for (i = 0;i < nbatts;i++) {
3083 if (atts[i] != NULL)
3084 xmlFree((xmlChar *) atts[i]);
3085 }
3086 xmlFree((void *) atts);
3087 }
3088 if (name != NULL) xmlFree(name);
3089}
3090
3091/**
3092 * htmlParseEndTag:
3093 * @ctxt: an HTML parser context
3094 *
3095 * parse an end of tag
3096 *
3097 * [42] ETag ::= '</' Name S? '>'
3098 *
3099 * With namespace
3100 *
3101 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003102 *
3103 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003104 */
3105
Daniel Veillardf420ac52001-07-04 16:04:09 +00003106static int
Owen Taylor3473f882001-02-23 17:55:21 +00003107htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3108 xmlChar *name;
3109 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003110 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003111
3112 if ((CUR != '<') || (NXT(1) != '/')) {
3113 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3114 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3115 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003116 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003117 }
3118 SKIP(2);
3119
3120 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003121 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003122
3123 /*
3124 * We should definitely be at the ending "S? '>'" part
3125 */
3126 SKIP_BLANKS;
3127 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3128 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3129 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3130 ctxt->wellFormed = 0;
3131 } else
3132 NEXT;
3133
3134 /*
3135 * If the name read is not one of the element in the parsing stack
3136 * then return, it's just an error.
3137 */
3138 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3139 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3140 }
3141 if (i < 0) {
3142 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3143 ctxt->sax->error(ctxt->userData,
3144 "Unexpected end tag : %s\n", name);
3145 xmlFree(name);
3146 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003147 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003148 }
3149
3150
3151 /*
3152 * Check for auto-closure of HTML elements.
3153 */
3154
3155 htmlAutoCloseOnClose(ctxt, name);
3156
3157 /*
3158 * Well formedness constraints, opening and closing must match.
3159 * With the exception that the autoclose may have popped stuff out
3160 * of the stack.
3161 */
3162 if (!xmlStrEqual(name, ctxt->name)) {
3163#ifdef DEBUG
3164 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3165#endif
3166 if ((ctxt->name != NULL) &&
3167 (!xmlStrEqual(ctxt->name, name))) {
3168 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3169 ctxt->sax->error(ctxt->userData,
3170 "Opening and ending tag mismatch: %s and %s\n",
3171 name, ctxt->name);
3172 ctxt->wellFormed = 0;
3173 }
3174 }
3175
3176 /*
3177 * SAX: End of Tag
3178 */
3179 oldname = ctxt->name;
3180 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3181 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3182 ctxt->sax->endElement(ctxt->userData, name);
3183 oldname = htmlnamePop(ctxt);
3184 if (oldname != NULL) {
3185#ifdef DEBUG
3186 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3187#endif
3188 xmlFree(oldname);
3189#ifdef DEBUG
3190 } else {
3191 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3192#endif
3193 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003194 ret = 1;
3195 } else {
3196 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003197 }
3198
3199 if (name != NULL)
3200 xmlFree(name);
3201
Daniel Veillardf420ac52001-07-04 16:04:09 +00003202 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003203}
3204
3205
3206/**
3207 * htmlParseReference:
3208 * @ctxt: an HTML parser context
3209 *
3210 * parse and handle entity references in content,
3211 * this will end-up in a call to character() since this is either a
3212 * CharRef, or a predefined entity.
3213 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003214static void
Owen Taylor3473f882001-02-23 17:55:21 +00003215htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003216 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003217 xmlChar out[6];
3218 xmlChar *name;
3219 if (CUR != '&') return;
3220
3221 if (NXT(1) == '#') {
3222 unsigned int c;
3223 int bits, i = 0;
3224
3225 c = htmlParseCharRef(ctxt);
3226 if (c == 0)
3227 return;
3228
3229 if (c < 0x80) { out[i++]= c; bits= -6; }
3230 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3231 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3232 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3233
3234 for ( ; bits >= 0; bits-= 6) {
3235 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3236 }
3237 out[i] = 0;
3238
3239 htmlCheckParagraph(ctxt);
3240 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3241 ctxt->sax->characters(ctxt->userData, out, i);
3242 } else {
3243 ent = htmlParseEntityRef(ctxt, &name);
3244 if (name == NULL) {
3245 htmlCheckParagraph(ctxt);
3246 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3247 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3248 return;
3249 }
3250 if ((ent == NULL) || (ent->value <= 0)) {
3251 htmlCheckParagraph(ctxt);
3252 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3253 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3254 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3255 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3256 }
3257 } else {
3258 unsigned int c;
3259 int bits, i = 0;
3260
3261 c = ent->value;
3262 if (c < 0x80)
3263 { out[i++]= c; bits= -6; }
3264 else if (c < 0x800)
3265 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3266 else if (c < 0x10000)
3267 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3268 else
3269 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3270
3271 for ( ; bits >= 0; bits-= 6) {
3272 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3273 }
3274 out[i] = 0;
3275
3276 htmlCheckParagraph(ctxt);
3277 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3278 ctxt->sax->characters(ctxt->userData, out, i);
3279 }
3280 xmlFree(name);
3281 }
3282}
3283
3284/**
3285 * htmlParseContent:
3286 * @ctxt: an HTML parser context
3287 * @name: the node name
3288 *
3289 * Parse a content: comment, sub-element, reference or text.
3290 *
3291 */
3292
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003293static void
Owen Taylor3473f882001-02-23 17:55:21 +00003294htmlParseContent(htmlParserCtxtPtr ctxt) {
3295 xmlChar *currentNode;
3296 int depth;
3297
3298 currentNode = xmlStrdup(ctxt->name);
3299 depth = ctxt->nameNr;
3300 while (1) {
3301 long cons = ctxt->nbChars;
3302
3303 GROW;
3304 /*
3305 * Our tag or one of it's parent or children is ending.
3306 */
3307 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003308 if (htmlParseEndTag(ctxt) &&
3309 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3310 if (currentNode != NULL)
3311 xmlFree(currentNode);
3312 return;
3313 }
3314 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003315 }
3316
3317 /*
3318 * Has this node been popped out during parsing of
3319 * the next element
3320 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003321 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3322 (!xmlStrEqual(currentNode, ctxt->name)))
3323 {
Owen Taylor3473f882001-02-23 17:55:21 +00003324 if (currentNode != NULL) xmlFree(currentNode);
3325 return;
3326 }
3327
Daniel Veillardf9533d12001-03-03 10:04:57 +00003328 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3329 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003330 /*
3331 * Handle SCRIPT/STYLE separately
3332 */
3333 htmlParseScript(ctxt);
3334 } else {
3335 /*
3336 * Sometimes DOCTYPE arrives in the middle of the document
3337 */
3338 if ((CUR == '<') && (NXT(1) == '!') &&
3339 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3340 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3341 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3342 (UPP(8) == 'E')) {
3343 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3344 ctxt->sax->error(ctxt->userData,
3345 "Misplaced DOCTYPE declaration\n");
3346 ctxt->wellFormed = 0;
3347 htmlParseDocTypeDecl(ctxt);
3348 }
3349
3350 /*
3351 * First case : a comment
3352 */
3353 if ((CUR == '<') && (NXT(1) == '!') &&
3354 (NXT(2) == '-') && (NXT(3) == '-')) {
3355 htmlParseComment(ctxt);
3356 }
3357
3358 /*
3359 * Second case : a sub-element.
3360 */
3361 else if (CUR == '<') {
3362 htmlParseElement(ctxt);
3363 }
3364
3365 /*
3366 * Third case : a reference. If if has not been resolved,
3367 * parsing returns it's Name, create the node
3368 */
3369 else if (CUR == '&') {
3370 htmlParseReference(ctxt);
3371 }
3372
3373 /*
3374 * Fourth : end of the resource
3375 */
3376 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003377 htmlAutoCloseOnEnd(ctxt);
3378 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003379 }
3380
3381 /*
3382 * Last case, text. Note that References are handled directly.
3383 */
3384 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003385 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003386 }
3387
3388 if (cons == ctxt->nbChars) {
3389 if (ctxt->node != NULL) {
3390 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3391 ctxt->sax->error(ctxt->userData,
3392 "detected an error in element content\n");
3393 ctxt->wellFormed = 0;
3394 }
3395 break;
3396 }
3397 }
3398 GROW;
3399 }
3400 if (currentNode != NULL) xmlFree(currentNode);
3401}
3402
3403/**
3404 * htmlParseElement:
3405 * @ctxt: an HTML parser context
3406 *
3407 * parse an HTML element, this is highly recursive
3408 *
3409 * [39] element ::= EmptyElemTag | STag content ETag
3410 *
3411 * [41] Attribute ::= Name Eq AttValue
3412 */
3413
3414void
3415htmlParseElement(htmlParserCtxtPtr ctxt) {
3416 xmlChar *name;
3417 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003418 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003419 htmlParserNodeInfo node_info;
3420 xmlChar *oldname;
3421 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003422 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003423
3424 /* Capture start position */
3425 if (ctxt->record_info) {
3426 node_info.begin_pos = ctxt->input->consumed +
3427 (CUR_PTR - ctxt->input->base);
3428 node_info.begin_line = ctxt->input->line;
3429 }
3430
3431 oldname = xmlStrdup(ctxt->name);
3432 htmlParseStartTag(ctxt);
3433 name = ctxt->name;
3434#ifdef DEBUG
3435 if (oldname == NULL)
3436 xmlGenericError(xmlGenericErrorContext,
3437 "Start of element %s\n", name);
3438 else if (name == NULL)
3439 xmlGenericError(xmlGenericErrorContext,
3440 "Start of element failed, was %s\n", oldname);
3441 else
3442 xmlGenericError(xmlGenericErrorContext,
3443 "Start of element %s, was %s\n", name, oldname);
3444#endif
3445 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3446 (name == NULL)) {
3447 if (CUR == '>')
3448 NEXT;
3449 if (oldname != NULL)
3450 xmlFree(oldname);
3451 return;
3452 }
3453 if (oldname != NULL)
3454 xmlFree(oldname);
3455
3456 /*
3457 * Lookup the info for that element.
3458 */
3459 info = htmlTagLookup(name);
3460 if (info == NULL) {
3461 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3462 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3463 name);
3464 ctxt->wellFormed = 0;
3465 } else if (info->depr) {
3466/***************************
3467 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3468 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3469 name);
3470 ***************************/
3471 }
3472
3473 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003474 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003475 */
3476 if ((CUR == '/') && (NXT(1) == '>')) {
3477 SKIP(2);
3478 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3479 ctxt->sax->endElement(ctxt->userData, name);
3480 oldname = htmlnamePop(ctxt);
3481#ifdef DEBUG
3482 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3483#endif
3484 if (oldname != NULL)
3485 xmlFree(oldname);
3486 return;
3487 }
3488
3489 if (CUR == '>') {
3490 NEXT;
3491 } else {
3492 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3493 ctxt->sax->error(ctxt->userData,
3494 "Couldn't find end of Start Tag %s\n",
3495 name);
3496 ctxt->wellFormed = 0;
3497
3498 /*
3499 * end of parsing of this node.
3500 */
3501 if (xmlStrEqual(name, ctxt->name)) {
3502 nodePop(ctxt);
3503 oldname = htmlnamePop(ctxt);
3504#ifdef DEBUG
3505 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3506#endif
3507 if (oldname != NULL)
3508 xmlFree(oldname);
3509 }
3510
3511 /*
3512 * Capture end position and add node
3513 */
3514 if ( currentNode != NULL && ctxt->record_info ) {
3515 node_info.end_pos = ctxt->input->consumed +
3516 (CUR_PTR - ctxt->input->base);
3517 node_info.end_line = ctxt->input->line;
3518 node_info.node = ctxt->node;
3519 xmlParserAddNodeInfo(ctxt, &node_info);
3520 }
3521 return;
3522 }
3523
3524 /*
3525 * Check for an Empty Element from DTD definition
3526 */
3527 if ((info != NULL) && (info->empty)) {
3528 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3529 ctxt->sax->endElement(ctxt->userData, name);
3530 oldname = htmlnamePop(ctxt);
3531#ifdef DEBUG
3532 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3533#endif
3534 if (oldname != NULL)
3535 xmlFree(oldname);
3536 return;
3537 }
3538
3539 /*
3540 * Parse the content of the element:
3541 */
3542 currentNode = xmlStrdup(ctxt->name);
3543 depth = ctxt->nameNr;
3544 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003545 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003546 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003547 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003548 if (ctxt->nameNr < depth) break;
3549 }
3550
Owen Taylor3473f882001-02-23 17:55:21 +00003551 /*
3552 * Capture end position and add node
3553 */
3554 if ( currentNode != NULL && ctxt->record_info ) {
3555 node_info.end_pos = ctxt->input->consumed +
3556 (CUR_PTR - ctxt->input->base);
3557 node_info.end_line = ctxt->input->line;
3558 node_info.node = ctxt->node;
3559 xmlParserAddNodeInfo(ctxt, &node_info);
3560 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003561 if (!IS_CHAR(CUR)) {
3562 htmlAutoCloseOnEnd(ctxt);
3563 }
3564
Owen Taylor3473f882001-02-23 17:55:21 +00003565 if (currentNode != NULL)
3566 xmlFree(currentNode);
3567}
3568
3569/**
3570 * htmlParseDocument :
3571 * @ctxt: an HTML parser context
3572 *
3573 * parse an HTML document (and build a tree if using the standard SAX
3574 * interface).
3575 *
3576 * Returns 0, -1 in case of error. the parser context is augmented
3577 * as a result of the parsing.
3578 */
3579
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003580static int
Owen Taylor3473f882001-02-23 17:55:21 +00003581htmlParseDocument(htmlParserCtxtPtr ctxt) {
3582 xmlDtdPtr dtd;
3583
Daniel Veillardd0463562001-10-13 09:15:48 +00003584 xmlInitParser();
3585
Owen Taylor3473f882001-02-23 17:55:21 +00003586 htmlDefaultSAXHandlerInit();
3587 ctxt->html = 1;
3588
3589 GROW;
3590 /*
3591 * SAX: beginning of the document processing.
3592 */
3593 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3594 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3595
3596 /*
3597 * Wipe out everything which is before the first '<'
3598 */
3599 SKIP_BLANKS;
3600 if (CUR == 0) {
3601 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3602 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3603 ctxt->wellFormed = 0;
3604 }
3605
3606 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3607 ctxt->sax->startDocument(ctxt->userData);
3608
3609
3610 /*
3611 * Parse possible comments before any content
3612 */
3613 while ((CUR == '<') && (NXT(1) == '!') &&
3614 (NXT(2) == '-') && (NXT(3) == '-')) {
3615 htmlParseComment(ctxt);
3616 SKIP_BLANKS;
3617 }
3618
3619
3620 /*
3621 * Then possibly doc type declaration(s) and more Misc
3622 * (doctypedecl Misc*)?
3623 */
3624 if ((CUR == '<') && (NXT(1) == '!') &&
3625 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3626 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3627 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3628 (UPP(8) == 'E')) {
3629 htmlParseDocTypeDecl(ctxt);
3630 }
3631 SKIP_BLANKS;
3632
3633 /*
3634 * Parse possible comments before any content
3635 */
3636 while ((CUR == '<') && (NXT(1) == '!') &&
3637 (NXT(2) == '-') && (NXT(3) == '-')) {
3638 htmlParseComment(ctxt);
3639 SKIP_BLANKS;
3640 }
3641
3642 /*
3643 * Time to start parsing the tree itself
3644 */
3645 htmlParseContent(ctxt);
3646
3647 /*
3648 * autoclose
3649 */
3650 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003651 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003652
3653
3654 /*
3655 * SAX: end of the document processing.
3656 */
3657 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3658 ctxt->sax->endDocument(ctxt->userData);
3659
3660 if (ctxt->myDoc != NULL) {
3661 dtd = xmlGetIntSubset(ctxt->myDoc);
3662 if (dtd == NULL)
3663 ctxt->myDoc->intSubset =
3664 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3665 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3666 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3667 }
3668 if (! ctxt->wellFormed) return(-1);
3669 return(0);
3670}
3671
3672
3673/************************************************************************
3674 * *
3675 * Parser contexts handling *
3676 * *
3677 ************************************************************************/
3678
3679/**
3680 * xmlInitParserCtxt:
3681 * @ctxt: an HTML parser context
3682 *
3683 * Initialize a parser context
3684 */
3685
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003686static void
Owen Taylor3473f882001-02-23 17:55:21 +00003687htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3688{
3689 htmlSAXHandler *sax;
3690
3691 if (ctxt == NULL) return;
3692 memset(ctxt, 0, sizeof(htmlParserCtxt));
3693
3694 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3695 if (sax == NULL) {
3696 xmlGenericError(xmlGenericErrorContext,
3697 "htmlInitParserCtxt: out of memory\n");
3698 }
3699 else
3700 memset(sax, 0, sizeof(htmlSAXHandler));
3701
3702 /* Allocate the Input stack */
3703 ctxt->inputTab = (htmlParserInputPtr *)
3704 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3705 if (ctxt->inputTab == NULL) {
3706 xmlGenericError(xmlGenericErrorContext,
3707 "htmlInitParserCtxt: out of memory\n");
3708 ctxt->inputNr = 0;
3709 ctxt->inputMax = 0;
3710 ctxt->input = NULL;
3711 return;
3712 }
3713 ctxt->inputNr = 0;
3714 ctxt->inputMax = 5;
3715 ctxt->input = NULL;
3716 ctxt->version = NULL;
3717 ctxt->encoding = NULL;
3718 ctxt->standalone = -1;
3719 ctxt->instate = XML_PARSER_START;
3720
3721 /* Allocate the Node stack */
3722 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3723 if (ctxt->nodeTab == NULL) {
3724 xmlGenericError(xmlGenericErrorContext,
3725 "htmlInitParserCtxt: out of memory\n");
3726 ctxt->nodeNr = 0;
3727 ctxt->nodeMax = 0;
3728 ctxt->node = NULL;
3729 ctxt->inputNr = 0;
3730 ctxt->inputMax = 0;
3731 ctxt->input = NULL;
3732 return;
3733 }
3734 ctxt->nodeNr = 0;
3735 ctxt->nodeMax = 10;
3736 ctxt->node = NULL;
3737
3738 /* Allocate the Name stack */
3739 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3740 if (ctxt->nameTab == NULL) {
3741 xmlGenericError(xmlGenericErrorContext,
3742 "htmlInitParserCtxt: out of memory\n");
3743 ctxt->nameNr = 0;
3744 ctxt->nameMax = 10;
3745 ctxt->name = NULL;
3746 ctxt->nodeNr = 0;
3747 ctxt->nodeMax = 0;
3748 ctxt->node = NULL;
3749 ctxt->inputNr = 0;
3750 ctxt->inputMax = 0;
3751 ctxt->input = NULL;
3752 return;
3753 }
3754 ctxt->nameNr = 0;
3755 ctxt->nameMax = 10;
3756 ctxt->name = NULL;
3757
3758 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3759 else {
3760 ctxt->sax = sax;
3761 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3762 }
3763 ctxt->userData = ctxt;
3764 ctxt->myDoc = NULL;
3765 ctxt->wellFormed = 1;
3766 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003767 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003768 ctxt->html = 1;
3769 ctxt->record_info = 0;
3770 ctxt->validate = 0;
3771 ctxt->nbChars = 0;
3772 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003773 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003774 xmlInitNodeInfoSeq(&ctxt->node_seq);
3775}
3776
3777/**
3778 * htmlFreeParserCtxt:
3779 * @ctxt: an HTML parser context
3780 *
3781 * Free all the memory used by a parser context. However the parsed
3782 * document in ctxt->myDoc is not freed.
3783 */
3784
3785void
3786htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3787{
3788 xmlFreeParserCtxt(ctxt);
3789}
3790
3791/**
3792 * htmlCreateDocParserCtxt :
3793 * @cur: a pointer to an array of xmlChar
3794 * @encoding: a free form C string describing the HTML document encoding, or NULL
3795 *
3796 * Create a parser context for an HTML document.
3797 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003798 * TODO: check the need to add encoding handling there
3799 *
Owen Taylor3473f882001-02-23 17:55:21 +00003800 * Returns the new parser context or NULL
3801 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003802static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003803htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003804 htmlParserCtxtPtr ctxt;
3805 htmlParserInputPtr input;
3806 /* htmlCharEncoding enc; */
3807
3808 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3809 if (ctxt == NULL) {
3810 perror("malloc");
3811 return(NULL);
3812 }
3813 htmlInitParserCtxt(ctxt);
3814 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3815 if (input == NULL) {
3816 perror("malloc");
3817 xmlFree(ctxt);
3818 return(NULL);
3819 }
3820 memset(input, 0, sizeof(htmlParserInput));
3821
3822 input->line = 1;
3823 input->col = 1;
3824 input->base = cur;
3825 input->cur = cur;
3826
3827 inputPush(ctxt, input);
3828 return(ctxt);
3829}
3830
3831/************************************************************************
3832 * *
3833 * Progressive parsing interfaces *
3834 * *
3835 ************************************************************************/
3836
3837/**
3838 * htmlParseLookupSequence:
3839 * @ctxt: an HTML parser context
3840 * @first: the first char to lookup
3841 * @next: the next char to lookup or zero
3842 * @third: the next char to lookup or zero
3843 *
3844 * Try to find if a sequence (first, next, third) or just (first next) or
3845 * (first) is available in the input stream.
3846 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3847 * to avoid rescanning sequences of bytes, it DOES change the state of the
3848 * parser, do not use liberally.
3849 * This is basically similar to xmlParseLookupSequence()
3850 *
3851 * Returns the index to the current parsing point if the full sequence
3852 * is available, -1 otherwise.
3853 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003854static int
Owen Taylor3473f882001-02-23 17:55:21 +00003855htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3856 xmlChar next, xmlChar third) {
3857 int base, len;
3858 htmlParserInputPtr in;
3859 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003860 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003861
3862 in = ctxt->input;
3863 if (in == NULL) return(-1);
3864 base = in->cur - in->base;
3865 if (base < 0) return(-1);
3866 if (ctxt->checkIndex > base)
3867 base = ctxt->checkIndex;
3868 if (in->buf == NULL) {
3869 buf = in->base;
3870 len = in->length;
3871 } else {
3872 buf = in->buf->buffer->content;
3873 len = in->buf->buffer->use;
3874 }
3875 /* take into account the sequence length */
3876 if (third) len -= 2;
3877 else if (next) len --;
3878 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003879 if (!incomment && (base + 4 < len)) {
3880 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3881 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3882 incomment = 1;
3883 }
3884 /* do not increment base, some people use <!--> */
3885 }
3886 if (incomment) {
3887 if (base + 3 < len)
3888 return(-1);
3889 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3890 (buf[base + 2] == '>')) {
3891 incomment = 0;
3892 base += 2;
3893 }
3894 continue;
3895 }
Owen Taylor3473f882001-02-23 17:55:21 +00003896 if (buf[base] == first) {
3897 if (third != 0) {
3898 if ((buf[base + 1] != next) ||
3899 (buf[base + 2] != third)) continue;
3900 } else if (next != 0) {
3901 if (buf[base + 1] != next) continue;
3902 }
3903 ctxt->checkIndex = 0;
3904#ifdef DEBUG_PUSH
3905 if (next == 0)
3906 xmlGenericError(xmlGenericErrorContext,
3907 "HPP: lookup '%c' found at %d\n",
3908 first, base);
3909 else if (third == 0)
3910 xmlGenericError(xmlGenericErrorContext,
3911 "HPP: lookup '%c%c' found at %d\n",
3912 first, next, base);
3913 else
3914 xmlGenericError(xmlGenericErrorContext,
3915 "HPP: lookup '%c%c%c' found at %d\n",
3916 first, next, third, base);
3917#endif
3918 return(base - (in->cur - in->base));
3919 }
3920 }
3921 ctxt->checkIndex = base;
3922#ifdef DEBUG_PUSH
3923 if (next == 0)
3924 xmlGenericError(xmlGenericErrorContext,
3925 "HPP: lookup '%c' failed\n", first);
3926 else if (third == 0)
3927 xmlGenericError(xmlGenericErrorContext,
3928 "HPP: lookup '%c%c' failed\n", first, next);
3929 else
3930 xmlGenericError(xmlGenericErrorContext,
3931 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3932#endif
3933 return(-1);
3934}
3935
3936/**
3937 * htmlParseTryOrFinish:
3938 * @ctxt: an HTML parser context
3939 * @terminate: last chunk indicator
3940 *
3941 * Try to progress on parsing
3942 *
3943 * Returns zero if no parsing was possible
3944 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003945static int
Owen Taylor3473f882001-02-23 17:55:21 +00003946htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3947 int ret = 0;
3948 htmlParserInputPtr in;
3949 int avail = 0;
3950 xmlChar cur, next;
3951
3952#ifdef DEBUG_PUSH
3953 switch (ctxt->instate) {
3954 case XML_PARSER_EOF:
3955 xmlGenericError(xmlGenericErrorContext,
3956 "HPP: try EOF\n"); break;
3957 case XML_PARSER_START:
3958 xmlGenericError(xmlGenericErrorContext,
3959 "HPP: try START\n"); break;
3960 case XML_PARSER_MISC:
3961 xmlGenericError(xmlGenericErrorContext,
3962 "HPP: try MISC\n");break;
3963 case XML_PARSER_COMMENT:
3964 xmlGenericError(xmlGenericErrorContext,
3965 "HPP: try COMMENT\n");break;
3966 case XML_PARSER_PROLOG:
3967 xmlGenericError(xmlGenericErrorContext,
3968 "HPP: try PROLOG\n");break;
3969 case XML_PARSER_START_TAG:
3970 xmlGenericError(xmlGenericErrorContext,
3971 "HPP: try START_TAG\n");break;
3972 case XML_PARSER_CONTENT:
3973 xmlGenericError(xmlGenericErrorContext,
3974 "HPP: try CONTENT\n");break;
3975 case XML_PARSER_CDATA_SECTION:
3976 xmlGenericError(xmlGenericErrorContext,
3977 "HPP: try CDATA_SECTION\n");break;
3978 case XML_PARSER_END_TAG:
3979 xmlGenericError(xmlGenericErrorContext,
3980 "HPP: try END_TAG\n");break;
3981 case XML_PARSER_ENTITY_DECL:
3982 xmlGenericError(xmlGenericErrorContext,
3983 "HPP: try ENTITY_DECL\n");break;
3984 case XML_PARSER_ENTITY_VALUE:
3985 xmlGenericError(xmlGenericErrorContext,
3986 "HPP: try ENTITY_VALUE\n");break;
3987 case XML_PARSER_ATTRIBUTE_VALUE:
3988 xmlGenericError(xmlGenericErrorContext,
3989 "HPP: try ATTRIBUTE_VALUE\n");break;
3990 case XML_PARSER_DTD:
3991 xmlGenericError(xmlGenericErrorContext,
3992 "HPP: try DTD\n");break;
3993 case XML_PARSER_EPILOG:
3994 xmlGenericError(xmlGenericErrorContext,
3995 "HPP: try EPILOG\n");break;
3996 case XML_PARSER_PI:
3997 xmlGenericError(xmlGenericErrorContext,
3998 "HPP: try PI\n");break;
3999 case XML_PARSER_SYSTEM_LITERAL:
4000 xmlGenericError(xmlGenericErrorContext,
4001 "HPP: try SYSTEM_LITERAL\n");break;
4002 }
4003#endif
4004
4005 while (1) {
4006
4007 in = ctxt->input;
4008 if (in == NULL) break;
4009 if (in->buf == NULL)
4010 avail = in->length - (in->cur - in->base);
4011 else
4012 avail = in->buf->buffer->use - (in->cur - in->base);
4013 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004014 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004015 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4016 /*
4017 * SAX: end of the document processing.
4018 */
4019 ctxt->instate = XML_PARSER_EOF;
4020 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4021 ctxt->sax->endDocument(ctxt->userData);
4022 }
4023 }
4024 if (avail < 1)
4025 goto done;
4026 switch (ctxt->instate) {
4027 case XML_PARSER_EOF:
4028 /*
4029 * Document parsing is done !
4030 */
4031 goto done;
4032 case XML_PARSER_START:
4033 /*
4034 * Very first chars read from the document flow.
4035 */
4036 cur = in->cur[0];
4037 if (IS_BLANK(cur)) {
4038 SKIP_BLANKS;
4039 if (in->buf == NULL)
4040 avail = in->length - (in->cur - in->base);
4041 else
4042 avail = in->buf->buffer->use - (in->cur - in->base);
4043 }
4044 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4045 ctxt->sax->setDocumentLocator(ctxt->userData,
4046 &xmlDefaultSAXLocator);
4047 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4048 (!ctxt->disableSAX))
4049 ctxt->sax->startDocument(ctxt->userData);
4050
4051 cur = in->cur[0];
4052 next = in->cur[1];
4053 if ((cur == '<') && (next == '!') &&
4054 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4055 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4056 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4057 (UPP(8) == 'E')) {
4058 if ((!terminate) &&
4059 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4060 goto done;
4061#ifdef DEBUG_PUSH
4062 xmlGenericError(xmlGenericErrorContext,
4063 "HPP: Parsing internal subset\n");
4064#endif
4065 htmlParseDocTypeDecl(ctxt);
4066 ctxt->instate = XML_PARSER_PROLOG;
4067#ifdef DEBUG_PUSH
4068 xmlGenericError(xmlGenericErrorContext,
4069 "HPP: entering PROLOG\n");
4070#endif
4071 } else {
4072 ctxt->instate = XML_PARSER_MISC;
4073 }
4074#ifdef DEBUG_PUSH
4075 xmlGenericError(xmlGenericErrorContext,
4076 "HPP: entering MISC\n");
4077#endif
4078 break;
4079 case XML_PARSER_MISC:
4080 SKIP_BLANKS;
4081 if (in->buf == NULL)
4082 avail = in->length - (in->cur - in->base);
4083 else
4084 avail = in->buf->buffer->use - (in->cur - in->base);
4085 if (avail < 2)
4086 goto done;
4087 cur = in->cur[0];
4088 next = in->cur[1];
4089 if ((cur == '<') && (next == '!') &&
4090 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4091 if ((!terminate) &&
4092 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4093 goto done;
4094#ifdef DEBUG_PUSH
4095 xmlGenericError(xmlGenericErrorContext,
4096 "HPP: Parsing Comment\n");
4097#endif
4098 htmlParseComment(ctxt);
4099 ctxt->instate = XML_PARSER_MISC;
4100 } else if ((cur == '<') && (next == '!') &&
4101 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4102 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4103 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4104 (UPP(8) == 'E')) {
4105 if ((!terminate) &&
4106 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4107 goto done;
4108#ifdef DEBUG_PUSH
4109 xmlGenericError(xmlGenericErrorContext,
4110 "HPP: Parsing internal subset\n");
4111#endif
4112 htmlParseDocTypeDecl(ctxt);
4113 ctxt->instate = XML_PARSER_PROLOG;
4114#ifdef DEBUG_PUSH
4115 xmlGenericError(xmlGenericErrorContext,
4116 "HPP: entering PROLOG\n");
4117#endif
4118 } else if ((cur == '<') && (next == '!') &&
4119 (avail < 9)) {
4120 goto done;
4121 } else {
4122 ctxt->instate = XML_PARSER_START_TAG;
4123#ifdef DEBUG_PUSH
4124 xmlGenericError(xmlGenericErrorContext,
4125 "HPP: entering START_TAG\n");
4126#endif
4127 }
4128 break;
4129 case XML_PARSER_PROLOG:
4130 SKIP_BLANKS;
4131 if (in->buf == NULL)
4132 avail = in->length - (in->cur - in->base);
4133 else
4134 avail = in->buf->buffer->use - (in->cur - in->base);
4135 if (avail < 2)
4136 goto done;
4137 cur = in->cur[0];
4138 next = in->cur[1];
4139 if ((cur == '<') && (next == '!') &&
4140 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4141 if ((!terminate) &&
4142 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4143 goto done;
4144#ifdef DEBUG_PUSH
4145 xmlGenericError(xmlGenericErrorContext,
4146 "HPP: Parsing Comment\n");
4147#endif
4148 htmlParseComment(ctxt);
4149 ctxt->instate = XML_PARSER_PROLOG;
4150 } else if ((cur == '<') && (next == '!') &&
4151 (avail < 4)) {
4152 goto done;
4153 } else {
4154 ctxt->instate = XML_PARSER_START_TAG;
4155#ifdef DEBUG_PUSH
4156 xmlGenericError(xmlGenericErrorContext,
4157 "HPP: entering START_TAG\n");
4158#endif
4159 }
4160 break;
4161 case XML_PARSER_EPILOG:
4162 if (in->buf == NULL)
4163 avail = in->length - (in->cur - in->base);
4164 else
4165 avail = in->buf->buffer->use - (in->cur - in->base);
4166 if (avail < 1)
4167 goto done;
4168 cur = in->cur[0];
4169 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004170 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004171 goto done;
4172 }
4173 if (avail < 2)
4174 goto done;
4175 next = in->cur[1];
4176 if ((cur == '<') && (next == '!') &&
4177 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4178 if ((!terminate) &&
4179 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4180 goto done;
4181#ifdef DEBUG_PUSH
4182 xmlGenericError(xmlGenericErrorContext,
4183 "HPP: Parsing Comment\n");
4184#endif
4185 htmlParseComment(ctxt);
4186 ctxt->instate = XML_PARSER_EPILOG;
4187 } else if ((cur == '<') && (next == '!') &&
4188 (avail < 4)) {
4189 goto done;
4190 } else {
4191 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004192 ctxt->wellFormed = 0;
4193 ctxt->instate = XML_PARSER_EOF;
4194#ifdef DEBUG_PUSH
4195 xmlGenericError(xmlGenericErrorContext,
4196 "HPP: entering EOF\n");
4197#endif
4198 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4199 ctxt->sax->endDocument(ctxt->userData);
4200 goto done;
4201 }
4202 break;
4203 case XML_PARSER_START_TAG: {
4204 xmlChar *name, *oldname;
4205 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004206 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004207
4208 if (avail < 2)
4209 goto done;
4210 cur = in->cur[0];
4211 if (cur != '<') {
4212 ctxt->instate = XML_PARSER_CONTENT;
4213#ifdef DEBUG_PUSH
4214 xmlGenericError(xmlGenericErrorContext,
4215 "HPP: entering CONTENT\n");
4216#endif
4217 break;
4218 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004219 if (in->cur[1] == '/') {
4220 ctxt->instate = XML_PARSER_END_TAG;
4221 ctxt->checkIndex = 0;
4222#ifdef DEBUG_PUSH
4223 xmlGenericError(xmlGenericErrorContext,
4224 "HPP: entering END_TAG\n");
4225#endif
4226 break;
4227 }
Owen Taylor3473f882001-02-23 17:55:21 +00004228 if ((!terminate) &&
4229 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4230 goto done;
4231
4232 oldname = xmlStrdup(ctxt->name);
4233 htmlParseStartTag(ctxt);
4234 name = ctxt->name;
4235#ifdef DEBUG
4236 if (oldname == NULL)
4237 xmlGenericError(xmlGenericErrorContext,
4238 "Start of element %s\n", name);
4239 else if (name == NULL)
4240 xmlGenericError(xmlGenericErrorContext,
4241 "Start of element failed, was %s\n",
4242 oldname);
4243 else
4244 xmlGenericError(xmlGenericErrorContext,
4245 "Start of element %s, was %s\n",
4246 name, oldname);
4247#endif
4248 if (((depth == ctxt->nameNr) &&
4249 (xmlStrEqual(oldname, ctxt->name))) ||
4250 (name == NULL)) {
4251 if (CUR == '>')
4252 NEXT;
4253 if (oldname != NULL)
4254 xmlFree(oldname);
4255 break;
4256 }
4257 if (oldname != NULL)
4258 xmlFree(oldname);
4259
4260 /*
4261 * Lookup the info for that element.
4262 */
4263 info = htmlTagLookup(name);
4264 if (info == NULL) {
4265 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4266 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4267 name);
4268 ctxt->wellFormed = 0;
4269 } else if (info->depr) {
4270 /***************************
4271 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4272 ctxt->sax->warning(ctxt->userData,
4273 "Tag %s is deprecated\n",
4274 name);
4275 ***************************/
4276 }
4277
4278 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004279 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004280 */
4281 if ((CUR == '/') && (NXT(1) == '>')) {
4282 SKIP(2);
4283 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4284 ctxt->sax->endElement(ctxt->userData, name);
4285 oldname = htmlnamePop(ctxt);
4286#ifdef DEBUG
4287 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4288 oldname);
4289#endif
4290 if (oldname != NULL)
4291 xmlFree(oldname);
4292 ctxt->instate = XML_PARSER_CONTENT;
4293#ifdef DEBUG_PUSH
4294 xmlGenericError(xmlGenericErrorContext,
4295 "HPP: entering CONTENT\n");
4296#endif
4297 break;
4298 }
4299
4300 if (CUR == '>') {
4301 NEXT;
4302 } else {
4303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4304 ctxt->sax->error(ctxt->userData,
4305 "Couldn't find end of Start Tag %s\n",
4306 name);
4307 ctxt->wellFormed = 0;
4308
4309 /*
4310 * end of parsing of this node.
4311 */
4312 if (xmlStrEqual(name, ctxt->name)) {
4313 nodePop(ctxt);
4314 oldname = htmlnamePop(ctxt);
4315#ifdef DEBUG
4316 xmlGenericError(xmlGenericErrorContext,
4317 "End of start tag problem: popping out %s\n", oldname);
4318#endif
4319 if (oldname != NULL)
4320 xmlFree(oldname);
4321 }
4322
4323 ctxt->instate = XML_PARSER_CONTENT;
4324#ifdef DEBUG_PUSH
4325 xmlGenericError(xmlGenericErrorContext,
4326 "HPP: entering CONTENT\n");
4327#endif
4328 break;
4329 }
4330
4331 /*
4332 * Check for an Empty Element from DTD definition
4333 */
4334 if ((info != NULL) && (info->empty)) {
4335 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4336 ctxt->sax->endElement(ctxt->userData, name);
4337 oldname = htmlnamePop(ctxt);
4338#ifdef DEBUG
4339 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4340#endif
4341 if (oldname != NULL)
4342 xmlFree(oldname);
4343 }
4344 ctxt->instate = XML_PARSER_CONTENT;
4345#ifdef DEBUG_PUSH
4346 xmlGenericError(xmlGenericErrorContext,
4347 "HPP: entering CONTENT\n");
4348#endif
4349 break;
4350 }
4351 case XML_PARSER_CONTENT: {
4352 long cons;
4353 /*
4354 * Handle preparsed entities and charRef
4355 */
4356 if (ctxt->token != 0) {
4357 xmlChar chr[2] = { 0 , 0 } ;
4358
4359 chr[0] = (xmlChar) ctxt->token;
4360 htmlCheckParagraph(ctxt);
4361 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4362 ctxt->sax->characters(ctxt->userData, chr, 1);
4363 ctxt->token = 0;
4364 ctxt->checkIndex = 0;
4365 }
4366 if ((avail == 1) && (terminate)) {
4367 cur = in->cur[0];
4368 if ((cur != '<') && (cur != '&')) {
4369 if (ctxt->sax != NULL) {
4370 if (IS_BLANK(cur)) {
4371 if (ctxt->sax->ignorableWhitespace != NULL)
4372 ctxt->sax->ignorableWhitespace(
4373 ctxt->userData, &cur, 1);
4374 } else {
4375 htmlCheckParagraph(ctxt);
4376 if (ctxt->sax->characters != NULL)
4377 ctxt->sax->characters(
4378 ctxt->userData, &cur, 1);
4379 }
4380 }
4381 ctxt->token = 0;
4382 ctxt->checkIndex = 0;
4383 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004384 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004385 }
Owen Taylor3473f882001-02-23 17:55:21 +00004386 }
4387 if (avail < 2)
4388 goto done;
4389 cur = in->cur[0];
4390 next = in->cur[1];
4391 cons = ctxt->nbChars;
4392 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4393 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4394 /*
4395 * Handle SCRIPT/STYLE separately
4396 */
4397 if ((!terminate) &&
4398 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4399 goto done;
4400 htmlParseScript(ctxt);
4401 if ((cur == '<') && (next == '/')) {
4402 ctxt->instate = XML_PARSER_END_TAG;
4403 ctxt->checkIndex = 0;
4404#ifdef DEBUG_PUSH
4405 xmlGenericError(xmlGenericErrorContext,
4406 "HPP: entering END_TAG\n");
4407#endif
4408 break;
4409 }
4410 } else {
4411 /*
4412 * Sometimes DOCTYPE arrives in the middle of the document
4413 */
4414 if ((cur == '<') && (next == '!') &&
4415 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4416 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4417 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4418 (UPP(8) == 'E')) {
4419 if ((!terminate) &&
4420 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4421 goto done;
4422 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4423 ctxt->sax->error(ctxt->userData,
4424 "Misplaced DOCTYPE declaration\n");
4425 ctxt->wellFormed = 0;
4426 htmlParseDocTypeDecl(ctxt);
4427 } else if ((cur == '<') && (next == '!') &&
4428 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4429 if ((!terminate) &&
4430 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4431 goto done;
4432#ifdef DEBUG_PUSH
4433 xmlGenericError(xmlGenericErrorContext,
4434 "HPP: Parsing Comment\n");
4435#endif
4436 htmlParseComment(ctxt);
4437 ctxt->instate = XML_PARSER_CONTENT;
4438 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4439 goto done;
4440 } else if ((cur == '<') && (next == '/')) {
4441 ctxt->instate = XML_PARSER_END_TAG;
4442 ctxt->checkIndex = 0;
4443#ifdef DEBUG_PUSH
4444 xmlGenericError(xmlGenericErrorContext,
4445 "HPP: entering END_TAG\n");
4446#endif
4447 break;
4448 } else if (cur == '<') {
4449 ctxt->instate = XML_PARSER_START_TAG;
4450 ctxt->checkIndex = 0;
4451#ifdef DEBUG_PUSH
4452 xmlGenericError(xmlGenericErrorContext,
4453 "HPP: entering START_TAG\n");
4454#endif
4455 break;
4456 } else if (cur == '&') {
4457 if ((!terminate) &&
4458 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4459 goto done;
4460#ifdef DEBUG_PUSH
4461 xmlGenericError(xmlGenericErrorContext,
4462 "HPP: Parsing Reference\n");
4463#endif
4464 /* TODO: check generation of subtrees if noent !!! */
4465 htmlParseReference(ctxt);
4466 } else {
4467 /* TODO Avoid the extra copy, handle directly !!!!!! */
4468 /*
4469 * Goal of the following test is :
4470 * - minimize calls to the SAX 'character' callback
4471 * when they are mergeable
4472 */
4473 if ((ctxt->inputNr == 1) &&
4474 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4475 if ((!terminate) &&
4476 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4477 goto done;
4478 }
4479 ctxt->checkIndex = 0;
4480#ifdef DEBUG_PUSH
4481 xmlGenericError(xmlGenericErrorContext,
4482 "HPP: Parsing char data\n");
4483#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004484 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004485 }
4486 }
4487 if (cons == ctxt->nbChars) {
4488 if (ctxt->node != NULL) {
4489 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4490 ctxt->sax->error(ctxt->userData,
4491 "detected an error in element content\n");
4492 ctxt->wellFormed = 0;
4493 }
4494 NEXT;
4495 break;
4496 }
4497
4498 break;
4499 }
4500 case XML_PARSER_END_TAG:
4501 if (avail < 2)
4502 goto done;
4503 if ((!terminate) &&
4504 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4505 goto done;
4506 htmlParseEndTag(ctxt);
4507 if (ctxt->nameNr == 0) {
4508 ctxt->instate = XML_PARSER_EPILOG;
4509 } else {
4510 ctxt->instate = XML_PARSER_CONTENT;
4511 }
4512 ctxt->checkIndex = 0;
4513#ifdef DEBUG_PUSH
4514 xmlGenericError(xmlGenericErrorContext,
4515 "HPP: entering CONTENT\n");
4516#endif
4517 break;
4518 case XML_PARSER_CDATA_SECTION:
4519 xmlGenericError(xmlGenericErrorContext,
4520 "HPP: internal error, state == CDATA\n");
4521 ctxt->instate = XML_PARSER_CONTENT;
4522 ctxt->checkIndex = 0;
4523#ifdef DEBUG_PUSH
4524 xmlGenericError(xmlGenericErrorContext,
4525 "HPP: entering CONTENT\n");
4526#endif
4527 break;
4528 case XML_PARSER_DTD:
4529 xmlGenericError(xmlGenericErrorContext,
4530 "HPP: internal error, state == DTD\n");
4531 ctxt->instate = XML_PARSER_CONTENT;
4532 ctxt->checkIndex = 0;
4533#ifdef DEBUG_PUSH
4534 xmlGenericError(xmlGenericErrorContext,
4535 "HPP: entering CONTENT\n");
4536#endif
4537 break;
4538 case XML_PARSER_COMMENT:
4539 xmlGenericError(xmlGenericErrorContext,
4540 "HPP: internal error, state == COMMENT\n");
4541 ctxt->instate = XML_PARSER_CONTENT;
4542 ctxt->checkIndex = 0;
4543#ifdef DEBUG_PUSH
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: entering CONTENT\n");
4546#endif
4547 break;
4548 case XML_PARSER_PI:
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: internal error, state == PI\n");
4551 ctxt->instate = XML_PARSER_CONTENT;
4552 ctxt->checkIndex = 0;
4553#ifdef DEBUG_PUSH
4554 xmlGenericError(xmlGenericErrorContext,
4555 "HPP: entering CONTENT\n");
4556#endif
4557 break;
4558 case XML_PARSER_ENTITY_DECL:
4559 xmlGenericError(xmlGenericErrorContext,
4560 "HPP: internal error, state == ENTITY_DECL\n");
4561 ctxt->instate = XML_PARSER_CONTENT;
4562 ctxt->checkIndex = 0;
4563#ifdef DEBUG_PUSH
4564 xmlGenericError(xmlGenericErrorContext,
4565 "HPP: entering CONTENT\n");
4566#endif
4567 break;
4568 case XML_PARSER_ENTITY_VALUE:
4569 xmlGenericError(xmlGenericErrorContext,
4570 "HPP: internal error, state == ENTITY_VALUE\n");
4571 ctxt->instate = XML_PARSER_CONTENT;
4572 ctxt->checkIndex = 0;
4573#ifdef DEBUG_PUSH
4574 xmlGenericError(xmlGenericErrorContext,
4575 "HPP: entering DTD\n");
4576#endif
4577 break;
4578 case XML_PARSER_ATTRIBUTE_VALUE:
4579 xmlGenericError(xmlGenericErrorContext,
4580 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4581 ctxt->instate = XML_PARSER_START_TAG;
4582 ctxt->checkIndex = 0;
4583#ifdef DEBUG_PUSH
4584 xmlGenericError(xmlGenericErrorContext,
4585 "HPP: entering START_TAG\n");
4586#endif
4587 break;
4588 case XML_PARSER_SYSTEM_LITERAL:
4589 xmlGenericError(xmlGenericErrorContext,
4590 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4591 ctxt->instate = XML_PARSER_CONTENT;
4592 ctxt->checkIndex = 0;
4593#ifdef DEBUG_PUSH
4594 xmlGenericError(xmlGenericErrorContext,
4595 "HPP: entering CONTENT\n");
4596#endif
4597 break;
4598 case XML_PARSER_IGNORE:
4599 xmlGenericError(xmlGenericErrorContext,
4600 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4601 ctxt->instate = XML_PARSER_CONTENT;
4602 ctxt->checkIndex = 0;
4603#ifdef DEBUG_PUSH
4604 xmlGenericError(xmlGenericErrorContext,
4605 "HPP: entering CONTENT\n");
4606#endif
4607 break;
4608 }
4609 }
4610done:
4611 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004612 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004613 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4614 /*
4615 * SAX: end of the document processing.
4616 */
4617 ctxt->instate = XML_PARSER_EOF;
4618 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4619 ctxt->sax->endDocument(ctxt->userData);
4620 }
4621 }
4622 if ((ctxt->myDoc != NULL) &&
4623 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4624 (ctxt->instate == XML_PARSER_EPILOG))) {
4625 xmlDtdPtr dtd;
4626 dtd = xmlGetIntSubset(ctxt->myDoc);
4627 if (dtd == NULL)
4628 ctxt->myDoc->intSubset =
4629 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4630 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4631 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4632 }
4633#ifdef DEBUG_PUSH
4634 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4635#endif
4636 return(ret);
4637}
4638
4639/**
Owen Taylor3473f882001-02-23 17:55:21 +00004640 * htmlParseChunk:
4641 * @ctxt: an XML parser context
4642 * @chunk: an char array
4643 * @size: the size in byte of the chunk
4644 * @terminate: last chunk indicator
4645 *
4646 * Parse a Chunk of memory
4647 *
4648 * Returns zero if no error, the xmlParserErrors otherwise.
4649 */
4650int
4651htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4652 int terminate) {
4653 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4654 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4655 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4656 int cur = ctxt->input->cur - ctxt->input->base;
4657
4658 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4659 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4660 ctxt->input->cur = ctxt->input->base + cur;
4661#ifdef DEBUG_PUSH
4662 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4663#endif
4664
4665 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4666 htmlParseTryOrFinish(ctxt, terminate);
4667 } else if (ctxt->instate != XML_PARSER_EOF) {
4668 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4669 htmlParseTryOrFinish(ctxt, terminate);
4670 }
4671 if (terminate) {
4672 if ((ctxt->instate != XML_PARSER_EOF) &&
4673 (ctxt->instate != XML_PARSER_EPILOG) &&
4674 (ctxt->instate != XML_PARSER_MISC)) {
4675 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004676 ctxt->wellFormed = 0;
4677 }
4678 if (ctxt->instate != XML_PARSER_EOF) {
4679 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4680 ctxt->sax->endDocument(ctxt->userData);
4681 }
4682 ctxt->instate = XML_PARSER_EOF;
4683 }
4684 return((xmlParserErrors) ctxt->errNo);
4685}
4686
4687/************************************************************************
4688 * *
4689 * User entry points *
4690 * *
4691 ************************************************************************/
4692
4693/**
4694 * htmlCreatePushParserCtxt :
4695 * @sax: a SAX handler
4696 * @user_data: The user data returned on SAX callbacks
4697 * @chunk: a pointer to an array of chars
4698 * @size: number of chars in the array
4699 * @filename: an optional file name or URI
4700 * @enc: an optional encoding
4701 *
4702 * Create a parser context for using the HTML parser in push mode
4703 * To allow content encoding detection, @size should be >= 4
4704 * The value of @filename is used for fetching external entities
4705 * and error/warning reports.
4706 *
4707 * Returns the new parser context or NULL
4708 */
4709htmlParserCtxtPtr
4710htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4711 const char *chunk, int size, const char *filename,
4712 xmlCharEncoding enc) {
4713 htmlParserCtxtPtr ctxt;
4714 htmlParserInputPtr inputStream;
4715 xmlParserInputBufferPtr buf;
4716
Daniel Veillardd0463562001-10-13 09:15:48 +00004717 xmlInitParser();
4718
Owen Taylor3473f882001-02-23 17:55:21 +00004719 buf = xmlAllocParserInputBuffer(enc);
4720 if (buf == NULL) return(NULL);
4721
4722 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4723 if (ctxt == NULL) {
4724 xmlFree(buf);
4725 return(NULL);
4726 }
4727 memset(ctxt, 0, sizeof(htmlParserCtxt));
4728 htmlInitParserCtxt(ctxt);
4729 if (sax != NULL) {
4730 if (ctxt->sax != &htmlDefaultSAXHandler)
4731 xmlFree(ctxt->sax);
4732 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4733 if (ctxt->sax == NULL) {
4734 xmlFree(buf);
4735 xmlFree(ctxt);
4736 return(NULL);
4737 }
4738 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4739 if (user_data != NULL)
4740 ctxt->userData = user_data;
4741 }
4742 if (filename == NULL) {
4743 ctxt->directory = NULL;
4744 } else {
4745 ctxt->directory = xmlParserGetDirectory(filename);
4746 }
4747
4748 inputStream = htmlNewInputStream(ctxt);
4749 if (inputStream == NULL) {
4750 xmlFreeParserCtxt(ctxt);
4751 return(NULL);
4752 }
4753
4754 if (filename == NULL)
4755 inputStream->filename = NULL;
4756 else
4757 inputStream->filename = xmlMemStrdup(filename);
4758 inputStream->buf = buf;
4759 inputStream->base = inputStream->buf->buffer->content;
4760 inputStream->cur = inputStream->buf->buffer->content;
4761
4762 inputPush(ctxt, inputStream);
4763
4764 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4765 (ctxt->input->buf != NULL)) {
4766 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4767#ifdef DEBUG_PUSH
4768 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4769#endif
4770 }
4771
4772 return(ctxt);
4773}
4774
4775/**
4776 * htmlSAXParseDoc :
4777 * @cur: a pointer to an array of xmlChar
4778 * @encoding: a free form C string describing the HTML document encoding, or NULL
4779 * @sax: the SAX handler block
4780 * @userData: if using SAX, this pointer will be provided on callbacks.
4781 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004782 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4783 * to handle parse events. If sax is NULL, fallback to the default DOM
4784 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004785 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004786 * Returns the resulting document tree unless SAX is NULL or the document is
4787 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004788 */
4789
4790htmlDocPtr
4791htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4792 htmlDocPtr ret;
4793 htmlParserCtxtPtr ctxt;
4794
Daniel Veillardd0463562001-10-13 09:15:48 +00004795 xmlInitParser();
4796
Owen Taylor3473f882001-02-23 17:55:21 +00004797 if (cur == NULL) return(NULL);
4798
4799
4800 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4801 if (ctxt == NULL) return(NULL);
4802 if (sax != NULL) {
4803 ctxt->sax = sax;
4804 ctxt->userData = userData;
4805 }
4806
4807 htmlParseDocument(ctxt);
4808 ret = ctxt->myDoc;
4809 if (sax != NULL) {
4810 ctxt->sax = NULL;
4811 ctxt->userData = NULL;
4812 }
4813 htmlFreeParserCtxt(ctxt);
4814
4815 return(ret);
4816}
4817
4818/**
4819 * htmlParseDoc :
4820 * @cur: a pointer to an array of xmlChar
4821 * @encoding: a free form C string describing the HTML document encoding, or NULL
4822 *
4823 * parse an HTML in-memory document and build a tree.
4824 *
4825 * Returns the resulting document tree
4826 */
4827
4828htmlDocPtr
4829htmlParseDoc(xmlChar *cur, const char *encoding) {
4830 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4831}
4832
4833
4834/**
4835 * htmlCreateFileParserCtxt :
4836 * @filename: the filename
4837 * @encoding: a free form C string describing the HTML document encoding, or NULL
4838 *
4839 * Create a parser context for a file content.
4840 * Automatic support for ZLIB/Compress compressed document is provided
4841 * by default if found at compile-time.
4842 *
4843 * Returns the new parser context or NULL
4844 */
4845htmlParserCtxtPtr
4846htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4847{
4848 htmlParserCtxtPtr ctxt;
4849 htmlParserInputPtr inputStream;
4850 xmlParserInputBufferPtr buf;
4851 /* htmlCharEncoding enc; */
4852 xmlChar *content, *content_line = (xmlChar *) "charset=";
4853
4854 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4855 if (buf == NULL) return(NULL);
4856
4857 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4858 if (ctxt == NULL) {
4859 perror("malloc");
4860 return(NULL);
4861 }
4862 memset(ctxt, 0, sizeof(htmlParserCtxt));
4863 htmlInitParserCtxt(ctxt);
4864 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4865 if (inputStream == NULL) {
4866 perror("malloc");
4867 xmlFree(ctxt);
4868 return(NULL);
4869 }
4870 memset(inputStream, 0, sizeof(htmlParserInput));
4871
4872 inputStream->filename = xmlMemStrdup(filename);
4873 inputStream->line = 1;
4874 inputStream->col = 1;
4875 inputStream->buf = buf;
4876 inputStream->directory = NULL;
4877
4878 inputStream->base = inputStream->buf->buffer->content;
4879 inputStream->cur = inputStream->buf->buffer->content;
4880 inputStream->free = NULL;
4881
4882 inputPush(ctxt, inputStream);
4883
4884 /* set encoding */
4885 if (encoding) {
4886 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4887 if (content) {
4888 strcpy ((char *)content, (char *)content_line);
4889 strcat ((char *)content, (char *)encoding);
4890 htmlCheckEncoding (ctxt, content);
4891 xmlFree (content);
4892 }
4893 }
4894
4895 return(ctxt);
4896}
4897
4898/**
4899 * htmlSAXParseFile :
4900 * @filename: the filename
4901 * @encoding: a free form C string describing the HTML document encoding, or NULL
4902 * @sax: the SAX handler block
4903 * @userData: if using SAX, this pointer will be provided on callbacks.
4904 *
4905 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4906 * compressed document is provided by default if found at compile-time.
4907 * It use the given SAX function block to handle the parsing callback.
4908 * If sax is NULL, fallback to the default DOM tree building routines.
4909 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004910 * Returns the resulting document tree unless SAX is NULL or the document is
4911 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004912 */
4913
4914htmlDocPtr
4915htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4916 void *userData) {
4917 htmlDocPtr ret;
4918 htmlParserCtxtPtr ctxt;
4919 htmlSAXHandlerPtr oldsax = NULL;
4920
Daniel Veillardd0463562001-10-13 09:15:48 +00004921 xmlInitParser();
4922
Owen Taylor3473f882001-02-23 17:55:21 +00004923 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4924 if (ctxt == NULL) return(NULL);
4925 if (sax != NULL) {
4926 oldsax = ctxt->sax;
4927 ctxt->sax = sax;
4928 ctxt->userData = userData;
4929 }
4930
4931 htmlParseDocument(ctxt);
4932
4933 ret = ctxt->myDoc;
4934 if (sax != NULL) {
4935 ctxt->sax = oldsax;
4936 ctxt->userData = NULL;
4937 }
4938 htmlFreeParserCtxt(ctxt);
4939
4940 return(ret);
4941}
4942
4943/**
4944 * htmlParseFile :
4945 * @filename: the filename
4946 * @encoding: a free form C string describing the HTML document encoding, or NULL
4947 *
4948 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4949 * compressed document is provided by default if found at compile-time.
4950 *
4951 * Returns the resulting document tree
4952 */
4953
4954htmlDocPtr
4955htmlParseFile(const char *filename, const char *encoding) {
4956 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4957}
4958
4959/**
4960 * htmlHandleOmittedElem:
4961 * @val: int 0 or 1
4962 *
4963 * Set and return the previous value for handling HTML omitted tags.
4964 *
4965 * Returns the last value for 0 for no handling, 1 for auto insertion.
4966 */
4967
4968int
4969htmlHandleOmittedElem(int val) {
4970 int old = htmlOmittedDefaultValue;
4971
4972 htmlOmittedDefaultValue = val;
4973 return(old);
4974}
4975
4976#endif /* LIBXML_HTML_ENABLED */