blob: 4c819d1bff1b3569f803921c8abcf14862e515d9 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045
46#define HTML_MAX_NAMELEN 1000
47#define HTML_PARSER_BIG_BUFFER_SIZE 1000
48#define HTML_PARSER_BUFFER_SIZE 100
49
50/* #define DEBUG */
51/* #define DEBUG_PUSH */
52
Daniel Veillard22090732001-07-16 00:06:07 +000053static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000054
Daniel Veillard56a4cb82001-03-24 17:00:36 +000055xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000057static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000058
59/************************************************************************
60 * *
Owen Taylor3473f882001-02-23 17:55:21 +000061 * Parser stacks related functions and macros *
62 * *
63 ************************************************************************/
64
65/*
66 * Generic function for accessing stacks in the Parser Context
67 */
68
69#define PUSH_AND_POP(scope, type, name) \
70scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
71 if (ctxt->name##Nr >= ctxt->name##Max) { \
72 ctxt->name##Max *= 2; \
73 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75 if (ctxt->name##Tab == NULL) { \
76 xmlGenericError(xmlGenericErrorContext, \
77 "realloc failed !\n"); \
78 return(0); \
79 } \
80 } \
81 ctxt->name##Tab[ctxt->name##Nr] = value; \
82 ctxt->name = value; \
83 return(ctxt->name##Nr++); \
84} \
85scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
86 type ret; \
87 if (ctxt->name##Nr < 0) return(0); \
88 ctxt->name##Nr--; \
89 if (ctxt->name##Nr < 0) return(0); \
90 if (ctxt->name##Nr > 0) \
91 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
92 else \
93 ctxt->name = NULL; \
94 ret = ctxt->name##Tab[ctxt->name##Nr]; \
95 ctxt->name##Tab[ctxt->name##Nr] = 0; \
96 return(ret); \
97} \
98
Daniel Veillard56a4cb82001-03-24 17:00:36 +000099/* PUSH_AND_POP(static, xmlNodePtr, node) */
100PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000101
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  All of them expect a variable named "ctxt" to be in
 * scope at the point of use.
 *
 * Byte-level macros: they look at raw xmlChar values of the input, so they
 * should only be used to compare against or skip over ASCII constructs.
 *
 *   CUR_PTR     the current input pointer (ctxt->input->cur), usable as an
 *               lvalue.
 *   CUR         the xmlChar at the current position, as an int.
 *   CURRENT     same value as CUR.
 *   RAW         -1 when ctxt->token is set, otherwise the current xmlChar.
 *   NXT(n)      the xmlChar n positions after the current one.
 *   UPPER       the current xmlChar passed through toupper().
 *   UPP(n)      NXT(n) passed through toupper().
 *   SKIP(n)     advance the input pointer and ctxt->nbChars by n.
 *
 * Helpers delegating to functions:
 *
 *   NEXT          calls xmlNextChar(ctxt) and increments ctxt->nbChars.
 *   NEXTL(l)      advance by l bytes, updating line/col and clearing
 *                 ctxt->token; counts as one character in ctxt->nbChars.
 *   SHRINK        xmlParserInputShrink() on the current input.
 *   GROW          xmlParserInputGrow() on the current input by INPUT_CHUNK.
 *   SKIP_BLANKS   htmlSkipBlankChars(ctxt).
 *   CUR_CHAR(l)   htmlCurrentChar(ctxt, &l).
 *   CUR_SCHAR(s, l)   xmlStringCurrentChar(ctxt, s, &l).
 *   COPY_BUF(l,b,i,v) append char v of encoded length l to buffer b at
 *                 index i, advancing i.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * NOTE: COPY_BUF expands to a bare if/else statement; only use it as a
 * complete statement, never as the unbraced body of another if.
 */
#define COPY_BUF(l,b,i,v)                                               \
    if (l == 1) b[i++] = (xmlChar) v;                                   \
    else i += xmlCopyChar(l,&b[i],v)
178
179/**
180 * htmlCurrentChar:
181 * @ctxt: the HTML parser context
182 * @len: pointer to the length of the char read
183 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000184 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000185 * bytes in the input buffer. Implement the end of line normalization:
186 * 2.11 End-of-Line Handling
187 * If the encoding is unspecified, in the case we find an ISO-Latin-1
188 * char, then the encoding converter is plugged in automatically.
189 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000190 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000191 */
192
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000193static int
Owen Taylor3473f882001-02-23 17:55:21 +0000194htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
195 if (ctxt->instate == XML_PARSER_EOF)
196 return(0);
197
198 if (ctxt->token != 0) {
199 *len = 0;
200 return(ctxt->token);
201 }
202 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
203 /*
204 * We are supposed to handle UTF8, check it's valid
205 * From rfc2044: encoding of the Unicode values on UTF-8:
206 *
207 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
208 * 0000 0000-0000 007F 0xxxxxxx
209 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
210 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
211 *
212 * Check for the 0x110000 limit too
213 */
214 const unsigned char *cur = ctxt->input->cur;
215 unsigned char c;
216 unsigned int val;
217
218 c = *cur;
219 if (c & 0x80) {
220 if (cur[1] == 0)
221 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
222 if ((cur[1] & 0xc0) != 0x80)
223 goto encoding_error;
224 if ((c & 0xe0) == 0xe0) {
225
226 if (cur[2] == 0)
227 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
228 if ((cur[2] & 0xc0) != 0x80)
229 goto encoding_error;
230 if ((c & 0xf0) == 0xf0) {
231 if (cur[3] == 0)
232 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
233 if (((c & 0xf8) != 0xf0) ||
234 ((cur[3] & 0xc0) != 0x80))
235 goto encoding_error;
236 /* 4-byte code */
237 *len = 4;
238 val = (cur[0] & 0x7) << 18;
239 val |= (cur[1] & 0x3f) << 12;
240 val |= (cur[2] & 0x3f) << 6;
241 val |= cur[3] & 0x3f;
242 } else {
243 /* 3-byte code */
244 *len = 3;
245 val = (cur[0] & 0xf) << 12;
246 val |= (cur[1] & 0x3f) << 6;
247 val |= cur[2] & 0x3f;
248 }
249 } else {
250 /* 2-byte code */
251 *len = 2;
252 val = (cur[0] & 0x1f) << 6;
253 val |= cur[1] & 0x3f;
254 }
255 if (!IS_CHAR(val)) {
256 ctxt->errNo = XML_ERR_INVALID_ENCODING;
257 if ((ctxt->sax != NULL) &&
258 (ctxt->sax->error != NULL))
259 ctxt->sax->error(ctxt->userData,
260 "Char 0x%X out of allowed range\n", val);
261 ctxt->wellFormed = 0;
262 ctxt->disableSAX = 1;
263 }
264 return(val);
265 } else {
266 /* 1-byte code */
267 *len = 1;
268 return((int) *ctxt->input->cur);
269 }
270 }
271 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000272 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000273 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000274 * XML constructs only use < 128 chars
275 */
276 *len = 1;
277 if ((int) *ctxt->input->cur < 0x80)
278 return((int) *ctxt->input->cur);
279
280 /*
281 * Humm this is bad, do an automatic flow conversion
282 */
283 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
284 ctxt->charset = XML_CHAR_ENCODING_UTF8;
285 return(xmlCurrentChar(ctxt, len));
286
287encoding_error:
288 /*
289 * If we detect an UTF8 error that probably mean that the
290 * input encoding didn't get properly advertized in the
291 * declaration header. Report the error and switch the encoding
292 * to ISO-Latin-1 (if you don't like this policy, just declare the
293 * encoding !)
294 */
295 ctxt->errNo = XML_ERR_INVALID_ENCODING;
296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
297 ctxt->sax->error(ctxt->userData,
298 "Input is not proper UTF-8, indicate encoding !\n");
299 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
300 ctxt->input->cur[0], ctxt->input->cur[1],
301 ctxt->input->cur[2], ctxt->input->cur[3]);
302 }
303
304 ctxt->charset = XML_CHAR_ENCODING_8859_1;
305 *len = 1;
306 return((int) *ctxt->input->cur);
307}
308
309/**
Owen Taylor3473f882001-02-23 17:55:21 +0000310 * htmlSkipBlankChars:
311 * @ctxt: the HTML parser context
312 *
313 * skip all blanks character found at that point in the input streams.
314 *
315 * Returns the number of space chars skipped
316 */
317
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000318static int
Owen Taylor3473f882001-02-23 17:55:21 +0000319htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
320 int res = 0;
321
322 while (IS_BLANK(*(ctxt->input->cur))) {
323 if ((*ctxt->input->cur == 0) &&
324 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
325 xmlPopInput(ctxt);
326 } else {
327 if (*(ctxt->input->cur) == '\n') {
328 ctxt->input->line++; ctxt->input->col = 1;
329 } else ctxt->input->col++;
330 ctxt->input->cur++;
331 ctxt->nbChars++;
332 if (*ctxt->input->cur == 0)
333 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
334 }
335 res++;
336 }
337 return(res);
338}
339
340
341
342/************************************************************************
343 * *
344 * The list of HTML elements and their properties *
345 * *
346 ************************************************************************/
347
348/*
349 * Start Tag: 1 means the start tag can be ommited
350 * End Tag: 1 means the end tag can be ommited
351 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000352 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000353 * Depr: this element is deprecated
354 * DTD: 1 means that this element is valid only in the Loose DTD
355 * 2 means that this element is valid only in the Frameset DTD
356 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000357 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
Daniel Veillard22090732001-07-16 00:06:07 +0000359static const htmlElemDesc
360html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000361{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
362{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
363{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
364{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
365{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
366{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
367{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
368{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
369{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
370{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
371{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
372{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
373{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
374{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
375{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
376{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
377{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
378{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
379{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
380{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
381{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
382{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
383{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
384{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
385{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
386{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
387{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
388{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
389{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
390{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
391{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
392{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
393{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
394{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
395{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
400{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
401{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
402{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
403{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
404{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
405{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
406{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
407{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
408{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
409{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
410{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
411{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
412{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
413{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
414{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
415{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
416{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
417{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
418{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
419{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
420{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
421{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
422{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
423{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
424{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
425{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
426{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
427{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
428{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
429{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
430{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
431{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
432{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
433{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
434{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
435{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
436{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
437{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
438{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
439{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
440{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
441{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
442{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
443{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
444{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
445{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
446{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
447{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
448{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
449{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
450{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
451{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000452};
453
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups, closed by one extra
 * NULL.  In each group the first string is the name of a start tag and
 * the following strings are the names of the open elements that this
 * start tag implicitly closes.
 */
static const char *htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",          "p", "head", NULL,
"noscript",     "p", "head", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"option",       "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
NULL
};
513
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html",
    "head",
    "body",
    NULL
};
527
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array is not NULL-terminated; use sizeof to get its length.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",  /* was misspelled "onrest", which is not an HTML 4 event */
    "onchange",
    "onselect"
};
553
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;   /* element name, NULL on the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

/* The last entry (NULL name) carries the priority of unlisted elements. */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000581
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000582static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000583static int htmlStartCloseIndexinitialized = 0;
584
585/************************************************************************
586 * *
587 * functions to handle HTML specific data *
588 * *
589 ************************************************************************/
590
591/**
592 * htmlInitAutoClose:
593 *
594 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
595 * This is not reentrant. Call xmlInitParser() once before processing in
596 * case of use in multithreaded programs.
597 */
598void
599htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000600 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000601
602 if (htmlStartCloseIndexinitialized) return;
603
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000604 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
605 indx = 0;
606 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
607 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000608 while (htmlStartClose[i] != NULL) i++;
609 i++;
610 }
611 htmlStartCloseIndexinitialized = 1;
612}
613
614/**
615 * htmlTagLookup:
616 * @tag: The tag name in lowercase
617 *
618 * Lookup the HTML tag in the ElementTable
619 *
620 * Returns the related htmlElemDescPtr or NULL if not found.
621 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000622const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000623htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000624 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000625
626 for (i = 0; i < (sizeof(html40ElementTable) /
627 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000628 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000629 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000630 }
631 return(NULL);
632}
633
634/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000635 * htmlGetEndPriority:
636 * @name: The name of the element to look up the priority for.
637 *
638 * Return value: The "endtag" priority.
639 **/
640static int
641htmlGetEndPriority (const xmlChar *name) {
642 int i = 0;
643
644 while ((htmlEndPriority[i].name != NULL) &&
645 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
646 i++;
647
648 return(htmlEndPriority[i].priority);
649}
650
651/**
Owen Taylor3473f882001-02-23 17:55:21 +0000652 * htmlCheckAutoClose:
653 * @newtag: The new tag name
654 * @oldtag: The old tag name
655 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000656 * Checks whether the new tag is one of the registered valid tags for
657 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +0000658 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
659 *
660 * Returns 0 if no, 1 if yes.
661 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000662static int
Owen Taylor3473f882001-02-23 17:55:21 +0000663htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000664 int i, indx;
665 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000666
667 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
668
669 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000670 for (indx = 0; indx < 100;indx++) {
671 closed = htmlStartCloseIndex[indx];
672 if (closed == NULL) return(0);
673 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000674 }
675
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000676 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000677 i++;
678 while (htmlStartClose[i] != NULL) {
679 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
680 return(1);
681 }
682 i++;
683 }
684 return(0);
685}
686
687/**
688 * htmlAutoCloseOnClose:
689 * @ctxt: an HTML parser context
690 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000691 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000692 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000693 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000694 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000695static void
Owen Taylor3473f882001-02-23 17:55:21 +0000696htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000697 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000698 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000699 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000700
701#ifdef DEBUG
702 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
703 for (i = 0;i < ctxt->nameNr;i++)
704 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
705#endif
706
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000707 priority = htmlGetEndPriority (newtag);
708
Owen Taylor3473f882001-02-23 17:55:21 +0000709 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000710
Owen Taylor3473f882001-02-23 17:55:21 +0000711 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000712 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000713 * A missplaced endtag can only close elements with lower
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000714 * or equal priority, so if we find an element with higher
715 * priority before we find an element with
716 * matching name, we just ignore this endtag
717 */
718 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000719 }
720 if (i < 0) return;
721
722 while (!xmlStrEqual(newtag, ctxt->name)) {
723 info = htmlTagLookup(ctxt->name);
724 if ((info == NULL) || (info->endTag == 1)) {
725#ifdef DEBUG
726 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
727#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000728 } else if (info->endTag == 3) {
729#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000730 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000731
Daniel Veillard56098d42001-04-24 12:51:09 +0000732#endif
733 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
734 ctxt->sax->error(ctxt->userData,
735 "Opening and ending tag mismatch: %s and %s\n",
736 newtag, ctxt->name);
737 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000738 }
739 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
740 ctxt->sax->endElement(ctxt->userData, ctxt->name);
741 oldname = htmlnamePop(ctxt);
742 if (oldname != NULL) {
743#ifdef DEBUG
744 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
745#endif
746 xmlFree(oldname);
747 }
748 }
749}
750
751/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000752 * htmlAutoCloseOnEnd:
753 * @ctxt: an HTML parser context
754 *
755 * Close all remaining tags at the end of the stream
756 */
757static void
758htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
759 xmlChar *oldname;
760 int i;
761
762 if (ctxt->nameNr == 0)
763 return;
764#ifdef DEBUG
765 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
766#endif
767
768 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
769#ifdef DEBUG
770 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
771#endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776#ifdef DEBUG
777 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
778#endif
779 xmlFree(oldname);
780 }
781 }
782}
783
784/**
Owen Taylor3473f882001-02-23 17:55:21 +0000785 * htmlAutoClose:
786 * @ctxt: an HTML parser context
787 * @newtag: The new tag name or NULL
788 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000789 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000790 * The list is kept in htmlStartClose array. This function is
791 * called when a new tag has been detected and generates the
792 * appropriates closes if possible/needed.
793 * If newtag is NULL this mean we are at the end of the resource
794 * and we should check
795 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000796static void
Owen Taylor3473f882001-02-23 17:55:21 +0000797htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
798 xmlChar *oldname;
799 while ((newtag != NULL) && (ctxt->name != NULL) &&
800 (htmlCheckAutoClose(newtag, ctxt->name))) {
801#ifdef DEBUG
802 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
803#endif
804 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
805 ctxt->sax->endElement(ctxt->userData, ctxt->name);
806 oldname = htmlnamePop(ctxt);
807 if (oldname != NULL) {
808#ifdef DEBUG
809 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
810#endif
811 xmlFree(oldname);
812 }
813 }
814 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000815 htmlAutoCloseOnEnd(ctxt);
816 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000817 }
818 while ((newtag == NULL) && (ctxt->name != NULL) &&
819 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
820 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
821 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
822#ifdef DEBUG
823 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
824#endif
825 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
826 ctxt->sax->endElement(ctxt->userData, ctxt->name);
827 oldname = htmlnamePop(ctxt);
828 if (oldname != NULL) {
829#ifdef DEBUG
830 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
831#endif
832 xmlFree(oldname);
833 }
834 }
835
836}
837
838/**
839 * htmlAutoCloseTag:
840 * @doc: the HTML document
841 * @name: The tag name
842 * @elem: the HTML element
843 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000844 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000845 * The list is kept in htmlStartClose array. This function checks
846 * if the element or one of it's children would autoclose the
847 * given tag.
848 *
849 * Returns 1 if autoclose, 0 otherwise
850 */
851int
852htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
853 htmlNodePtr child;
854
855 if (elem == NULL) return(1);
856 if (xmlStrEqual(name, elem->name)) return(0);
857 if (htmlCheckAutoClose(elem->name, name)) return(1);
858 child = elem->children;
859 while (child != NULL) {
860 if (htmlAutoCloseTag(doc, name, child)) return(1);
861 child = child->next;
862 }
863 return(0);
864}
865
866/**
867 * htmlIsAutoClosed:
868 * @doc: the HTML document
869 * @elem: the HTML element
870 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000871 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +0000872 * The list is kept in htmlStartClose array. This function checks
873 * if a tag is autoclosed by one of it's child
874 *
875 * Returns 1 if autoclosed, 0 otherwise
876 */
877int
878htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
879 htmlNodePtr child;
880
881 if (elem == NULL) return(1);
882 child = elem->children;
883 while (child != NULL) {
884 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
885 child = child->next;
886 }
887 return(0);
888}
889
890/**
891 * htmlCheckImplied:
892 * @ctxt: an HTML parser context
893 * @newtag: The new tag name
894 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000895 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +0000896 * called when a new tag has been detected and generates the
897 * appropriates implicit tags if missing
898 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000899static void
Owen Taylor3473f882001-02-23 17:55:21 +0000900htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
901 if (!htmlOmittedDefaultValue)
902 return;
903 if (xmlStrEqual(newtag, BAD_CAST"html"))
904 return;
905 if (ctxt->nameNr <= 0) {
906#ifdef DEBUG
907 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
908#endif
909 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
910 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
911 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
912 }
913 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
914 return;
915 if ((ctxt->nameNr <= 1) &&
916 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
917 (xmlStrEqual(newtag, BAD_CAST"style")) ||
918 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
919 (xmlStrEqual(newtag, BAD_CAST"link")) ||
920 (xmlStrEqual(newtag, BAD_CAST"title")) ||
921 (xmlStrEqual(newtag, BAD_CAST"base")))) {
922 /*
923 * dropped OBJECT ... i you put it first BODY will be
924 * assumed !
925 */
926#ifdef DEBUG
927 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
928#endif
929 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
930 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
931 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
932 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
933 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
934 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
935 int i;
936 for (i = 0;i < ctxt->nameNr;i++) {
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
938 return;
939 }
940 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
941 return;
942 }
943 }
944
945#ifdef DEBUG
946 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
947#endif
948 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
949 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
950 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
951 }
952}
953
954/**
955 * htmlCheckParagraph
956 * @ctxt: an HTML parser context
957 *
958 * Check whether a p element need to be implied before inserting
959 * characters in the current element.
960 *
961 * Returns 1 if a paragraph has been inserted, 0 if not and -1
962 * in case of error.
963 */
964
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000965static int
Owen Taylor3473f882001-02-23 17:55:21 +0000966htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
967 const xmlChar *tag;
968 int i;
969
970 if (ctxt == NULL)
971 return(-1);
972 tag = ctxt->name;
973 if (tag == NULL) {
974 htmlAutoClose(ctxt, BAD_CAST"p");
975 htmlCheckImplied(ctxt, BAD_CAST"p");
976 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
977 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
978 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
979 return(1);
980 }
981 if (!htmlOmittedDefaultValue)
982 return(0);
983 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
984 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
985#ifdef DEBUG
986 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
987#endif
988 htmlAutoClose(ctxt, BAD_CAST"p");
989 htmlCheckImplied(ctxt, BAD_CAST"p");
990 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
991 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
992 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
993 return(1);
994 }
995 }
996 return(0);
997}
998
999/**
1000 * htmlIsScriptAttribute:
1001 * @name: an attribute name
1002 *
1003 * Check if an attribute is of content type Script
1004 *
1005 * Returns 1 is the attribute is a script 0 otherwise
1006 */
1007int
1008htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001009 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001010
1011 if (name == NULL)
1012 return(0);
1013 /*
1014 * all script attributes start with 'on'
1015 */
1016 if ((name[0] != 'o') || (name[1] != 'n'))
1017 return(0);
1018 for (i = 0;
1019 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1020 i++) {
1021 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1022 return(1);
1023 }
1024 return(0);
1025}
1026
1027/************************************************************************
1028 * *
1029 * The list of HTML predefined entities *
1030 * *
1031 ************************************************************************/
1032
1033
Daniel Veillard22090732001-07-16 00:06:07 +00001034static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001035/*
1036 * the 4 absolute ones, plus apostrophe.
1037 */
1038{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1039{ 38, "amp", "ampersand, U+0026 ISOnum" },
1040{ 39, "apos", "single quote" },
1041{ 60, "lt", "less-than sign, U+003C ISOnum" },
1042{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1043
1044/*
1045 * A bunch still in the 128-255 range
1046 * Replacing them depend really on the charset used.
1047 */
1048{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1049{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1050{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1051{ 163, "pound","pound sign, U+00A3 ISOnum" },
1052{ 164, "curren","currency sign, U+00A4 ISOnum" },
1053{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1054{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1055{ 167, "sect", "section sign, U+00A7 ISOnum" },
1056{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1057{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1058{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1059{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1060{ 172, "not", "not sign, U+00AC ISOnum" },
1061{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1062{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1063{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1064{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1065{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1066{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1067{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1068{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1069{ 181, "micro","micro sign, U+00B5 ISOnum" },
1070{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1071{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1072{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1073{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1074{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1075{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1076{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1077{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1078{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1079{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1080{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1081{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1082{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1083{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1084{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1085{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1086{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1087{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1088{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1089{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1090{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1091{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1092{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1093{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1094{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1095{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1096{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1097{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1098{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1099{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1100{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1101{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1102{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1103{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1104{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1105{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1106{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1107{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1108{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1109{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1110{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1111{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1112{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1113{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1114{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1115{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1116{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1117{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1118{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1119{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1120{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1121{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1122{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1123{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1124{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1125{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1126{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1127{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1128{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1129{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1130{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1131{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1132{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1133{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1134{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1135{ 247, "divide","division sign, U+00F7 ISOnum" },
1136{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1137{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1138{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1139{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1140{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1141{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1142{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1143{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1144
1145{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1146{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1147{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1148{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1149{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1150
1151/*
1152 * Anything below should really be kept as entities references
1153 */
1154{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1155
1156{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1157{ 732, "tilde","small tilde, U+02DC ISOdia" },
1158
1159{ 913, "Alpha","greek capital letter alpha, U+0391" },
1160{ 914, "Beta", "greek capital letter beta, U+0392" },
1161{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1162{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1163{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1164{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1165{ 919, "Eta", "greek capital letter eta, U+0397" },
1166{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1167{ 921, "Iota", "greek capital letter iota, U+0399" },
1168{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001169{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001170{ 924, "Mu", "greek capital letter mu, U+039C" },
1171{ 925, "Nu", "greek capital letter nu, U+039D" },
1172{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1173{ 927, "Omicron","greek capital letter omicron, U+039F" },
1174{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1175{ 929, "Rho", "greek capital letter rho, U+03A1" },
1176{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1177{ 932, "Tau", "greek capital letter tau, U+03A4" },
1178{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1179{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1180{ 935, "Chi", "greek capital letter chi, U+03A7" },
1181{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1182{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1183
1184{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1185{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1186{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1187{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1188{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1189{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1190{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1191{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1192{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1193{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1194{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1195{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1196{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1197{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1198{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1199{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1200{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1201{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1202{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1203{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1204{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1205{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1206{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1207{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1208{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1209{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1210{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1211{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1212
1213{ 8194, "ensp", "en space, U+2002 ISOpub" },
1214{ 8195, "emsp", "em space, U+2003 ISOpub" },
1215{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1216{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1217{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1218{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1219{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1220{ 8211, "ndash","en dash, U+2013 ISOpub" },
1221{ 8212, "mdash","em dash, U+2014 ISOpub" },
1222{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1223{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1224{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1225{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1226{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1227{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1228{ 8224, "dagger","dagger, U+2020 ISOpub" },
1229{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1230
1231{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1232{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1233
1234{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1235
1236{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1237{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1238
1239{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1240{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1241
1242{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1243{ 8260, "frasl","fraction slash, U+2044 NEW" },
1244
1245{ 8364, "euro", "euro sign, U+20AC NEW" },
1246
1247{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1248{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1249{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1250{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1251{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1252{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1253{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1254{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1255{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1256{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1257{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1258{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1259{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1260{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1261{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1262{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1263
1264{ 8704, "forall","for all, U+2200 ISOtech" },
1265{ 8706, "part", "partial differential, U+2202 ISOtech" },
1266{ 8707, "exist","there exists, U+2203 ISOtech" },
1267{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1268{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1269{ 8712, "isin", "element of, U+2208 ISOtech" },
1270{ 8713, "notin","not an element of, U+2209 ISOtech" },
1271{ 8715, "ni", "contains as member, U+220B ISOtech" },
1272{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001273{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001274{ 8722, "minus","minus sign, U+2212 ISOtech" },
1275{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1276{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1277{ 8733, "prop", "proportional to, U+221D ISOtech" },
1278{ 8734, "infin","infinity, U+221E ISOtech" },
1279{ 8736, "ang", "angle, U+2220 ISOamso" },
1280{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1281{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1282{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1283{ 8746, "cup", "union = cup, U+222A ISOtech" },
1284{ 8747, "int", "integral, U+222B ISOtech" },
1285{ 8756, "there4","therefore, U+2234 ISOtech" },
1286{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1287{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1288{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1289{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1290{ 8801, "equiv","identical to, U+2261 ISOtech" },
1291{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1292{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1293{ 8834, "sub", "subset of, U+2282 ISOtech" },
1294{ 8835, "sup", "superset of, U+2283 ISOtech" },
1295{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1296{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1297{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1298{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1299{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1300{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1301{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1302{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1303{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1304{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1305{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1306{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1307{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1308{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1309
1310{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1311{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1312{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1313{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1314
1315};
1316
1317/************************************************************************
1318 * *
1319 * Commodity functions to handle entities *
1320 * *
1321 ************************************************************************/
1322
/*
 * Macro used to grow the current buffer.
 *
 * It expects two variables in the calling scope: the xmlChar pointer
 * @buffer and its companion size variable named <buffer>_size.  The
 * size is doubled and the block reallocated accordingly.
 *
 * On allocation failure an error is reported with perror(), the old
 * block is released (the result of xmlRealloc() is stored in a
 * temporary so that block is still reachable) and the *calling
 * function* returns NULL; it can therefore only be used in functions
 * returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
                              buffer##_size * sizeof(xmlChar));		\
    if (buffer##_tmp == NULL) {						\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1334
1335/**
1336 * htmlEntityLookup:
1337 * @name: the entity name
1338 *
1339 * Lookup the given entity in EntitiesTable
1340 *
1341 * TODO: the linear scan is really ugly, an hash table is really needed.
1342 *
1343 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1344 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001345const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001346htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001347 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001348
1349 for (i = 0;i < (sizeof(html40EntitiesTable)/
1350 sizeof(html40EntitiesTable[0]));i++) {
1351 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1352#ifdef DEBUG
1353 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1354#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001355 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001356 }
1357 }
1358 return(NULL);
1359}
1360
1361/**
1362 * htmlEntityValueLookup:
1363 * @value: the entity's unicode value
1364 *
1365 * Lookup the given entity in EntitiesTable
1366 *
1367 * TODO: the linear scan is really ugly, an hash table is really needed.
1368 *
1369 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1370 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001371const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001372htmlEntityValueLookup(unsigned int value) {
1373 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001374#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001375 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001376#endif
1377
1378 for (i = 0;i < (sizeof(html40EntitiesTable)/
1379 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001380 if (html40EntitiesTable[i].value >= value) {
1381 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001382 break;
1383#ifdef DEBUG
1384 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1385#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001386 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001387 }
1388#ifdef DEBUG
1389 if (lv > html40EntitiesTable[i].value) {
1390 xmlGenericError(xmlGenericErrorContext,
1391 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1392 lv, html40EntitiesTable[i].value);
1393 }
1394 lv = html40EntitiesTable[i].value;
1395#endif
1396 }
1397 return(NULL);
1398}
1399
1400/**
1401 * UTF8ToHtml:
1402 * @out: a pointer to an array of bytes to store the result
1403 * @outlen: the length of @out
1404 * @in: a pointer to an array of UTF-8 chars
1405 * @inlen: the length of @in
1406 *
1407 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1408 * plus HTML entities block of chars out.
1409 *
1410 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1411 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001412 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001413 * The value of @outlen after return is the number of octets consumed.
1414 */
1415int
1416UTF8ToHtml(unsigned char* out, int *outlen,
1417 const unsigned char* in, int *inlen) {
1418 const unsigned char* processed = in;
1419 const unsigned char* outend;
1420 const unsigned char* outstart = out;
1421 const unsigned char* instart = in;
1422 const unsigned char* inend;
1423 unsigned int c, d;
1424 int trailing;
1425
1426 if (in == NULL) {
1427 /*
1428 * initialization nothing to do
1429 */
1430 *outlen = 0;
1431 *inlen = 0;
1432 return(0);
1433 }
1434 inend = in + (*inlen);
1435 outend = out + (*outlen);
1436 while (in < inend) {
1437 d = *in++;
1438 if (d < 0x80) { c= d; trailing= 0; }
1439 else if (d < 0xC0) {
1440 /* trailing byte in leading position */
1441 *outlen = out - outstart;
1442 *inlen = processed - instart;
1443 return(-2);
1444 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1445 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1446 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1447 else {
1448 /* no chance for this in Ascii */
1449 *outlen = out - outstart;
1450 *inlen = processed - instart;
1451 return(-2);
1452 }
1453
1454 if (inend - in < trailing) {
1455 break;
1456 }
1457
1458 for ( ; trailing; trailing--) {
1459 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1460 break;
1461 c <<= 6;
1462 c |= d & 0x3F;
1463 }
1464
1465 /* assertion: c is a single UTF-4 value */
1466 if (c < 0x80) {
1467 if (out + 1 >= outend)
1468 break;
1469 *out++ = c;
1470 } else {
1471 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001472 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001473
1474 /*
1475 * Try to lookup a predefined HTML entity for it
1476 */
1477
1478 ent = htmlEntityValueLookup(c);
1479 if (ent == NULL) {
1480 /* no chance for this in Ascii */
1481 *outlen = out - outstart;
1482 *inlen = processed - instart;
1483 return(-2);
1484 }
1485 len = strlen(ent->name);
1486 if (out + 2 + len >= outend)
1487 break;
1488 *out++ = '&';
1489 memcpy(out, ent->name, len);
1490 out += len;
1491 *out++ = ';';
1492 }
1493 processed = in;
1494 }
1495 *outlen = out - outstart;
1496 *inlen = processed - instart;
1497 return(0);
1498}
1499
1500/**
1501 * htmlEncodeEntities:
1502 * @out: a pointer to an array of bytes to store the result
1503 * @outlen: the length of @out
1504 * @in: a pointer to an array of UTF-8 chars
1505 * @inlen: the length of @in
1506 * @quoteChar: the quote character to escape (' or ") or zero.
1507 *
1508 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1509 * plus HTML entities block of chars out.
1510 *
1511 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1512 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001513 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001514 * The value of @outlen after return is the number of octets consumed.
1515 */
1516int
1517htmlEncodeEntities(unsigned char* out, int *outlen,
1518 const unsigned char* in, int *inlen, int quoteChar) {
1519 const unsigned char* processed = in;
1520 const unsigned char* outend = out + (*outlen);
1521 const unsigned char* outstart = out;
1522 const unsigned char* instart = in;
1523 const unsigned char* inend = in + (*inlen);
1524 unsigned int c, d;
1525 int trailing;
1526
1527 while (in < inend) {
1528 d = *in++;
1529 if (d < 0x80) { c= d; trailing= 0; }
1530 else if (d < 0xC0) {
1531 /* trailing byte in leading position */
1532 *outlen = out - outstart;
1533 *inlen = processed - instart;
1534 return(-2);
1535 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1536 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1537 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1538 else {
1539 /* no chance for this in Ascii */
1540 *outlen = out - outstart;
1541 *inlen = processed - instart;
1542 return(-2);
1543 }
1544
1545 if (inend - in < trailing)
1546 break;
1547
1548 while (trailing--) {
1549 if (((d= *in++) & 0xC0) != 0x80) {
1550 *outlen = out - outstart;
1551 *inlen = processed - instart;
1552 return(-2);
1553 }
1554 c <<= 6;
1555 c |= d & 0x3F;
1556 }
1557
1558 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001559 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1560 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001561 if (out >= outend)
1562 break;
1563 *out++ = c;
1564 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001565 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001566 const char *cp;
1567 char nbuf[16];
1568 int len;
1569
1570 /*
1571 * Try to lookup a predefined HTML entity for it
1572 */
1573 ent = htmlEntityValueLookup(c);
1574 if (ent == NULL) {
1575 sprintf(nbuf, "#%u", c);
1576 cp = nbuf;
1577 }
1578 else
1579 cp = ent->name;
1580 len = strlen(cp);
1581 if (out + 2 + len > outend)
1582 break;
1583 *out++ = '&';
1584 memcpy(out, cp, len);
1585 out += len;
1586 *out++ = ';';
1587 }
1588 processed = in;
1589 }
1590 *outlen = out - outstart;
1591 *inlen = processed - instart;
1592 return(0);
1593}
1594
1595/**
1596 * htmlDecodeEntities:
1597 * @ctxt: the parser context
1598 * @len: the len to decode (in bytes !), -1 for no size limit
1599 * @end: an end marker xmlChar, 0 if none
1600 * @end2: an end marker xmlChar, 0 if none
1601 * @end3: an end marker xmlChar, 0 if none
1602 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001603 * Substitute the HTML entities by their value
Owen Taylor3473f882001-02-23 17:55:21 +00001604 *
1605 * DEPRECATED !!!!
1606 *
1607 * Returns A newly allocated string with the substitution done. The caller
1608 * must deallocate it !
1609 */
1610xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001611htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1612 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001613 static int deprecated = 0;
1614 if (!deprecated) {
1615 xmlGenericError(xmlGenericErrorContext,
1616 "htmlDecodeEntities() deprecated function reached\n");
1617 deprecated = 1;
1618 }
1619 return(NULL);
1620#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001621 xmlChar *name = NULL;
1622 xmlChar *buffer = NULL;
1623 unsigned int buffer_size = 0;
1624 unsigned int nbchars = 0;
1625 htmlEntityDescPtr ent;
1626 unsigned int max = (unsigned int) len;
1627 int c,l;
1628
1629 if (ctxt->depth > 40) {
1630 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1631 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1632 ctxt->sax->error(ctxt->userData,
1633 "Detected entity reference loop\n");
1634 ctxt->wellFormed = 0;
1635 ctxt->disableSAX = 1;
1636 return(NULL);
1637 }
1638
1639 /*
1640 * allocate a translation buffer.
1641 */
1642 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1643 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1644 if (buffer == NULL) {
1645 perror("xmlDecodeEntities: malloc failed");
1646 return(NULL);
1647 }
1648
1649 /*
1650 * Ok loop until we reach one of the ending char or a size limit.
1651 */
1652 c = CUR_CHAR(l);
1653 while ((nbchars < max) && (c != end) &&
1654 (c != end2) && (c != end3)) {
1655
1656 if (c == 0) break;
1657 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1658 int val = htmlParseCharRef(ctxt);
1659 COPY_BUF(0,buffer,nbchars,val);
1660 NEXTL(l);
1661 } else if ((c == '&') && (ctxt->token != '&')) {
1662 ent = htmlParseEntityRef(ctxt, &name);
1663 if (name != NULL) {
1664 if (ent != NULL) {
1665 int val = ent->value;
1666 COPY_BUF(0,buffer,nbchars,val);
1667 NEXTL(l);
1668 } else {
1669 const xmlChar *cur = name;
1670
1671 buffer[nbchars++] = '&';
1672 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1673 growBuffer(buffer);
1674 }
1675 while (*cur != 0) {
1676 buffer[nbchars++] = *cur++;
1677 }
1678 buffer[nbchars++] = ';';
1679 }
1680 }
1681 } else {
1682 COPY_BUF(l,buffer,nbchars,c);
1683 NEXTL(l);
1684 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1685 growBuffer(buffer);
1686 }
1687 }
1688 c = CUR_CHAR(l);
1689 }
1690 buffer[nbchars++] = 0;
1691 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001692#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001693}
1694
1695/************************************************************************
1696 * *
1697 * Commodity functions to handle streams *
1698 * *
1699 ************************************************************************/
1700
1701/**
Owen Taylor3473f882001-02-23 17:55:21 +00001702 * htmlNewInputStream:
1703 * @ctxt: an HTML parser context
1704 *
1705 * Create a new input stream structure
1706 * Returns the new input stream or NULL
1707 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001708static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001709htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1710 htmlParserInputPtr input;
1711
1712 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1713 if (input == NULL) {
1714 ctxt->errNo = XML_ERR_NO_MEMORY;
1715 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1716 ctxt->sax->error(ctxt->userData,
1717 "malloc: couldn't allocate a new input stream\n");
1718 return(NULL);
1719 }
1720 memset(input, 0, sizeof(htmlParserInput));
1721 input->filename = NULL;
1722 input->directory = NULL;
1723 input->base = NULL;
1724 input->cur = NULL;
1725 input->buf = NULL;
1726 input->line = 1;
1727 input->col = 1;
1728 input->buf = NULL;
1729 input->free = NULL;
1730 input->version = NULL;
1731 input->consumed = 0;
1732 input->length = 0;
1733 return(input);
1734}
1735
1736
1737/************************************************************************
1738 * *
1739 * Commodity functions, cleanup needed ? *
1740 * *
1741 ************************************************************************/
1742
1743/**
1744 * areBlanks:
1745 * @ctxt: an HTML parser context
1746 * @str: a xmlChar *
1747 * @len: the size of @str
1748 *
1749 * Is this a sequence of blank chars that one can ignore ?
1750 *
1751 * Returns 1 if ignorable 0 otherwise.
1752 */
1753
1754static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1755 int i;
1756 xmlNodePtr lastChild;
1757
1758 for (i = 0;i < len;i++)
1759 if (!(IS_BLANK(str[i]))) return(0);
1760
1761 if (CUR == 0) return(1);
1762 if (CUR != '<') return(0);
1763 if (ctxt->name == NULL)
1764 return(1);
1765 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1766 return(1);
1767 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1768 return(1);
1769 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1770 return(1);
1771 if (ctxt->node == NULL) return(0);
1772 lastChild = xmlGetLastChild(ctxt->node);
1773 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001774 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1775 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001776 } else if (xmlNodeIsText(lastChild)) {
1777 return(0);
1778 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1779 return(0);
1780 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1781 return(0);
1782 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1783 return(0);
1784 }
1785 return(1);
1786}
1787
1788/**
Owen Taylor3473f882001-02-23 17:55:21 +00001789 * htmlNewDocNoDtD:
1790 * @URI: URI for the dtd, or NULL
1791 * @ExternalID: the external ID of the DTD, or NULL
1792 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001793 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1794 * are NULL
1795 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001796 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00001797 */
1798htmlDocPtr
1799htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1800 xmlDocPtr cur;
1801
1802 /*
1803 * Allocate a new document and fill the fields.
1804 */
1805 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1806 if (cur == NULL) {
1807 xmlGenericError(xmlGenericErrorContext,
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001808 "htmlNewDocNoDtD : malloc failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001809 return(NULL);
1810 }
1811 memset(cur, 0, sizeof(xmlDoc));
1812
1813 cur->type = XML_HTML_DOCUMENT_NODE;
1814 cur->version = NULL;
1815 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001816 cur->doc = cur;
1817 cur->name = NULL;
1818 cur->children = NULL;
1819 cur->extSubset = NULL;
1820 cur->oldNs = NULL;
1821 cur->encoding = NULL;
1822 cur->standalone = 1;
1823 cur->compression = 0;
1824 cur->ids = NULL;
1825 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001826 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001827 if ((ExternalID != NULL) ||
1828 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001829 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001830 return(cur);
1831}
1832
1833/**
1834 * htmlNewDoc:
1835 * @URI: URI for the dtd, or NULL
1836 * @ExternalID: the external ID of the DTD, or NULL
1837 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001838 * Creates a new HTML document
1839 *
Owen Taylor3473f882001-02-23 17:55:21 +00001840 * Returns a new document
1841 */
1842htmlDocPtr
1843htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1844 if ((URI == NULL) && (ExternalID == NULL))
1845 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001846 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1847 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001848
1849 return(htmlNewDocNoDtD(URI, ExternalID));
1850}
1851
1852
1853/************************************************************************
1854 * *
1855 * The parser itself *
1856 * Relates to http://www.w3.org/TR/html40 *
1857 * *
1858 ************************************************************************/
1859
1860/************************************************************************
1861 * *
1862 * The parser itself *
1863 * *
1864 ************************************************************************/
1865
1866/**
1867 * htmlParseHTMLName:
1868 * @ctxt: an HTML parser context
1869 *
1870 * parse an HTML tag or attribute name, note that we convert it to lowercase
1871 * since HTML names are not case-sensitive.
1872 *
1873 * Returns the Tag Name parsed or NULL
1874 */
1875
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001876static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001877htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1878 xmlChar *ret = NULL;
1879 int i = 0;
1880 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1881
1882 if (!IS_LETTER(CUR) && (CUR != '_') &&
1883 (CUR != ':')) return(NULL);
1884
1885 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1886 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1887 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1888 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1889 else loc[i] = CUR;
1890 i++;
1891
1892 NEXT;
1893 }
1894
1895 ret = xmlStrndup(loc, i);
1896
1897 return(ret);
1898}
1899
1900/**
1901 * htmlParseName:
1902 * @ctxt: an HTML parser context
1903 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001904 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00001905 *
1906 * Returns the Name parsed or NULL
1907 */
1908
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001909static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001910htmlParseName(htmlParserCtxtPtr ctxt) {
1911 xmlChar buf[HTML_MAX_NAMELEN];
1912 int len = 0;
1913
1914 GROW;
1915 if (!IS_LETTER(CUR) && (CUR != '_')) {
1916 return(NULL);
1917 }
1918
1919 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1920 (CUR == '.') || (CUR == '-') ||
1921 (CUR == '_') || (CUR == ':') ||
1922 (IS_COMBINING(CUR)) ||
1923 (IS_EXTENDER(CUR))) {
1924 buf[len++] = CUR;
1925 NEXT;
1926 if (len >= HTML_MAX_NAMELEN) {
1927 xmlGenericError(xmlGenericErrorContext,
1928 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1929 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1930 (CUR == '.') || (CUR == '-') ||
1931 (CUR == '_') || (CUR == ':') ||
1932 (IS_COMBINING(CUR)) ||
1933 (IS_EXTENDER(CUR)))
1934 NEXT;
1935 break;
1936 }
1937 }
1938 return(xmlStrndup(buf, len));
1939}
1940
1941/**
1942 * htmlParseHTMLAttribute:
1943 * @ctxt: an HTML parser context
1944 * @stop: a char stop value
1945 *
1946 * parse an HTML attribute value till the stop (quote), if
1947 * stop is 0 then it stops at the first space
1948 *
1949 * Returns the attribute parsed or NULL
1950 */
1951
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001952static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001953htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1954 xmlChar *buffer = NULL;
1955 int buffer_size = 0;
1956 xmlChar *out = NULL;
1957 xmlChar *name = NULL;
1958
1959 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001960 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001961
1962 /*
1963 * allocate a translation buffer.
1964 */
1965 buffer_size = HTML_PARSER_BUFFER_SIZE;
1966 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1967 if (buffer == NULL) {
1968 perror("htmlParseHTMLAttribute: malloc failed");
1969 return(NULL);
1970 }
1971 out = buffer;
1972
1973 /*
1974 * Ok loop until we reach one of the ending chars
1975 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00001976 while ((CUR != 0) && (CUR != stop)) {
1977 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001978 if ((stop == 0) && (IS_BLANK(CUR))) break;
1979 if (CUR == '&') {
1980 if (NXT(1) == '#') {
1981 unsigned int c;
1982 int bits;
1983
1984 c = htmlParseCharRef(ctxt);
1985 if (c < 0x80)
1986 { *out++ = c; bits= -6; }
1987 else if (c < 0x800)
1988 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1989 else if (c < 0x10000)
1990 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1991 else
1992 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1993
1994 for ( ; bits >= 0; bits-= 6) {
1995 *out++ = ((c >> bits) & 0x3F) | 0x80;
1996 }
1997 } else {
1998 ent = htmlParseEntityRef(ctxt, &name);
1999 if (name == NULL) {
2000 *out++ = '&';
2001 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002002 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002003
2004 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002005 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002006 }
2007 } else if (ent == NULL) {
2008 *out++ = '&';
2009 cur = name;
2010 while (*cur != 0) {
2011 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002012 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002013
2014 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002015 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002016 }
2017 *out++ = *cur++;
2018 }
2019 xmlFree(name);
2020 } else {
2021 unsigned int c;
2022 int bits;
2023
2024 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002025 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002026
2027 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002028 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002029 }
2030 c = (xmlChar)ent->value;
2031 if (c < 0x80)
2032 { *out++ = c; bits= -6; }
2033 else if (c < 0x800)
2034 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2035 else if (c < 0x10000)
2036 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2037 else
2038 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2039
2040 for ( ; bits >= 0; bits-= 6) {
2041 *out++ = ((c >> bits) & 0x3F) | 0x80;
2042 }
2043 xmlFree(name);
2044 }
2045 }
2046 } else {
2047 unsigned int c;
2048 int bits, l;
2049
2050 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002051 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002052
2053 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002054 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002055 }
2056 c = CUR_CHAR(l);
2057 if (c < 0x80)
2058 { *out++ = c; bits= -6; }
2059 else if (c < 0x800)
2060 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2061 else if (c < 0x10000)
2062 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2063 else
2064 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2065
2066 for ( ; bits >= 0; bits-= 6) {
2067 *out++ = ((c >> bits) & 0x3F) | 0x80;
2068 }
2069 NEXT;
2070 }
2071 }
2072 *out++ = 0;
2073 return(buffer);
2074}
2075
2076/**
Owen Taylor3473f882001-02-23 17:55:21 +00002077 * htmlParseEntityRef:
2078 * @ctxt: an HTML parser context
2079 * @str: location to store the entity name
2080 *
2081 * parse an HTML ENTITY references
2082 *
2083 * [68] EntityRef ::= '&' Name ';'
2084 *
2085 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2086 * if non-NULL *str will have to be freed by the caller.
2087 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002088const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002089htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2090 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002091 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002092 *str = NULL;
2093
2094 if (CUR == '&') {
2095 NEXT;
2096 name = htmlParseName(ctxt);
2097 if (name == NULL) {
2098 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2099 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2100 ctxt->wellFormed = 0;
2101 } else {
2102 GROW;
2103 if (CUR == ';') {
2104 *str = name;
2105
2106 /*
2107 * Lookup the entity in the table.
2108 */
2109 ent = htmlEntityLookup(name);
2110 if (ent != NULL) /* OK that's ugly !!! */
2111 NEXT;
2112 } else {
2113 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2114 ctxt->sax->error(ctxt->userData,
2115 "htmlParseEntityRef: expecting ';'\n");
2116 *str = name;
2117 }
2118 }
2119 }
2120 return(ent);
2121}
2122
2123/**
2124 * htmlParseAttValue:
2125 * @ctxt: an HTML parser context
2126 *
2127 * parse a value for an attribute
2128 * Note: the parser won't do substitution of entities here, this
2129 * will be handled later in xmlStringGetNodeList, unless it was
2130 * asked for ctxt->replaceEntities != 0
2131 *
2132 * Returns the AttValue parsed or NULL.
2133 */
2134
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002135static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002136htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2137 xmlChar *ret = NULL;
2138
2139 if (CUR == '"') {
2140 NEXT;
2141 ret = htmlParseHTMLAttribute(ctxt, '"');
2142 if (CUR != '"') {
2143 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2144 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2145 ctxt->wellFormed = 0;
2146 } else
2147 NEXT;
2148 } else if (CUR == '\'') {
2149 NEXT;
2150 ret = htmlParseHTMLAttribute(ctxt, '\'');
2151 if (CUR != '\'') {
2152 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2153 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2154 ctxt->wellFormed = 0;
2155 } else
2156 NEXT;
2157 } else {
2158 /*
2159 * That's an HTMLism, the attribute value may not be quoted
2160 */
2161 ret = htmlParseHTMLAttribute(ctxt, 0);
2162 if (ret == NULL) {
2163 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2164 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2165 ctxt->wellFormed = 0;
2166 }
2167 }
2168 return(ret);
2169}
2170
2171/**
2172 * htmlParseSystemLiteral:
2173 * @ctxt: an HTML parser context
2174 *
2175 * parse an HTML Literal
2176 *
2177 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2178 *
2179 * Returns the SystemLiteral parsed or NULL
2180 */
2181
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002182static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002183htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2184 const xmlChar *q;
2185 xmlChar *ret = NULL;
2186
2187 if (CUR == '"') {
2188 NEXT;
2189 q = CUR_PTR;
2190 while ((IS_CHAR(CUR)) && (CUR != '"'))
2191 NEXT;
2192 if (!IS_CHAR(CUR)) {
2193 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2194 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2195 ctxt->wellFormed = 0;
2196 } else {
2197 ret = xmlStrndup(q, CUR_PTR - q);
2198 NEXT;
2199 }
2200 } else if (CUR == '\'') {
2201 NEXT;
2202 q = CUR_PTR;
2203 while ((IS_CHAR(CUR)) && (CUR != '\''))
2204 NEXT;
2205 if (!IS_CHAR(CUR)) {
2206 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2207 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2208 ctxt->wellFormed = 0;
2209 } else {
2210 ret = xmlStrndup(q, CUR_PTR - q);
2211 NEXT;
2212 }
2213 } else {
2214 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2215 ctxt->sax->error(ctxt->userData,
2216 "SystemLiteral \" or ' expected\n");
2217 ctxt->wellFormed = 0;
2218 }
2219
2220 return(ret);
2221}
2222
2223/**
2224 * htmlParsePubidLiteral:
2225 * @ctxt: an HTML parser context
2226 *
2227 * parse an HTML public literal
2228 *
2229 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2230 *
2231 * Returns the PubidLiteral parsed or NULL.
2232 */
2233
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002234static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002235htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2236 const xmlChar *q;
2237 xmlChar *ret = NULL;
2238 /*
2239 * Name ::= (Letter | '_') (NameChar)*
2240 */
2241 if (CUR == '"') {
2242 NEXT;
2243 q = CUR_PTR;
2244 while (IS_PUBIDCHAR(CUR)) NEXT;
2245 if (CUR != '"') {
2246 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2247 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2248 ctxt->wellFormed = 0;
2249 } else {
2250 ret = xmlStrndup(q, CUR_PTR - q);
2251 NEXT;
2252 }
2253 } else if (CUR == '\'') {
2254 NEXT;
2255 q = CUR_PTR;
2256 while ((IS_LETTER(CUR)) && (CUR != '\''))
2257 NEXT;
2258 if (!IS_LETTER(CUR)) {
2259 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2260 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2261 ctxt->wellFormed = 0;
2262 } else {
2263 ret = xmlStrndup(q, CUR_PTR - q);
2264 NEXT;
2265 }
2266 } else {
2267 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2268 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2269 ctxt->wellFormed = 0;
2270 }
2271
2272 return(ret);
2273}
2274
2275/**
2276 * htmlParseScript:
2277 * @ctxt: an HTML parser context
2278 *
2279 * parse the content of an HTML SCRIPT or STYLE element
2280 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2281 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2282 * http://www.w3.org/TR/html4/types.html#type-script
2283 * http://www.w3.org/TR/html4/types.html#h-6.15
2284 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2285 *
2286 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2287 * element and the value of intrinsic event attributes. User agents must
2288 * not evaluate script data as HTML markup but instead must pass it on as
2289 * data to a script engine.
2290 * NOTES:
2291 * - The content is passed like CDATA
2292 * - the attributes for style and scripting "onXXX" are also described
2293 * as CDATA but SGML allows entities references in attributes so their
2294 * processing is identical as other attributes
2295 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002296static void
Owen Taylor3473f882001-02-23 17:55:21 +00002297htmlParseScript(htmlParserCtxtPtr ctxt) {
2298 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2299 int nbchar = 0;
2300 xmlChar cur;
2301
2302 SHRINK;
2303 cur = CUR;
2304 while (IS_CHAR(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002305 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2306 (NXT(3) == '-')) {
2307 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2308 if (ctxt->sax->cdataBlock!= NULL) {
2309 /*
2310 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2311 */
2312 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2313 }
2314 }
2315 nbchar = 0;
2316 htmlParseComment(ctxt);
2317 cur = CUR;
2318 continue;
2319 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002320 /*
2321 * One should break here, the specification is clear:
2322 * Authors should therefore escape "</" within the content.
2323 * Escape mechanisms are specific to each scripting or
2324 * style sheet language.
2325 */
2326 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2327 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2328 break; /* while */
2329 }
2330 buf[nbchar++] = cur;
2331 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2332 if (ctxt->sax->cdataBlock!= NULL) {
2333 /*
2334 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2335 */
2336 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2337 }
2338 nbchar = 0;
2339 }
2340 NEXT;
2341 cur = CUR;
2342 }
2343 if (!(IS_CHAR(cur))) {
2344 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2345 ctxt->sax->error(ctxt->userData,
2346 "Invalid char in CDATA 0x%X\n", cur);
2347 ctxt->wellFormed = 0;
2348 NEXT;
2349 }
2350
2351 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2352 if (ctxt->sax->cdataBlock!= NULL) {
2353 /*
2354 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2355 */
2356 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2357 }
2358 }
2359}
2360
2361
2362/**
2363 * htmlParseCharData:
2364 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002365 *
2366 * parse a CharData section.
2367 * if we are within a CDATA section ']]>' marks an end of section.
2368 *
2369 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2370 */
2371
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002372static void
2373htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002374 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2375 int nbchar = 0;
2376 int cur, l;
2377
2378 SHRINK;
2379 cur = CUR_CHAR(l);
2380 while (((cur != '<') || (ctxt->token == '<')) &&
2381 ((cur != '&') || (ctxt->token == '&')) &&
2382 (IS_CHAR(cur))) {
2383 COPY_BUF(l,buf,nbchar,cur);
2384 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2385 /*
2386 * Ok the segment is to be consumed as chars.
2387 */
2388 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2389 if (areBlanks(ctxt, buf, nbchar)) {
2390 if (ctxt->sax->ignorableWhitespace != NULL)
2391 ctxt->sax->ignorableWhitespace(ctxt->userData,
2392 buf, nbchar);
2393 } else {
2394 htmlCheckParagraph(ctxt);
2395 if (ctxt->sax->characters != NULL)
2396 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2397 }
2398 }
2399 nbchar = 0;
2400 }
2401 NEXTL(l);
2402 cur = CUR_CHAR(l);
2403 }
2404 if (nbchar != 0) {
2405 /*
2406 * Ok the segment is to be consumed as chars.
2407 */
2408 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2409 if (areBlanks(ctxt, buf, nbchar)) {
2410 if (ctxt->sax->ignorableWhitespace != NULL)
2411 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2412 } else {
2413 htmlCheckParagraph(ctxt);
2414 if (ctxt->sax->characters != NULL)
2415 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2416 }
2417 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002418 } else {
2419 /*
2420 * Loop detection
2421 */
2422 if (cur == 0)
2423 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002424 }
2425}
2426
2427/**
2428 * htmlParseExternalID:
2429 * @ctxt: an HTML parser context
2430 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002431 *
2432 * Parse an External ID or a Public ID
2433 *
Owen Taylor3473f882001-02-23 17:55:21 +00002434 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2435 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2436 *
2437 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2438 *
2439 * Returns the function returns SystemLiteral and in the second
2440 * case publicID receives PubidLiteral, is strict is off
2441 * it is possible to return NULL and have publicID set.
2442 */
2443
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002444static xmlChar *
2445htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002446 xmlChar *URI = NULL;
2447
2448 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2449 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2450 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2451 SKIP(6);
2452 if (!IS_BLANK(CUR)) {
2453 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2454 ctxt->sax->error(ctxt->userData,
2455 "Space required after 'SYSTEM'\n");
2456 ctxt->wellFormed = 0;
2457 }
2458 SKIP_BLANKS;
2459 URI = htmlParseSystemLiteral(ctxt);
2460 if (URI == NULL) {
2461 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2462 ctxt->sax->error(ctxt->userData,
2463 "htmlParseExternalID: SYSTEM, no URI\n");
2464 ctxt->wellFormed = 0;
2465 }
2466 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2467 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2468 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2469 SKIP(6);
2470 if (!IS_BLANK(CUR)) {
2471 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2472 ctxt->sax->error(ctxt->userData,
2473 "Space required after 'PUBLIC'\n");
2474 ctxt->wellFormed = 0;
2475 }
2476 SKIP_BLANKS;
2477 *publicID = htmlParsePubidLiteral(ctxt);
2478 if (*publicID == NULL) {
2479 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2480 ctxt->sax->error(ctxt->userData,
2481 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2482 ctxt->wellFormed = 0;
2483 }
2484 SKIP_BLANKS;
2485 if ((CUR == '"') || (CUR == '\'')) {
2486 URI = htmlParseSystemLiteral(ctxt);
2487 }
2488 }
2489 return(URI);
2490}
2491
2492/**
2493 * htmlParseComment:
2494 * @ctxt: an HTML parser context
2495 *
2496 * Parse an XML (SGML) comment <!-- .... -->
2497 *
2498 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2499 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002500static void
Owen Taylor3473f882001-02-23 17:55:21 +00002501htmlParseComment(htmlParserCtxtPtr ctxt) {
2502 xmlChar *buf = NULL;
2503 int len;
2504 int size = HTML_PARSER_BUFFER_SIZE;
2505 int q, ql;
2506 int r, rl;
2507 int cur, l;
2508 xmlParserInputState state;
2509
2510 /*
2511 * Check that there is a comment right here.
2512 */
2513 if ((RAW != '<') || (NXT(1) != '!') ||
2514 (NXT(2) != '-') || (NXT(3) != '-')) return;
2515
2516 state = ctxt->instate;
2517 ctxt->instate = XML_PARSER_COMMENT;
2518 SHRINK;
2519 SKIP(4);
2520 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2521 if (buf == NULL) {
2522 xmlGenericError(xmlGenericErrorContext,
2523 "malloc of %d byte failed\n", size);
2524 ctxt->instate = state;
2525 return;
2526 }
2527 q = CUR_CHAR(ql);
2528 NEXTL(ql);
2529 r = CUR_CHAR(rl);
2530 NEXTL(rl);
2531 cur = CUR_CHAR(l);
2532 len = 0;
2533 while (IS_CHAR(cur) &&
2534 ((cur != '>') ||
2535 (r != '-') || (q != '-'))) {
2536 if (len + 5 >= size) {
2537 size *= 2;
2538 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2539 if (buf == NULL) {
2540 xmlGenericError(xmlGenericErrorContext,
2541 "realloc of %d byte failed\n", size);
2542 ctxt->instate = state;
2543 return;
2544 }
2545 }
2546 COPY_BUF(ql,buf,len,q);
2547 q = r;
2548 ql = rl;
2549 r = cur;
2550 rl = l;
2551 NEXTL(l);
2552 cur = CUR_CHAR(l);
2553 if (cur == 0) {
2554 SHRINK;
2555 GROW;
2556 cur = CUR_CHAR(l);
2557 }
2558 }
2559 buf[len] = 0;
2560 if (!IS_CHAR(cur)) {
2561 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2562 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2563 ctxt->sax->error(ctxt->userData,
2564 "Comment not terminated \n<!--%.50s\n", buf);
2565 ctxt->wellFormed = 0;
2566 xmlFree(buf);
2567 } else {
2568 NEXT;
2569 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2570 (!ctxt->disableSAX))
2571 ctxt->sax->comment(ctxt->userData, buf);
2572 xmlFree(buf);
2573 }
2574 ctxt->instate = state;
2575}
2576
2577/**
2578 * htmlParseCharRef:
2579 * @ctxt: an HTML parser context
2580 *
2581 * parse Reference declarations
2582 *
2583 * [66] CharRef ::= '&#' [0-9]+ ';' |
2584 * '&#x' [0-9a-fA-F]+ ';'
2585 *
2586 * Returns the value parsed (as an int)
2587 */
2588int
2589htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2590 int val = 0;
2591
2592 if ((CUR == '&') && (NXT(1) == '#') &&
2593 (NXT(2) == 'x')) {
2594 SKIP(3);
2595 while (CUR != ';') {
2596 if ((CUR >= '0') && (CUR <= '9'))
2597 val = val * 16 + (CUR - '0');
2598 else if ((CUR >= 'a') && (CUR <= 'f'))
2599 val = val * 16 + (CUR - 'a') + 10;
2600 else if ((CUR >= 'A') && (CUR <= 'F'))
2601 val = val * 16 + (CUR - 'A') + 10;
2602 else {
2603 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2604 ctxt->sax->error(ctxt->userData,
2605 "htmlParseCharRef: invalid hexadecimal value\n");
2606 ctxt->wellFormed = 0;
2607 return(0);
2608 }
2609 NEXT;
2610 }
2611 if (CUR == ';')
2612 NEXT;
2613 } else if ((CUR == '&') && (NXT(1) == '#')) {
2614 SKIP(2);
2615 while (CUR != ';') {
2616 if ((CUR >= '0') && (CUR <= '9'))
2617 val = val * 10 + (CUR - '0');
2618 else {
2619 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2620 ctxt->sax->error(ctxt->userData,
2621 "htmlParseCharRef: invalid decimal value\n");
2622 ctxt->wellFormed = 0;
2623 return(0);
2624 }
2625 NEXT;
2626 }
2627 if (CUR == ';')
2628 NEXT;
2629 } else {
2630 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2631 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2632 ctxt->wellFormed = 0;
2633 }
2634 /*
2635 * Check the value IS_CHAR ...
2636 */
2637 if (IS_CHAR(val)) {
2638 return(val);
2639 } else {
2640 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2641 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2642 val);
2643 ctxt->wellFormed = 0;
2644 }
2645 return(0);
2646}
2647
2648
2649/**
2650 * htmlParseDocTypeDecl :
2651 * @ctxt: an HTML parser context
2652 *
2653 * parse a DOCTYPE declaration
2654 *
2655 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2656 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2657 */
2658
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002659static void
Owen Taylor3473f882001-02-23 17:55:21 +00002660htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2661 xmlChar *name;
2662 xmlChar *ExternalID = NULL;
2663 xmlChar *URI = NULL;
2664
2665 /*
2666 * We know that '<!DOCTYPE' has been detected.
2667 */
2668 SKIP(9);
2669
2670 SKIP_BLANKS;
2671
2672 /*
2673 * Parse the DOCTYPE name.
2674 */
2675 name = htmlParseName(ctxt);
2676 if (name == NULL) {
2677 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2678 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2679 ctxt->wellFormed = 0;
2680 }
2681 /*
2682 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2683 */
2684
2685 SKIP_BLANKS;
2686
2687 /*
2688 * Check for SystemID and ExternalID
2689 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002690 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002691 SKIP_BLANKS;
2692
2693 /*
2694 * We should be at the end of the DOCTYPE declaration.
2695 */
2696 if (CUR != '>') {
2697 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002698 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002699 ctxt->wellFormed = 0;
2700 /* We shouldn't try to resynchronize ... */
2701 }
2702 NEXT;
2703
2704 /*
2705 * Create or update the document accordingly to the DOCTYPE
2706 */
2707 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2708 (!ctxt->disableSAX))
2709 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2710
2711 /*
2712 * Cleanup, since we don't use all those identifiers
2713 */
2714 if (URI != NULL) xmlFree(URI);
2715 if (ExternalID != NULL) xmlFree(ExternalID);
2716 if (name != NULL) xmlFree(name);
2717}
2718
2719/**
2720 * htmlParseAttribute:
2721 * @ctxt: an HTML parser context
2722 * @value: a xmlChar ** used to store the value of the attribute
2723 *
2724 * parse an attribute
2725 *
2726 * [41] Attribute ::= Name Eq AttValue
2727 *
2728 * [25] Eq ::= S? '=' S?
2729 *
2730 * With namespace:
2731 *
2732 * [NS 11] Attribute ::= QName Eq AttValue
2733 *
2734 * Also the case QName == xmlns:??? is handled independently as a namespace
2735 * definition.
2736 *
2737 * Returns the attribute name, and the value in *value.
2738 */
2739
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002740static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002741htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2742 xmlChar *name, *val = NULL;
2743
2744 *value = NULL;
2745 name = htmlParseHTMLName(ctxt);
2746 if (name == NULL) {
2747 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2748 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2749 ctxt->wellFormed = 0;
2750 return(NULL);
2751 }
2752
2753 /*
2754 * read the value
2755 */
2756 SKIP_BLANKS;
2757 if (CUR == '=') {
2758 NEXT;
2759 SKIP_BLANKS;
2760 val = htmlParseAttValue(ctxt);
2761 /******
2762 } else {
2763 * TODO : some attribute must have values, some may not
2764 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2765 ctxt->sax->warning(ctxt->userData,
2766 "No value for attribute %s\n", name); */
2767 }
2768
2769 *value = val;
2770 return(name);
2771}
2772
2773/**
2774 * htmlCheckEncoding:
2775 * @ctxt: an HTML parser context
2776 * @attvalue: the attribute value
2777 *
2778 * Checks an http-equiv attribute from a Meta tag to detect
2779 * the encoding
2780 * If a new encoding is detected the parser is switched to decode
2781 * it and pass UTF8
2782 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002783static void
Owen Taylor3473f882001-02-23 17:55:21 +00002784htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2785 const xmlChar *encoding;
2786
2787 if ((ctxt == NULL) || (attvalue == NULL))
2788 return;
2789
2790 /* do not change encoding */
2791 if (ctxt->input->encoding != NULL)
2792 return;
2793
2794 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2795 if (encoding != NULL) {
2796 encoding += 8;
2797 } else {
2798 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2799 if (encoding != NULL)
2800 encoding += 9;
2801 }
2802 if (encoding != NULL) {
2803 xmlCharEncoding enc;
2804 xmlCharEncodingHandlerPtr handler;
2805
2806 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2807
2808 if (ctxt->input->encoding != NULL)
2809 xmlFree((xmlChar *) ctxt->input->encoding);
2810 ctxt->input->encoding = xmlStrdup(encoding);
2811
2812 enc = xmlParseCharEncoding((const char *) encoding);
2813 /*
2814 * registered set of known encodings
2815 */
2816 if (enc != XML_CHAR_ENCODING_ERROR) {
2817 xmlSwitchEncoding(ctxt, enc);
2818 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2819 } else {
2820 /*
2821 * fallback for unknown encodings
2822 */
2823 handler = xmlFindCharEncodingHandler((const char *) encoding);
2824 if (handler != NULL) {
2825 xmlSwitchToEncoding(ctxt, handler);
2826 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2827 } else {
2828 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2829 }
2830 }
2831
2832 if ((ctxt->input->buf != NULL) &&
2833 (ctxt->input->buf->encoder != NULL) &&
2834 (ctxt->input->buf->raw != NULL) &&
2835 (ctxt->input->buf->buffer != NULL)) {
2836 int nbchars;
2837 int processed;
2838
2839 /*
2840 * convert as much as possible to the parser reading buffer.
2841 */
2842 processed = ctxt->input->cur - ctxt->input->base;
2843 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2844 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2845 ctxt->input->buf->buffer,
2846 ctxt->input->buf->raw);
2847 if (nbchars < 0) {
2848 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2849 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2850 ctxt->sax->error(ctxt->userData,
2851 "htmlCheckEncoding: encoder error\n");
2852 }
2853 ctxt->input->base =
2854 ctxt->input->cur = ctxt->input->buf->buffer->content;
2855 }
2856 }
2857}
2858
2859/**
2860 * htmlCheckMeta:
2861 * @ctxt: an HTML parser context
2862 * @atts: the attributes values
2863 *
2864 * Checks an attributes from a Meta tag
2865 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002866static void
Owen Taylor3473f882001-02-23 17:55:21 +00002867htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2868 int i;
2869 const xmlChar *att, *value;
2870 int http = 0;
2871 const xmlChar *content = NULL;
2872
2873 if ((ctxt == NULL) || (atts == NULL))
2874 return;
2875
2876 i = 0;
2877 att = atts[i++];
2878 while (att != NULL) {
2879 value = atts[i++];
2880 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2881 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2882 http = 1;
2883 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2884 content = value;
2885 att = atts[i++];
2886 }
2887 if ((http) && (content != NULL))
2888 htmlCheckEncoding(ctxt, content);
2889
2890}
2891
2892/**
2893 * htmlParseStartTag:
2894 * @ctxt: an HTML parser context
2895 *
2896 * parse a start of tag either for rule element or
2897 * EmptyElement. In both case we don't parse the tag closing chars.
2898 *
2899 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2900 *
2901 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2902 *
2903 * With namespace:
2904 *
2905 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2906 *
2907 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2908 *
2909 */
2910
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002911static void
Owen Taylor3473f882001-02-23 17:55:21 +00002912htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2913 xmlChar *name;
2914 xmlChar *attname;
2915 xmlChar *attvalue;
2916 const xmlChar **atts = NULL;
2917 int nbatts = 0;
2918 int maxatts = 0;
2919 int meta = 0;
2920 int i;
2921
2922 if (CUR != '<') return;
2923 NEXT;
2924
2925 GROW;
2926 name = htmlParseHTMLName(ctxt);
2927 if (name == NULL) {
2928 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2929 ctxt->sax->error(ctxt->userData,
2930 "htmlParseStartTag: invalid element name\n");
2931 ctxt->wellFormed = 0;
2932 /* Dump the bogus tag like browsers do */
2933 while ((IS_CHAR(CUR)) && (CUR != '>'))
2934 NEXT;
2935 return;
2936 }
2937 if (xmlStrEqual(name, BAD_CAST"meta"))
2938 meta = 1;
2939
2940 /*
2941 * Check for auto-closure of HTML elements.
2942 */
2943 htmlAutoClose(ctxt, name);
2944
2945 /*
2946 * Check for implied HTML elements.
2947 */
2948 htmlCheckImplied(ctxt, name);
2949
2950 /*
2951 * Avoid html at any level > 0, head at any level != 1
2952 * or any attempt to recurse body
2953 */
2954 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2955 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2956 ctxt->sax->error(ctxt->userData,
2957 "htmlParseStartTag: misplaced <html> tag\n");
2958 ctxt->wellFormed = 0;
2959 xmlFree(name);
2960 return;
2961 }
2962 if ((ctxt->nameNr != 1) &&
2963 (xmlStrEqual(name, BAD_CAST"head"))) {
2964 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2965 ctxt->sax->error(ctxt->userData,
2966 "htmlParseStartTag: misplaced <head> tag\n");
2967 ctxt->wellFormed = 0;
2968 xmlFree(name);
2969 return;
2970 }
2971 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002972 int indx;
2973 for (indx = 0;indx < ctxt->nameNr;indx++) {
2974 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002975 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2976 ctxt->sax->error(ctxt->userData,
2977 "htmlParseStartTag: misplaced <body> tag\n");
2978 ctxt->wellFormed = 0;
2979 xmlFree(name);
2980 return;
2981 }
2982 }
2983 }
2984
2985 /*
2986 * Now parse the attributes, it ends up with the ending
2987 *
2988 * (S Attribute)* S?
2989 */
2990 SKIP_BLANKS;
2991 while ((IS_CHAR(CUR)) &&
2992 (CUR != '>') &&
2993 ((CUR != '/') || (NXT(1) != '>'))) {
2994 long cons = ctxt->nbChars;
2995
2996 GROW;
2997 attname = htmlParseAttribute(ctxt, &attvalue);
2998 if (attname != NULL) {
2999
3000 /*
3001 * Well formedness requires at most one declaration of an attribute
3002 */
3003 for (i = 0; i < nbatts;i += 2) {
3004 if (xmlStrEqual(atts[i], attname)) {
3005 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3006 ctxt->sax->error(ctxt->userData,
3007 "Attribute %s redefined\n",
3008 attname);
3009 ctxt->wellFormed = 0;
3010 xmlFree(attname);
3011 if (attvalue != NULL)
3012 xmlFree(attvalue);
3013 goto failed;
3014 }
3015 }
3016
3017 /*
3018 * Add the pair to atts
3019 */
3020 if (atts == NULL) {
3021 maxatts = 10;
3022 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3023 if (atts == NULL) {
3024 xmlGenericError(xmlGenericErrorContext,
3025 "malloc of %ld byte failed\n",
3026 maxatts * (long)sizeof(xmlChar *));
3027 if (name != NULL) xmlFree(name);
3028 return;
3029 }
3030 } else if (nbatts + 4 > maxatts) {
3031 maxatts *= 2;
3032 atts = (const xmlChar **) xmlRealloc((void *) atts,
3033 maxatts * sizeof(xmlChar *));
3034 if (atts == NULL) {
3035 xmlGenericError(xmlGenericErrorContext,
3036 "realloc of %ld byte failed\n",
3037 maxatts * (long)sizeof(xmlChar *));
3038 if (name != NULL) xmlFree(name);
3039 return;
3040 }
3041 }
3042 atts[nbatts++] = attname;
3043 atts[nbatts++] = attvalue;
3044 atts[nbatts] = NULL;
3045 atts[nbatts + 1] = NULL;
3046 }
3047 else {
3048 /* Dump the bogus attribute string up to the next blank or
3049 * the end of the tag. */
Daniel Veillard561b7f82002-03-20 21:55:57 +00003050 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3051 && ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003052 NEXT;
3053 }
3054
3055failed:
3056 SKIP_BLANKS;
3057 if (cons == ctxt->nbChars) {
3058 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3059 ctxt->sax->error(ctxt->userData,
3060 "htmlParseStartTag: problem parsing attributes\n");
3061 ctxt->wellFormed = 0;
3062 break;
3063 }
3064 }
3065
3066 /*
3067 * Handle specific association to the META tag
3068 */
3069 if (meta)
3070 htmlCheckMeta(ctxt, atts);
3071
3072 /*
3073 * SAX: Start of Element !
3074 */
3075 htmlnamePush(ctxt, xmlStrdup(name));
3076#ifdef DEBUG
3077 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3078#endif
3079 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3080 ctxt->sax->startElement(ctxt->userData, name, atts);
3081
3082 if (atts != NULL) {
3083 for (i = 0;i < nbatts;i++) {
3084 if (atts[i] != NULL)
3085 xmlFree((xmlChar *) atts[i]);
3086 }
3087 xmlFree((void *) atts);
3088 }
3089 if (name != NULL) xmlFree(name);
3090}
3091
3092/**
3093 * htmlParseEndTag:
3094 * @ctxt: an HTML parser context
3095 *
3096 * parse an end of tag
3097 *
3098 * [42] ETag ::= '</' Name S? '>'
3099 *
3100 * With namespace
3101 *
3102 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003103 *
3104 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003105 */
3106
Daniel Veillardf420ac52001-07-04 16:04:09 +00003107static int
Owen Taylor3473f882001-02-23 17:55:21 +00003108htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3109 xmlChar *name;
3110 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003111 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003112
3113 if ((CUR != '<') || (NXT(1) != '/')) {
3114 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3115 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3116 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003117 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003118 }
3119 SKIP(2);
3120
3121 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003122 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003123
3124 /*
3125 * We should definitely be at the ending "S? '>'" part
3126 */
3127 SKIP_BLANKS;
3128 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3129 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3130 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3131 ctxt->wellFormed = 0;
3132 } else
3133 NEXT;
3134
3135 /*
3136 * If the name read is not one of the element in the parsing stack
3137 * then return, it's just an error.
3138 */
3139 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3140 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3141 }
3142 if (i < 0) {
3143 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3144 ctxt->sax->error(ctxt->userData,
3145 "Unexpected end tag : %s\n", name);
3146 xmlFree(name);
3147 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003148 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003149 }
3150
3151
3152 /*
3153 * Check for auto-closure of HTML elements.
3154 */
3155
3156 htmlAutoCloseOnClose(ctxt, name);
3157
3158 /*
3159 * Well formedness constraints, opening and closing must match.
3160 * With the exception that the autoclose may have popped stuff out
3161 * of the stack.
3162 */
3163 if (!xmlStrEqual(name, ctxt->name)) {
3164#ifdef DEBUG
3165 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3166#endif
3167 if ((ctxt->name != NULL) &&
3168 (!xmlStrEqual(ctxt->name, name))) {
3169 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3170 ctxt->sax->error(ctxt->userData,
3171 "Opening and ending tag mismatch: %s and %s\n",
3172 name, ctxt->name);
3173 ctxt->wellFormed = 0;
3174 }
3175 }
3176
3177 /*
3178 * SAX: End of Tag
3179 */
3180 oldname = ctxt->name;
3181 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3182 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3183 ctxt->sax->endElement(ctxt->userData, name);
3184 oldname = htmlnamePop(ctxt);
3185 if (oldname != NULL) {
3186#ifdef DEBUG
3187 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3188#endif
3189 xmlFree(oldname);
3190#ifdef DEBUG
3191 } else {
3192 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3193#endif
3194 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003195 ret = 1;
3196 } else {
3197 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003198 }
3199
3200 if (name != NULL)
3201 xmlFree(name);
3202
Daniel Veillardf420ac52001-07-04 16:04:09 +00003203 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003204}
3205
3206
3207/**
3208 * htmlParseReference:
3209 * @ctxt: an HTML parser context
3210 *
3211 * parse and handle entity references in content,
3212 * this will end-up in a call to character() since this is either a
3213 * CharRef, or a predefined entity.
3214 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003215static void
Owen Taylor3473f882001-02-23 17:55:21 +00003216htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003217 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003218 xmlChar out[6];
3219 xmlChar *name;
3220 if (CUR != '&') return;
3221
3222 if (NXT(1) == '#') {
3223 unsigned int c;
3224 int bits, i = 0;
3225
3226 c = htmlParseCharRef(ctxt);
3227 if (c == 0)
3228 return;
3229
3230 if (c < 0x80) { out[i++]= c; bits= -6; }
3231 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3232 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3233 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3234
3235 for ( ; bits >= 0; bits-= 6) {
3236 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3237 }
3238 out[i] = 0;
3239
3240 htmlCheckParagraph(ctxt);
3241 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3242 ctxt->sax->characters(ctxt->userData, out, i);
3243 } else {
3244 ent = htmlParseEntityRef(ctxt, &name);
3245 if (name == NULL) {
3246 htmlCheckParagraph(ctxt);
3247 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3248 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3249 return;
3250 }
3251 if ((ent == NULL) || (ent->value <= 0)) {
3252 htmlCheckParagraph(ctxt);
3253 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3254 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3255 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3256 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3257 }
3258 } else {
3259 unsigned int c;
3260 int bits, i = 0;
3261
3262 c = ent->value;
3263 if (c < 0x80)
3264 { out[i++]= c; bits= -6; }
3265 else if (c < 0x800)
3266 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3267 else if (c < 0x10000)
3268 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3269 else
3270 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3271
3272 for ( ; bits >= 0; bits-= 6) {
3273 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3274 }
3275 out[i] = 0;
3276
3277 htmlCheckParagraph(ctxt);
3278 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3279 ctxt->sax->characters(ctxt->userData, out, i);
3280 }
3281 xmlFree(name);
3282 }
3283}
3284
3285/**
3286 * htmlParseContent:
3287 * @ctxt: an HTML parser context
3288 * @name: the node name
3289 *
3290 * Parse a content: comment, sub-element, reference or text.
3291 *
3292 */
3293
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003294static void
Owen Taylor3473f882001-02-23 17:55:21 +00003295htmlParseContent(htmlParserCtxtPtr ctxt) {
3296 xmlChar *currentNode;
3297 int depth;
3298
3299 currentNode = xmlStrdup(ctxt->name);
3300 depth = ctxt->nameNr;
3301 while (1) {
3302 long cons = ctxt->nbChars;
3303
3304 GROW;
3305 /*
3306 * Our tag or one of it's parent or children is ending.
3307 */
3308 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003309 if (htmlParseEndTag(ctxt) &&
3310 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3311 if (currentNode != NULL)
3312 xmlFree(currentNode);
3313 return;
3314 }
3315 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003316 }
3317
3318 /*
3319 * Has this node been popped out during parsing of
3320 * the next element
3321 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003322 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3323 (!xmlStrEqual(currentNode, ctxt->name)))
3324 {
Owen Taylor3473f882001-02-23 17:55:21 +00003325 if (currentNode != NULL) xmlFree(currentNode);
3326 return;
3327 }
3328
Daniel Veillardf9533d12001-03-03 10:04:57 +00003329 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3330 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003331 /*
3332 * Handle SCRIPT/STYLE separately
3333 */
3334 htmlParseScript(ctxt);
3335 } else {
3336 /*
3337 * Sometimes DOCTYPE arrives in the middle of the document
3338 */
3339 if ((CUR == '<') && (NXT(1) == '!') &&
3340 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3341 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3342 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3343 (UPP(8) == 'E')) {
3344 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3345 ctxt->sax->error(ctxt->userData,
3346 "Misplaced DOCTYPE declaration\n");
3347 ctxt->wellFormed = 0;
3348 htmlParseDocTypeDecl(ctxt);
3349 }
3350
3351 /*
3352 * First case : a comment
3353 */
3354 if ((CUR == '<') && (NXT(1) == '!') &&
3355 (NXT(2) == '-') && (NXT(3) == '-')) {
3356 htmlParseComment(ctxt);
3357 }
3358
3359 /*
3360 * Second case : a sub-element.
3361 */
3362 else if (CUR == '<') {
3363 htmlParseElement(ctxt);
3364 }
3365
3366 /*
3367 * Third case : a reference. If if has not been resolved,
3368 * parsing returns it's Name, create the node
3369 */
3370 else if (CUR == '&') {
3371 htmlParseReference(ctxt);
3372 }
3373
3374 /*
3375 * Fourth : end of the resource
3376 */
3377 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003378 htmlAutoCloseOnEnd(ctxt);
3379 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003380 }
3381
3382 /*
3383 * Last case, text. Note that References are handled directly.
3384 */
3385 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003386 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003387 }
3388
3389 if (cons == ctxt->nbChars) {
3390 if (ctxt->node != NULL) {
3391 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3392 ctxt->sax->error(ctxt->userData,
3393 "detected an error in element content\n");
3394 ctxt->wellFormed = 0;
3395 }
3396 break;
3397 }
3398 }
3399 GROW;
3400 }
3401 if (currentNode != NULL) xmlFree(currentNode);
3402}
3403
3404/**
3405 * htmlParseElement:
3406 * @ctxt: an HTML parser context
3407 *
3408 * parse an HTML element, this is highly recursive
3409 *
3410 * [39] element ::= EmptyElemTag | STag content ETag
3411 *
3412 * [41] Attribute ::= Name Eq AttValue
3413 */
3414
3415void
3416htmlParseElement(htmlParserCtxtPtr ctxt) {
3417 xmlChar *name;
3418 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003419 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003420 htmlParserNodeInfo node_info;
3421 xmlChar *oldname;
3422 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003423 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003424
3425 /* Capture start position */
3426 if (ctxt->record_info) {
3427 node_info.begin_pos = ctxt->input->consumed +
3428 (CUR_PTR - ctxt->input->base);
3429 node_info.begin_line = ctxt->input->line;
3430 }
3431
3432 oldname = xmlStrdup(ctxt->name);
3433 htmlParseStartTag(ctxt);
3434 name = ctxt->name;
3435#ifdef DEBUG
3436 if (oldname == NULL)
3437 xmlGenericError(xmlGenericErrorContext,
3438 "Start of element %s\n", name);
3439 else if (name == NULL)
3440 xmlGenericError(xmlGenericErrorContext,
3441 "Start of element failed, was %s\n", oldname);
3442 else
3443 xmlGenericError(xmlGenericErrorContext,
3444 "Start of element %s, was %s\n", name, oldname);
3445#endif
3446 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3447 (name == NULL)) {
3448 if (CUR == '>')
3449 NEXT;
3450 if (oldname != NULL)
3451 xmlFree(oldname);
3452 return;
3453 }
3454 if (oldname != NULL)
3455 xmlFree(oldname);
3456
3457 /*
3458 * Lookup the info for that element.
3459 */
3460 info = htmlTagLookup(name);
3461 if (info == NULL) {
3462 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3463 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3464 name);
3465 ctxt->wellFormed = 0;
3466 } else if (info->depr) {
3467/***************************
3468 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3469 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3470 name);
3471 ***************************/
3472 }
3473
3474 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003475 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003476 */
3477 if ((CUR == '/') && (NXT(1) == '>')) {
3478 SKIP(2);
3479 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3480 ctxt->sax->endElement(ctxt->userData, name);
3481 oldname = htmlnamePop(ctxt);
3482#ifdef DEBUG
3483 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3484#endif
3485 if (oldname != NULL)
3486 xmlFree(oldname);
3487 return;
3488 }
3489
3490 if (CUR == '>') {
3491 NEXT;
3492 } else {
3493 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3494 ctxt->sax->error(ctxt->userData,
3495 "Couldn't find end of Start Tag %s\n",
3496 name);
3497 ctxt->wellFormed = 0;
3498
3499 /*
3500 * end of parsing of this node.
3501 */
3502 if (xmlStrEqual(name, ctxt->name)) {
3503 nodePop(ctxt);
3504 oldname = htmlnamePop(ctxt);
3505#ifdef DEBUG
3506 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3507#endif
3508 if (oldname != NULL)
3509 xmlFree(oldname);
3510 }
3511
3512 /*
3513 * Capture end position and add node
3514 */
3515 if ( currentNode != NULL && ctxt->record_info ) {
3516 node_info.end_pos = ctxt->input->consumed +
3517 (CUR_PTR - ctxt->input->base);
3518 node_info.end_line = ctxt->input->line;
3519 node_info.node = ctxt->node;
3520 xmlParserAddNodeInfo(ctxt, &node_info);
3521 }
3522 return;
3523 }
3524
3525 /*
3526 * Check for an Empty Element from DTD definition
3527 */
3528 if ((info != NULL) && (info->empty)) {
3529 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3530 ctxt->sax->endElement(ctxt->userData, name);
3531 oldname = htmlnamePop(ctxt);
3532#ifdef DEBUG
3533 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3534#endif
3535 if (oldname != NULL)
3536 xmlFree(oldname);
3537 return;
3538 }
3539
3540 /*
3541 * Parse the content of the element:
3542 */
3543 currentNode = xmlStrdup(ctxt->name);
3544 depth = ctxt->nameNr;
3545 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003546 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003547 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003548 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003549 if (ctxt->nameNr < depth) break;
3550 }
3551
Owen Taylor3473f882001-02-23 17:55:21 +00003552 /*
3553 * Capture end position and add node
3554 */
3555 if ( currentNode != NULL && ctxt->record_info ) {
3556 node_info.end_pos = ctxt->input->consumed +
3557 (CUR_PTR - ctxt->input->base);
3558 node_info.end_line = ctxt->input->line;
3559 node_info.node = ctxt->node;
3560 xmlParserAddNodeInfo(ctxt, &node_info);
3561 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003562 if (!IS_CHAR(CUR)) {
3563 htmlAutoCloseOnEnd(ctxt);
3564 }
3565
Owen Taylor3473f882001-02-23 17:55:21 +00003566 if (currentNode != NULL)
3567 xmlFree(currentNode);
3568}
3569
3570/**
3571 * htmlParseDocument :
3572 * @ctxt: an HTML parser context
3573 *
3574 * parse an HTML document (and build a tree if using the standard SAX
3575 * interface).
3576 *
3577 * Returns 0, -1 in case of error. the parser context is augmented
3578 * as a result of the parsing.
3579 */
3580
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003581static int
Owen Taylor3473f882001-02-23 17:55:21 +00003582htmlParseDocument(htmlParserCtxtPtr ctxt) {
3583 xmlDtdPtr dtd;
3584
Daniel Veillardd0463562001-10-13 09:15:48 +00003585 xmlInitParser();
3586
Owen Taylor3473f882001-02-23 17:55:21 +00003587 htmlDefaultSAXHandlerInit();
3588 ctxt->html = 1;
3589
3590 GROW;
3591 /*
3592 * SAX: beginning of the document processing.
3593 */
3594 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3595 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3596
3597 /*
3598 * Wipe out everything which is before the first '<'
3599 */
3600 SKIP_BLANKS;
3601 if (CUR == 0) {
3602 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3603 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3604 ctxt->wellFormed = 0;
3605 }
3606
3607 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3608 ctxt->sax->startDocument(ctxt->userData);
3609
3610
3611 /*
3612 * Parse possible comments before any content
3613 */
3614 while ((CUR == '<') && (NXT(1) == '!') &&
3615 (NXT(2) == '-') && (NXT(3) == '-')) {
3616 htmlParseComment(ctxt);
3617 SKIP_BLANKS;
3618 }
3619
3620
3621 /*
3622 * Then possibly doc type declaration(s) and more Misc
3623 * (doctypedecl Misc*)?
3624 */
3625 if ((CUR == '<') && (NXT(1) == '!') &&
3626 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3627 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3628 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3629 (UPP(8) == 'E')) {
3630 htmlParseDocTypeDecl(ctxt);
3631 }
3632 SKIP_BLANKS;
3633
3634 /*
3635 * Parse possible comments before any content
3636 */
3637 while ((CUR == '<') && (NXT(1) == '!') &&
3638 (NXT(2) == '-') && (NXT(3) == '-')) {
3639 htmlParseComment(ctxt);
3640 SKIP_BLANKS;
3641 }
3642
3643 /*
3644 * Time to start parsing the tree itself
3645 */
3646 htmlParseContent(ctxt);
3647
3648 /*
3649 * autoclose
3650 */
3651 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003652 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003653
3654
3655 /*
3656 * SAX: end of the document processing.
3657 */
3658 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3659 ctxt->sax->endDocument(ctxt->userData);
3660
3661 if (ctxt->myDoc != NULL) {
3662 dtd = xmlGetIntSubset(ctxt->myDoc);
3663 if (dtd == NULL)
3664 ctxt->myDoc->intSubset =
3665 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3666 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3667 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3668 }
3669 if (! ctxt->wellFormed) return(-1);
3670 return(0);
3671}
3672
3673
3674/************************************************************************
3675 * *
3676 * Parser contexts handling *
3677 * *
3678 ************************************************************************/
3679
3680/**
3681 * xmlInitParserCtxt:
3682 * @ctxt: an HTML parser context
3683 *
3684 * Initialize a parser context
3685 */
3686
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003687static void
Owen Taylor3473f882001-02-23 17:55:21 +00003688htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3689{
3690 htmlSAXHandler *sax;
3691
3692 if (ctxt == NULL) return;
3693 memset(ctxt, 0, sizeof(htmlParserCtxt));
3694
3695 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3696 if (sax == NULL) {
3697 xmlGenericError(xmlGenericErrorContext,
3698 "htmlInitParserCtxt: out of memory\n");
3699 }
3700 else
3701 memset(sax, 0, sizeof(htmlSAXHandler));
3702
3703 /* Allocate the Input stack */
3704 ctxt->inputTab = (htmlParserInputPtr *)
3705 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3706 if (ctxt->inputTab == NULL) {
3707 xmlGenericError(xmlGenericErrorContext,
3708 "htmlInitParserCtxt: out of memory\n");
3709 ctxt->inputNr = 0;
3710 ctxt->inputMax = 0;
3711 ctxt->input = NULL;
3712 return;
3713 }
3714 ctxt->inputNr = 0;
3715 ctxt->inputMax = 5;
3716 ctxt->input = NULL;
3717 ctxt->version = NULL;
3718 ctxt->encoding = NULL;
3719 ctxt->standalone = -1;
3720 ctxt->instate = XML_PARSER_START;
3721
3722 /* Allocate the Node stack */
3723 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3724 if (ctxt->nodeTab == NULL) {
3725 xmlGenericError(xmlGenericErrorContext,
3726 "htmlInitParserCtxt: out of memory\n");
3727 ctxt->nodeNr = 0;
3728 ctxt->nodeMax = 0;
3729 ctxt->node = NULL;
3730 ctxt->inputNr = 0;
3731 ctxt->inputMax = 0;
3732 ctxt->input = NULL;
3733 return;
3734 }
3735 ctxt->nodeNr = 0;
3736 ctxt->nodeMax = 10;
3737 ctxt->node = NULL;
3738
3739 /* Allocate the Name stack */
3740 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3741 if (ctxt->nameTab == NULL) {
3742 xmlGenericError(xmlGenericErrorContext,
3743 "htmlInitParserCtxt: out of memory\n");
3744 ctxt->nameNr = 0;
3745 ctxt->nameMax = 10;
3746 ctxt->name = NULL;
3747 ctxt->nodeNr = 0;
3748 ctxt->nodeMax = 0;
3749 ctxt->node = NULL;
3750 ctxt->inputNr = 0;
3751 ctxt->inputMax = 0;
3752 ctxt->input = NULL;
3753 return;
3754 }
3755 ctxt->nameNr = 0;
3756 ctxt->nameMax = 10;
3757 ctxt->name = NULL;
3758
3759 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3760 else {
3761 ctxt->sax = sax;
3762 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3763 }
3764 ctxt->userData = ctxt;
3765 ctxt->myDoc = NULL;
3766 ctxt->wellFormed = 1;
3767 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003768 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003769 ctxt->html = 1;
3770 ctxt->record_info = 0;
3771 ctxt->validate = 0;
3772 ctxt->nbChars = 0;
3773 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003774 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003775 xmlInitNodeInfoSeq(&ctxt->node_seq);
3776}
3777
3778/**
3779 * htmlFreeParserCtxt:
3780 * @ctxt: an HTML parser context
3781 *
3782 * Free all the memory used by a parser context. However the parsed
3783 * document in ctxt->myDoc is not freed.
3784 */
3785
3786void
3787htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3788{
3789 xmlFreeParserCtxt(ctxt);
3790}
3791
3792/**
3793 * htmlCreateDocParserCtxt :
3794 * @cur: a pointer to an array of xmlChar
3795 * @encoding: a free form C string describing the HTML document encoding, or NULL
3796 *
3797 * Create a parser context for an HTML document.
3798 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003799 * TODO: check the need to add encoding handling there
3800 *
Owen Taylor3473f882001-02-23 17:55:21 +00003801 * Returns the new parser context or NULL
3802 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003803static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003804htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003805 htmlParserCtxtPtr ctxt;
3806 htmlParserInputPtr input;
3807 /* htmlCharEncoding enc; */
3808
3809 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3810 if (ctxt == NULL) {
3811 perror("malloc");
3812 return(NULL);
3813 }
3814 htmlInitParserCtxt(ctxt);
3815 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3816 if (input == NULL) {
3817 perror("malloc");
3818 xmlFree(ctxt);
3819 return(NULL);
3820 }
3821 memset(input, 0, sizeof(htmlParserInput));
3822
3823 input->line = 1;
3824 input->col = 1;
3825 input->base = cur;
3826 input->cur = cur;
3827
3828 inputPush(ctxt, input);
3829 return(ctxt);
3830}
3831
3832/************************************************************************
3833 * *
3834 * Progressive parsing interfaces *
3835 * *
3836 ************************************************************************/
3837
3838/**
3839 * htmlParseLookupSequence:
3840 * @ctxt: an HTML parser context
3841 * @first: the first char to lookup
3842 * @next: the next char to lookup or zero
3843 * @third: the next char to lookup or zero
3844 *
3845 * Try to find if a sequence (first, next, third) or just (first next) or
3846 * (first) is available in the input stream.
3847 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3848 * to avoid rescanning sequences of bytes, it DOES change the state of the
3849 * parser, do not use liberally.
3850 * This is basically similar to xmlParseLookupSequence()
3851 *
3852 * Returns the index to the current parsing point if the full sequence
3853 * is available, -1 otherwise.
3854 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003855static int
Owen Taylor3473f882001-02-23 17:55:21 +00003856htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3857 xmlChar next, xmlChar third) {
3858 int base, len;
3859 htmlParserInputPtr in;
3860 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00003861 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003862
3863 in = ctxt->input;
3864 if (in == NULL) return(-1);
3865 base = in->cur - in->base;
3866 if (base < 0) return(-1);
3867 if (ctxt->checkIndex > base)
3868 base = ctxt->checkIndex;
3869 if (in->buf == NULL) {
3870 buf = in->base;
3871 len = in->length;
3872 } else {
3873 buf = in->buf->buffer->content;
3874 len = in->buf->buffer->use;
3875 }
3876 /* take into account the sequence length */
3877 if (third) len -= 2;
3878 else if (next) len --;
3879 for (;base < len;base++) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00003880 if (!incomment && (base + 4 < len)) {
3881 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3882 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3883 incomment = 1;
3884 }
3885 /* do not increment base, some people use <!--> */
3886 }
3887 if (incomment) {
3888 if (base + 3 < len)
3889 return(-1);
3890 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3891 (buf[base + 2] == '>')) {
3892 incomment = 0;
3893 base += 2;
3894 }
3895 continue;
3896 }
Owen Taylor3473f882001-02-23 17:55:21 +00003897 if (buf[base] == first) {
3898 if (third != 0) {
3899 if ((buf[base + 1] != next) ||
3900 (buf[base + 2] != third)) continue;
3901 } else if (next != 0) {
3902 if (buf[base + 1] != next) continue;
3903 }
3904 ctxt->checkIndex = 0;
3905#ifdef DEBUG_PUSH
3906 if (next == 0)
3907 xmlGenericError(xmlGenericErrorContext,
3908 "HPP: lookup '%c' found at %d\n",
3909 first, base);
3910 else if (third == 0)
3911 xmlGenericError(xmlGenericErrorContext,
3912 "HPP: lookup '%c%c' found at %d\n",
3913 first, next, base);
3914 else
3915 xmlGenericError(xmlGenericErrorContext,
3916 "HPP: lookup '%c%c%c' found at %d\n",
3917 first, next, third, base);
3918#endif
3919 return(base - (in->cur - in->base));
3920 }
3921 }
3922 ctxt->checkIndex = base;
3923#ifdef DEBUG_PUSH
3924 if (next == 0)
3925 xmlGenericError(xmlGenericErrorContext,
3926 "HPP: lookup '%c' failed\n", first);
3927 else if (third == 0)
3928 xmlGenericError(xmlGenericErrorContext,
3929 "HPP: lookup '%c%c' failed\n", first, next);
3930 else
3931 xmlGenericError(xmlGenericErrorContext,
3932 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3933#endif
3934 return(-1);
3935}
3936
3937/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing.  This is a state machine driven by
 * ctxt->instate: it loops for as long as the current input holds enough
 * unread bytes for the current state to act.  Unless @terminate is set,
 * a state first asks htmlParseLookupSequence() whether the delimiter
 * closing the construct it is about to parse ('>', "-->", "</", ';' or
 * the next '<') is already buffered, and leaves through the `done` label
 * when it is not.
 *
 * When @terminate is set and no unread input is left, htmlAutoCloseOnEnd()
 * is called and, once the name stack is empty, the SAX endDocument
 * callback is fired and the state becomes XML_PARSER_EOF.  Before
 * returning, a document without internal subset receives a default
 * HTML 4.0 Transitional one if @terminate is set or the state is EOF or
 * EPILOG.
 *
 * Returns the local variable ret, which is initialised to 0 and never
 * assigned afterwards, so the value returned is always 0.
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003946static int
Owen Taylor3473f882001-02-23 17:55:21 +00003947htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3948 int ret = 0;
3949 htmlParserInputPtr in;
3950 int avail = 0;
3951 xmlChar cur, next;
3952
3953#ifdef DEBUG_PUSH
3954 switch (ctxt->instate) {
3955 case XML_PARSER_EOF:
3956 xmlGenericError(xmlGenericErrorContext,
3957 "HPP: try EOF\n"); break;
3958 case XML_PARSER_START:
3959 xmlGenericError(xmlGenericErrorContext,
3960 "HPP: try START\n"); break;
3961 case XML_PARSER_MISC:
3962 xmlGenericError(xmlGenericErrorContext,
3963 "HPP: try MISC\n");break;
3964 case XML_PARSER_COMMENT:
3965 xmlGenericError(xmlGenericErrorContext,
3966 "HPP: try COMMENT\n");break;
3967 case XML_PARSER_PROLOG:
3968 xmlGenericError(xmlGenericErrorContext,
3969 "HPP: try PROLOG\n");break;
3970 case XML_PARSER_START_TAG:
3971 xmlGenericError(xmlGenericErrorContext,
3972 "HPP: try START_TAG\n");break;
3973 case XML_PARSER_CONTENT:
3974 xmlGenericError(xmlGenericErrorContext,
3975 "HPP: try CONTENT\n");break;
3976 case XML_PARSER_CDATA_SECTION:
3977 xmlGenericError(xmlGenericErrorContext,
3978 "HPP: try CDATA_SECTION\n");break;
3979 case XML_PARSER_END_TAG:
3980 xmlGenericError(xmlGenericErrorContext,
3981 "HPP: try END_TAG\n");break;
3982 case XML_PARSER_ENTITY_DECL:
3983 xmlGenericError(xmlGenericErrorContext,
3984 "HPP: try ENTITY_DECL\n");break;
3985 case XML_PARSER_ENTITY_VALUE:
3986 xmlGenericError(xmlGenericErrorContext,
3987 "HPP: try ENTITY_VALUE\n");break;
3988 case XML_PARSER_ATTRIBUTE_VALUE:
3989 xmlGenericError(xmlGenericErrorContext,
3990 "HPP: try ATTRIBUTE_VALUE\n");break;
3991 case XML_PARSER_DTD:
3992 xmlGenericError(xmlGenericErrorContext,
3993 "HPP: try DTD\n");break;
3994 case XML_PARSER_EPILOG:
3995 xmlGenericError(xmlGenericErrorContext,
3996 "HPP: try EPILOG\n");break;
3997 case XML_PARSER_PI:
3998 xmlGenericError(xmlGenericErrorContext,
3999 "HPP: try PI\n");break;
4000 case XML_PARSER_SYSTEM_LITERAL:
4001 xmlGenericError(xmlGenericErrorContext,
4002 "HPP: try SYSTEM_LITERAL\n");break;
4003 }
4004#endif
4005
4006 while (1) {
4007
4008 in = ctxt->input;
4009 if (in == NULL) break;
/* unread bytes: measured against in->length when the input has no I/O
 * buffer, otherwise against the fill level of that buffer */
4010 if (in->buf == NULL)
4011 avail = in->length - (in->cur - in->base);
4012 else
4013 avail = in->buf->buffer->use - (in->cur - in->base);
/* nothing left on the last chunk: auto-close and, once the name stack is
 * empty, switch to EOF and signal the end of the document */
4014 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004015 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004016 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4017 /*
4018 * SAX: end of the document processing.
4019 */
4020 ctxt->instate = XML_PARSER_EOF;
4021 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4022 ctxt->sax->endDocument(ctxt->userData);
4023 }
4024 }
4025 if (avail < 1)
4026 goto done;
4027 switch (ctxt->instate) {
4028 case XML_PARSER_EOF:
4029 /*
4030 * Document parsing is done !
4031 */
4032 goto done;
4033 case XML_PARSER_START:
4034 /*
4035 * Very first chars read from the document flow.
4036 */
4037 cur = in->cur[0];
4038 if (IS_BLANK(cur)) {
4039 SKIP_BLANKS;
4040 if (in->buf == NULL)
4041 avail = in->length - (in->cur - in->base);
4042 else
4043 avail = in->buf->buffer->use - (in->cur - in->base);
4044 }
4045 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4046 ctxt->sax->setDocumentLocator(ctxt->userData,
4047 &xmlDefaultSAXLocator);
4048 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4049 (!ctxt->disableSAX))
4050 ctxt->sax->startDocument(ctxt->userData);
4051
4052 cur = in->cur[0];
4053 next = in->cur[1];
/* case-insensitive match of "<!DOCTYPE" */
4054 if ((cur == '<') && (next == '!') &&
4055 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4056 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4057 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4058 (UPP(8) == 'E')) {
4059 if ((!terminate) &&
4060 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4061 goto done;
4062#ifdef DEBUG_PUSH
4063 xmlGenericError(xmlGenericErrorContext,
4064 "HPP: Parsing internal subset\n");
4065#endif
4066 htmlParseDocTypeDecl(ctxt);
4067 ctxt->instate = XML_PARSER_PROLOG;
4068#ifdef DEBUG_PUSH
4069 xmlGenericError(xmlGenericErrorContext,
4070 "HPP: entering PROLOG\n");
4071#endif
4072 } else {
4073 ctxt->instate = XML_PARSER_MISC;
4074 }
/* NOTE: this trace is emitted on both branches, i.e. also when the state
 * was just set to PROLOG */
4075#ifdef DEBUG_PUSH
4076 xmlGenericError(xmlGenericErrorContext,
4077 "HPP: entering MISC\n");
4078#endif
4079 break;
4080 case XML_PARSER_MISC:
4081 SKIP_BLANKS;
4082 if (in->buf == NULL)
4083 avail = in->length - (in->cur - in->base);
4084 else
4085 avail = in->buf->buffer->use - (in->cur - in->base);
4086 if (avail < 2)
4087 goto done;
4088 cur = in->cur[0];
4089 next = in->cur[1];
4090 if ((cur == '<') && (next == '!') &&
4091 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4092 if ((!terminate) &&
4093 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4094 goto done;
4095#ifdef DEBUG_PUSH
4096 xmlGenericError(xmlGenericErrorContext,
4097 "HPP: Parsing Comment\n");
4098#endif
4099 htmlParseComment(ctxt);
4100 ctxt->instate = XML_PARSER_MISC;
4101 } else if ((cur == '<') && (next == '!') &&
4102 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4103 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4104 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4105 (UPP(8) == 'E')) {
4106 if ((!terminate) &&
4107 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4108 goto done;
4109#ifdef DEBUG_PUSH
4110 xmlGenericError(xmlGenericErrorContext,
4111 "HPP: Parsing internal subset\n");
4112#endif
4113 htmlParseDocTypeDecl(ctxt);
4114 ctxt->instate = XML_PARSER_PROLOG;
4115#ifdef DEBUG_PUSH
4116 xmlGenericError(xmlGenericErrorContext,
4117 "HPP: entering PROLOG\n");
4118#endif
/* "<!" seen with fewer than 9 bytes buffered: too short to hold a full
 * "<!DOCTYPE", so wait for more input before deciding */
4119 } else if ((cur == '<') && (next == '!') &&
4120 (avail < 9)) {
4121 goto done;
4122 } else {
4123 ctxt->instate = XML_PARSER_START_TAG;
4124#ifdef DEBUG_PUSH
4125 xmlGenericError(xmlGenericErrorContext,
4126 "HPP: entering START_TAG\n");
4127#endif
4128 }
4129 break;
4130 case XML_PARSER_PROLOG:
4131 SKIP_BLANKS;
4132 if (in->buf == NULL)
4133 avail = in->length - (in->cur - in->base);
4134 else
4135 avail = in->buf->buffer->use - (in->cur - in->base);
4136 if (avail < 2)
4137 goto done;
4138 cur = in->cur[0];
4139 next = in->cur[1];
4140 if ((cur == '<') && (next == '!') &&
4141 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4142 if ((!terminate) &&
4143 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4144 goto done;
4145#ifdef DEBUG_PUSH
4146 xmlGenericError(xmlGenericErrorContext,
4147 "HPP: Parsing Comment\n");
4148#endif
4149 htmlParseComment(ctxt);
4150 ctxt->instate = XML_PARSER_PROLOG;
/* "<!" seen with fewer than 4 bytes buffered: cannot yet tell whether a
 * comment opener "<!--" follows */
4151 } else if ((cur == '<') && (next == '!') &&
4152 (avail < 4)) {
4153 goto done;
4154 } else {
4155 ctxt->instate = XML_PARSER_START_TAG;
4156#ifdef DEBUG_PUSH
4157 xmlGenericError(xmlGenericErrorContext,
4158 "HPP: entering START_TAG\n");
4159#endif
4160 }
4161 break;
4162 case XML_PARSER_EPILOG:
4163 if (in->buf == NULL)
4164 avail = in->length - (in->cur - in->base);
4165 else
4166 avail = in->buf->buffer->use - (in->cur - in->base);
4167 if (avail < 1)
4168 goto done;
4169 cur = in->cur[0];
4170 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004171 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004172 goto done;
4173 }
4174 if (avail < 2)
4175 goto done;
4176 next = in->cur[1];
4177 if ((cur == '<') && (next == '!') &&
4178 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4179 if ((!terminate) &&
4180 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4181 goto done;
4182#ifdef DEBUG_PUSH
4183 xmlGenericError(xmlGenericErrorContext,
4184 "HPP: Parsing Comment\n");
4185#endif
4186 htmlParseComment(ctxt);
4187 ctxt->instate = XML_PARSER_EPILOG;
4188 } else if ((cur == '<') && (next == '!') &&
4189 (avail < 4)) {
4190 goto done;
4191 } else {
/* anything but blanks or a comment here is flagged as
 * XML_ERR_DOCUMENT_END and ends the document */
4192 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004193 ctxt->wellFormed = 0;
4194 ctxt->instate = XML_PARSER_EOF;
4195#ifdef DEBUG_PUSH
4196 xmlGenericError(xmlGenericErrorContext,
4197 "HPP: entering EOF\n");
4198#endif
4199 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4200 ctxt->sax->endDocument(ctxt->userData);
4201 goto done;
4202 }
4203 break;
4204 case XML_PARSER_START_TAG: {
4205 xmlChar *name, *oldname;
4206 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004207 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004208
4209 if (avail < 2)
4210 goto done;
4211 cur = in->cur[0];
4212 if (cur != '<') {
4213 ctxt->instate = XML_PARSER_CONTENT;
4214#ifdef DEBUG_PUSH
4215 xmlGenericError(xmlGenericErrorContext,
4216 "HPP: entering CONTENT\n");
4217#endif
4218 break;
4219 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004220 if (in->cur[1] == '/') {
4221 ctxt->instate = XML_PARSER_END_TAG;
4222 ctxt->checkIndex = 0;
4223#ifdef DEBUG_PUSH
4224 xmlGenericError(xmlGenericErrorContext,
4225 "HPP: entering END_TAG\n");
4226#endif
4227 break;
4228 }
Owen Taylor3473f882001-02-23 17:55:21 +00004229 if ((!terminate) &&
4230 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4231 goto done;
4232
/* keep a copy of the current element name to compare with the one in
 * place after htmlParseStartTag() */
4233 oldname = xmlStrdup(ctxt->name);
4234 htmlParseStartTag(ctxt);
4235 name = ctxt->name;
4236#ifdef DEBUG
4237 if (oldname == NULL)
4238 xmlGenericError(xmlGenericErrorContext,
4239 "Start of element %s\n", name);
4240 else if (name == NULL)
4241 xmlGenericError(xmlGenericErrorContext,
4242 "Start of element failed, was %s\n",
4243 oldname);
4244 else
4245 xmlGenericError(xmlGenericErrorContext,
4246 "Start of element %s, was %s\n",
4247 name, oldname);
4248#endif
/* same stack depth and same top name as before, or no current name at
 * all: presumably no new element was pushed; step over a pending '>' and
 * go round the loop again */
4249 if (((depth == ctxt->nameNr) &&
4250 (xmlStrEqual(oldname, ctxt->name))) ||
4251 (name == NULL)) {
4252 if (CUR == '>')
4253 NEXT;
4254 if (oldname != NULL)
4255 xmlFree(oldname);
4256 break;
4257 }
4258 if (oldname != NULL)
4259 xmlFree(oldname);
4260
4261 /*
4262 * Lookup the info for that element.
4263 */
4264 info = htmlTagLookup(name);
4265 if (info == NULL) {
4266 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4267 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4268 name);
4269 ctxt->wellFormed = 0;
4270 } else if (info->depr) {
4271 /***************************
4272 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4273 ctxt->sax->warning(ctxt->userData,
4274 "Tag %s is deprecated\n",
4275 name);
4276 ***************************/
4277 }
4278
4279 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004280 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004281 */
4282 if ((CUR == '/') && (NXT(1) == '>')) {
4283 SKIP(2);
4284 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4285 ctxt->sax->endElement(ctxt->userData, name);
4286 oldname = htmlnamePop(ctxt);
4287#ifdef DEBUG
4288 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4289 oldname);
4290#endif
4291 if (oldname != NULL)
4292 xmlFree(oldname);
4293 ctxt->instate = XML_PARSER_CONTENT;
4294#ifdef DEBUG_PUSH
4295 xmlGenericError(xmlGenericErrorContext,
4296 "HPP: entering CONTENT\n");
4297#endif
4298 break;
4299 }
4300
4301 if (CUR == '>') {
4302 NEXT;
4303 } else {
4304 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4305 ctxt->sax->error(ctxt->userData,
4306 "Couldn't find end of Start Tag %s\n",
4307 name);
4308 ctxt->wellFormed = 0;
4309
4310 /*
4311 * end of parsing of this node.
4312 */
4313 if (xmlStrEqual(name, ctxt->name)) {
4314 nodePop(ctxt);
4315 oldname = htmlnamePop(ctxt);
4316#ifdef DEBUG
4317 xmlGenericError(xmlGenericErrorContext,
4318 "End of start tag problem: popping out %s\n", oldname);
4319#endif
4320 if (oldname != NULL)
4321 xmlFree(oldname);
4322 }
4323
4324 ctxt->instate = XML_PARSER_CONTENT;
4325#ifdef DEBUG_PUSH
4326 xmlGenericError(xmlGenericErrorContext,
4327 "HPP: entering CONTENT\n");
4328#endif
4329 break;
4330 }
4331
4332 /*
4333 * Check for an Empty Element from DTD definition
4334 */
4335 if ((info != NULL) && (info->empty)) {
4336 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4337 ctxt->sax->endElement(ctxt->userData, name);
4338 oldname = htmlnamePop(ctxt);
4339#ifdef DEBUG
4340 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4341#endif
4342 if (oldname != NULL)
4343 xmlFree(oldname);
4344 }
4345 ctxt->instate = XML_PARSER_CONTENT;
4346#ifdef DEBUG_PUSH
4347 xmlGenericError(xmlGenericErrorContext,
4348 "HPP: entering CONTENT\n");
4349#endif
4350 break;
4351 }
4352 case XML_PARSER_CONTENT: {
4353 long cons;
4354 /*
4355 * Handle preparsed entities and charRef
4356 */
4357 if (ctxt->token != 0) {
4358 xmlChar chr[2] = { 0 , 0 } ;
4359
4360 chr[0] = (xmlChar) ctxt->token;
4361 htmlCheckParagraph(ctxt);
4362 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4363 ctxt->sax->characters(ctxt->userData, chr, 1);
4364 ctxt->token = 0;
4365 ctxt->checkIndex = 0;
4366 }
/* single last byte of the final chunk: unless it starts markup ('<') or a
 * reference ('&'), deliver it directly as a one-character whitespace or
 * text event and consume it */
4367 if ((avail == 1) && (terminate)) {
4368 cur = in->cur[0];
4369 if ((cur != '<') && (cur != '&')) {
4370 if (ctxt->sax != NULL) {
4371 if (IS_BLANK(cur)) {
4372 if (ctxt->sax->ignorableWhitespace != NULL)
4373 ctxt->sax->ignorableWhitespace(
4374 ctxt->userData, &cur, 1);
4375 } else {
4376 htmlCheckParagraph(ctxt);
4377 if (ctxt->sax->characters != NULL)
4378 ctxt->sax->characters(
4379 ctxt->userData, &cur, 1);
4380 }
4381 }
4382 ctxt->token = 0;
4383 ctxt->checkIndex = 0;
4384 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004385 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004386 }
Owen Taylor3473f882001-02-23 17:55:21 +00004387 }
4388 if (avail < 2)
4389 goto done;
4390 cur = in->cur[0];
4391 next = in->cur[1];
/* snapshot of ctxt->nbChars, compared again at the end of this state to
 * detect a pass during which the counter did not move */
4392 cons = ctxt->nbChars;
4393 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4394 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4395 /*
4396 * Handle SCRIPT/STYLE separately
4397 */
4398 if ((!terminate) &&
4399 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4400 goto done;
4401 htmlParseScript(ctxt);
/* cur and next were read before htmlParseScript() ran, so this tests the
 * two bytes that were current on entry to this state */
4402 if ((cur == '<') && (next == '/')) {
4403 ctxt->instate = XML_PARSER_END_TAG;
4404 ctxt->checkIndex = 0;
4405#ifdef DEBUG_PUSH
4406 xmlGenericError(xmlGenericErrorContext,
4407 "HPP: entering END_TAG\n");
4408#endif
4409 break;
4410 }
4411 } else {
4412 /*
4413 * Sometimes DOCTYPE arrives in the middle of the document
4414 */
4415 if ((cur == '<') && (next == '!') &&
4416 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4417 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4418 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4419 (UPP(8) == 'E')) {
4420 if ((!terminate) &&
4421 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4422 goto done;
4423 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4424 ctxt->sax->error(ctxt->userData,
4425 "Misplaced DOCTYPE declaration\n");
4426 ctxt->wellFormed = 0;
4427 htmlParseDocTypeDecl(ctxt);
4428 } else if ((cur == '<') && (next == '!') &&
4429 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4430 if ((!terminate) &&
4431 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4432 goto done;
4433#ifdef DEBUG_PUSH
4434 xmlGenericError(xmlGenericErrorContext,
4435 "HPP: Parsing Comment\n");
4436#endif
4437 htmlParseComment(ctxt);
4438 ctxt->instate = XML_PARSER_CONTENT;
4439 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4440 goto done;
4441 } else if ((cur == '<') && (next == '/')) {
4442 ctxt->instate = XML_PARSER_END_TAG;
4443 ctxt->checkIndex = 0;
4444#ifdef DEBUG_PUSH
4445 xmlGenericError(xmlGenericErrorContext,
4446 "HPP: entering END_TAG\n");
4447#endif
4448 break;
4449 } else if (cur == '<') {
4450 ctxt->instate = XML_PARSER_START_TAG;
4451 ctxt->checkIndex = 0;
4452#ifdef DEBUG_PUSH
4453 xmlGenericError(xmlGenericErrorContext,
4454 "HPP: entering START_TAG\n");
4455#endif
4456 break;
4457 } else if (cur == '&') {
4458 if ((!terminate) &&
4459 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4460 goto done;
4461#ifdef DEBUG_PUSH
4462 xmlGenericError(xmlGenericErrorContext,
4463 "HPP: Parsing Reference\n");
4464#endif
4465 /* TODO: check generation of subtrees if noent !!! */
4466 htmlParseReference(ctxt);
4467 } else {
4468 /* TODO Avoid the extra copy, handle directly !!!!!! */
4469 /*
4470 * Goal of the following test is :
4471 * - minimize calls to the SAX 'character' callback
4472 * when they are mergeable
4473 */
4474 if ((ctxt->inputNr == 1) &&
4475 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4476 if ((!terminate) &&
4477 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4478 goto done;
4479 }
4480 ctxt->checkIndex = 0;
4481#ifdef DEBUG_PUSH
4482 xmlGenericError(xmlGenericErrorContext,
4483 "HPP: Parsing char data\n");
4484#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004485 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004486 }
4487 }
/* ctxt->nbChars unchanged since the snapshot: report an error when a node
 * is current, then skip one character so that the loop cannot spin on the
 * same byte */
4488 if (cons == ctxt->nbChars) {
4489 if (ctxt->node != NULL) {
4490 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4491 ctxt->sax->error(ctxt->userData,
4492 "detected an error in element content\n");
4493 ctxt->wellFormed = 0;
4494 }
4495 NEXT;
4496 break;
4497 }
4498
4499 break;
4500 }
4501 case XML_PARSER_END_TAG:
4502 if (avail < 2)
4503 goto done;
4504 if ((!terminate) &&
4505 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4506 goto done;
4507 htmlParseEndTag(ctxt);
/* an empty name stack after the end tag moves the parser to EPILOG */
4508 if (ctxt->nameNr == 0) {
4509 ctxt->instate = XML_PARSER_EPILOG;
4510 } else {
4511 ctxt->instate = XML_PARSER_CONTENT;
4512 }
4513 ctxt->checkIndex = 0;
4514#ifdef DEBUG_PUSH
4515 xmlGenericError(xmlGenericErrorContext,
4516 "HPP: entering CONTENT\n");
4517#endif
4518 break;
/* Every state from here on is reported as an internal error and reset to
 * CONTENT (START_TAG in the ATTRIBUTE_VALUE case) with checkIndex
 * cleared. */
4519 case XML_PARSER_CDATA_SECTION:
4520 xmlGenericError(xmlGenericErrorContext,
4521 "HPP: internal error, state == CDATA\n");
4522 ctxt->instate = XML_PARSER_CONTENT;
4523 ctxt->checkIndex = 0;
4524#ifdef DEBUG_PUSH
4525 xmlGenericError(xmlGenericErrorContext,
4526 "HPP: entering CONTENT\n");
4527#endif
4528 break;
4529 case XML_PARSER_DTD:
4530 xmlGenericError(xmlGenericErrorContext,
4531 "HPP: internal error, state == DTD\n");
4532 ctxt->instate = XML_PARSER_CONTENT;
4533 ctxt->checkIndex = 0;
4534#ifdef DEBUG_PUSH
4535 xmlGenericError(xmlGenericErrorContext,
4536 "HPP: entering CONTENT\n");
4537#endif
4538 break;
4539 case XML_PARSER_COMMENT:
4540 xmlGenericError(xmlGenericErrorContext,
4541 "HPP: internal error, state == COMMENT\n");
4542 ctxt->instate = XML_PARSER_CONTENT;
4543 ctxt->checkIndex = 0;
4544#ifdef DEBUG_PUSH
4545 xmlGenericError(xmlGenericErrorContext,
4546 "HPP: entering CONTENT\n");
4547#endif
4548 break;
4549 case XML_PARSER_PI:
4550 xmlGenericError(xmlGenericErrorContext,
4551 "HPP: internal error, state == PI\n");
4552 ctxt->instate = XML_PARSER_CONTENT;
4553 ctxt->checkIndex = 0;
4554#ifdef DEBUG_PUSH
4555 xmlGenericError(xmlGenericErrorContext,
4556 "HPP: entering CONTENT\n");
4557#endif
4558 break;
4559 case XML_PARSER_ENTITY_DECL:
4560 xmlGenericError(xmlGenericErrorContext,
4561 "HPP: internal error, state == ENTITY_DECL\n");
4562 ctxt->instate = XML_PARSER_CONTENT;
4563 ctxt->checkIndex = 0;
4564#ifdef DEBUG_PUSH
4565 xmlGenericError(xmlGenericErrorContext,
4566 "HPP: entering CONTENT\n");
4567#endif
4568 break;
4569 case XML_PARSER_ENTITY_VALUE:
4570 xmlGenericError(xmlGenericErrorContext,
4571 "HPP: internal error, state == ENTITY_VALUE\n");
4572 ctxt->instate = XML_PARSER_CONTENT;
4573 ctxt->checkIndex = 0;
/* NOTE: the trace below says DTD although the state set above is CONTENT */
4574#ifdef DEBUG_PUSH
4575 xmlGenericError(xmlGenericErrorContext,
4576 "HPP: entering DTD\n");
4577#endif
4578 break;
4579 case XML_PARSER_ATTRIBUTE_VALUE:
4580 xmlGenericError(xmlGenericErrorContext,
4581 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4582 ctxt->instate = XML_PARSER_START_TAG;
4583 ctxt->checkIndex = 0;
4584#ifdef DEBUG_PUSH
4585 xmlGenericError(xmlGenericErrorContext,
4586 "HPP: entering START_TAG\n");
4587#endif
4588 break;
4589 case XML_PARSER_SYSTEM_LITERAL:
4590 xmlGenericError(xmlGenericErrorContext,
4591 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4592 ctxt->instate = XML_PARSER_CONTENT;
4593 ctxt->checkIndex = 0;
4594#ifdef DEBUG_PUSH
4595 xmlGenericError(xmlGenericErrorContext,
4596 "HPP: entering CONTENT\n");
4597#endif
4598 break;
4599 case XML_PARSER_IGNORE:
4600 xmlGenericError(xmlGenericErrorContext,
4601 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4602 ctxt->instate = XML_PARSER_CONTENT;
4603 ctxt->checkIndex = 0;
4604#ifdef DEBUG_PUSH
4605 xmlGenericError(xmlGenericErrorContext,
4606 "HPP: entering CONTENT\n");
4607#endif
4608 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004609 case XML_PARSER_PUBLIC_LITERAL:
4610 xmlGenericError(xmlGenericErrorContext,
4611 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4612 ctxt->instate = XML_PARSER_CONTENT;
4613 ctxt->checkIndex = 0;
4614#ifdef DEBUG_PUSH
4615 xmlGenericError(xmlGenericErrorContext,
4616 "HPP: entering CONTENT\n");
4617#endif
4618 break;
4619
Owen Taylor3473f882001-02-23 17:55:21 +00004620 }
4621 }
4622done:
/* same end-of-input handling as at the top of the loop, for the paths that
 * arrive here with the last computed avail equal to 0 */
4623 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004624 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004625 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4626 /*
4627 * SAX: end of the document processing.
4628 */
4629 ctxt->instate = XML_PARSER_EOF;
4630 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4631 ctxt->sax->endDocument(ctxt->userData);
4632 }
4633 }
/* a built document that still has no internal subset gets the default
 * HTML 4.0 Transitional one once parsing is terminating or has reached
 * EOF/EPILOG */
4634 if ((ctxt->myDoc != NULL) &&
4635 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4636 (ctxt->instate == XML_PARSER_EPILOG))) {
4637 xmlDtdPtr dtd;
4638 dtd = xmlGetIntSubset(ctxt->myDoc);
4639 if (dtd == NULL)
4640 ctxt->myDoc->intSubset =
4641 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4642 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4643 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4644 }
4645#ifdef DEBUG_PUSH
4646 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4647#endif
4648 return(ret);
4649}
4650
4651/**
Owen Taylor3473f882001-02-23 17:55:21 +00004652 * htmlParseChunk:
4653 * @ctxt: an XML parser context
4654 * @chunk: an char array
4655 * @size: the size in byte of the chunk
4656 * @terminate: last chunk indicator
4657 *
4658 * Parse a Chunk of memory
4659 *
4660 * Returns zero if no error, the xmlParserErrors otherwise.
4661 */
4662int
4663htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4664 int terminate) {
4665 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4666 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4667 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4668 int cur = ctxt->input->cur - ctxt->input->base;
4669
4670 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4671 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4672 ctxt->input->cur = ctxt->input->base + cur;
4673#ifdef DEBUG_PUSH
4674 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4675#endif
4676
4677 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4678 htmlParseTryOrFinish(ctxt, terminate);
4679 } else if (ctxt->instate != XML_PARSER_EOF) {
4680 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4681 htmlParseTryOrFinish(ctxt, terminate);
4682 }
4683 if (terminate) {
4684 if ((ctxt->instate != XML_PARSER_EOF) &&
4685 (ctxt->instate != XML_PARSER_EPILOG) &&
4686 (ctxt->instate != XML_PARSER_MISC)) {
4687 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004688 ctxt->wellFormed = 0;
4689 }
4690 if (ctxt->instate != XML_PARSER_EOF) {
4691 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4692 ctxt->sax->endDocument(ctxt->userData);
4693 }
4694 ctxt->instate = XML_PARSER_EOF;
4695 }
4696 return((xmlParserErrors) ctxt->errNo);
4697}
4698
4699/************************************************************************
4700 * *
4701 * User entry points *
4702 * *
4703 ************************************************************************/
4704
4705/**
4706 * htmlCreatePushParserCtxt :
4707 * @sax: a SAX handler
4708 * @user_data: The user data returned on SAX callbacks
4709 * @chunk: a pointer to an array of chars
4710 * @size: number of chars in the array
4711 * @filename: an optional file name or URI
4712 * @enc: an optional encoding
4713 *
4714 * Create a parser context for using the HTML parser in push mode
4715 * To allow content encoding detection, @size should be >= 4
4716 * The value of @filename is used for fetching external entities
4717 * and error/warning reports.
4718 *
4719 * Returns the new parser context or NULL
4720 */
4721htmlParserCtxtPtr
4722htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4723 const char *chunk, int size, const char *filename,
4724 xmlCharEncoding enc) {
4725 htmlParserCtxtPtr ctxt;
4726 htmlParserInputPtr inputStream;
4727 xmlParserInputBufferPtr buf;
4728
Daniel Veillardd0463562001-10-13 09:15:48 +00004729 xmlInitParser();
4730
Owen Taylor3473f882001-02-23 17:55:21 +00004731 buf = xmlAllocParserInputBuffer(enc);
4732 if (buf == NULL) return(NULL);
4733
4734 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4735 if (ctxt == NULL) {
4736 xmlFree(buf);
4737 return(NULL);
4738 }
4739 memset(ctxt, 0, sizeof(htmlParserCtxt));
4740 htmlInitParserCtxt(ctxt);
4741 if (sax != NULL) {
4742 if (ctxt->sax != &htmlDefaultSAXHandler)
4743 xmlFree(ctxt->sax);
4744 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4745 if (ctxt->sax == NULL) {
4746 xmlFree(buf);
4747 xmlFree(ctxt);
4748 return(NULL);
4749 }
4750 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4751 if (user_data != NULL)
4752 ctxt->userData = user_data;
4753 }
4754 if (filename == NULL) {
4755 ctxt->directory = NULL;
4756 } else {
4757 ctxt->directory = xmlParserGetDirectory(filename);
4758 }
4759
4760 inputStream = htmlNewInputStream(ctxt);
4761 if (inputStream == NULL) {
4762 xmlFreeParserCtxt(ctxt);
4763 return(NULL);
4764 }
4765
4766 if (filename == NULL)
4767 inputStream->filename = NULL;
4768 else
4769 inputStream->filename = xmlMemStrdup(filename);
4770 inputStream->buf = buf;
4771 inputStream->base = inputStream->buf->buffer->content;
4772 inputStream->cur = inputStream->buf->buffer->content;
4773
4774 inputPush(ctxt, inputStream);
4775
4776 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4777 (ctxt->input->buf != NULL)) {
4778 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4779#ifdef DEBUG_PUSH
4780 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4781#endif
4782 }
4783
4784 return(ctxt);
4785}
4786
4787/**
4788 * htmlSAXParseDoc :
4789 * @cur: a pointer to an array of xmlChar
4790 * @encoding: a free form C string describing the HTML document encoding, or NULL
4791 * @sax: the SAX handler block
4792 * @userData: if using SAX, this pointer will be provided on callbacks.
4793 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004794 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4795 * to handle parse events. If sax is NULL, fallback to the default DOM
4796 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004797 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004798 * Returns the resulting document tree unless SAX is NULL or the document is
4799 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004800 */
4801
4802htmlDocPtr
4803htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4804 htmlDocPtr ret;
4805 htmlParserCtxtPtr ctxt;
4806
Daniel Veillardd0463562001-10-13 09:15:48 +00004807 xmlInitParser();
4808
Owen Taylor3473f882001-02-23 17:55:21 +00004809 if (cur == NULL) return(NULL);
4810
4811
4812 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4813 if (ctxt == NULL) return(NULL);
4814 if (sax != NULL) {
4815 ctxt->sax = sax;
4816 ctxt->userData = userData;
4817 }
4818
4819 htmlParseDocument(ctxt);
4820 ret = ctxt->myDoc;
4821 if (sax != NULL) {
4822 ctxt->sax = NULL;
4823 ctxt->userData = NULL;
4824 }
4825 htmlFreeParserCtxt(ctxt);
4826
4827 return(ret);
4828}
4829
4830/**
4831 * htmlParseDoc :
4832 * @cur: a pointer to an array of xmlChar
4833 * @encoding: a free form C string describing the HTML document encoding, or NULL
4834 *
4835 * parse an HTML in-memory document and build a tree.
4836 *
4837 * Returns the resulting document tree
4838 */
4839
4840htmlDocPtr
4841htmlParseDoc(xmlChar *cur, const char *encoding) {
4842 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4843}
4844
4845
4846/**
4847 * htmlCreateFileParserCtxt :
4848 * @filename: the filename
4849 * @encoding: a free form C string describing the HTML document encoding, or NULL
4850 *
4851 * Create a parser context for a file content.
4852 * Automatic support for ZLIB/Compress compressed document is provided
4853 * by default if found at compile-time.
4854 *
4855 * Returns the new parser context or NULL
4856 */
4857htmlParserCtxtPtr
4858htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4859{
4860 htmlParserCtxtPtr ctxt;
4861 htmlParserInputPtr inputStream;
4862 xmlParserInputBufferPtr buf;
4863 /* htmlCharEncoding enc; */
4864 xmlChar *content, *content_line = (xmlChar *) "charset=";
4865
4866 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4867 if (buf == NULL) return(NULL);
4868
4869 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4870 if (ctxt == NULL) {
4871 perror("malloc");
4872 return(NULL);
4873 }
4874 memset(ctxt, 0, sizeof(htmlParserCtxt));
4875 htmlInitParserCtxt(ctxt);
4876 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4877 if (inputStream == NULL) {
4878 perror("malloc");
4879 xmlFree(ctxt);
4880 return(NULL);
4881 }
4882 memset(inputStream, 0, sizeof(htmlParserInput));
4883
4884 inputStream->filename = xmlMemStrdup(filename);
4885 inputStream->line = 1;
4886 inputStream->col = 1;
4887 inputStream->buf = buf;
4888 inputStream->directory = NULL;
4889
4890 inputStream->base = inputStream->buf->buffer->content;
4891 inputStream->cur = inputStream->buf->buffer->content;
4892 inputStream->free = NULL;
4893
4894 inputPush(ctxt, inputStream);
4895
4896 /* set encoding */
4897 if (encoding) {
4898 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4899 if (content) {
4900 strcpy ((char *)content, (char *)content_line);
4901 strcat ((char *)content, (char *)encoding);
4902 htmlCheckEncoding (ctxt, content);
4903 xmlFree (content);
4904 }
4905 }
4906
4907 return(ctxt);
4908}
4909
4910/**
4911 * htmlSAXParseFile :
4912 * @filename: the filename
4913 * @encoding: a free form C string describing the HTML document encoding, or NULL
4914 * @sax: the SAX handler block
4915 * @userData: if using SAX, this pointer will be provided on callbacks.
4916 *
4917 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4918 * compressed document is provided by default if found at compile-time.
4919 * It use the given SAX function block to handle the parsing callback.
4920 * If sax is NULL, fallback to the default DOM tree building routines.
4921 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004922 * Returns the resulting document tree unless SAX is NULL or the document is
4923 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004924 */
4925
4926htmlDocPtr
4927htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4928 void *userData) {
4929 htmlDocPtr ret;
4930 htmlParserCtxtPtr ctxt;
4931 htmlSAXHandlerPtr oldsax = NULL;
4932
Daniel Veillardd0463562001-10-13 09:15:48 +00004933 xmlInitParser();
4934
Owen Taylor3473f882001-02-23 17:55:21 +00004935 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4936 if (ctxt == NULL) return(NULL);
4937 if (sax != NULL) {
4938 oldsax = ctxt->sax;
4939 ctxt->sax = sax;
4940 ctxt->userData = userData;
4941 }
4942
4943 htmlParseDocument(ctxt);
4944
4945 ret = ctxt->myDoc;
4946 if (sax != NULL) {
4947 ctxt->sax = oldsax;
4948 ctxt->userData = NULL;
4949 }
4950 htmlFreeParserCtxt(ctxt);
4951
4952 return(ret);
4953}
4954
4955/**
4956 * htmlParseFile :
4957 * @filename: the filename
4958 * @encoding: a free form C string describing the HTML document encoding, or NULL
4959 *
4960 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4961 * compressed document is provided by default if found at compile-time.
4962 *
4963 * Returns the resulting document tree
4964 */
4965
4966htmlDocPtr
4967htmlParseFile(const char *filename, const char *encoding) {
4968 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4969}
4970
4971/**
4972 * htmlHandleOmittedElem:
4973 * @val: int 0 or 1
4974 *
4975 * Set and return the previous value for handling HTML omitted tags.
4976 *
4977 * Returns the last value for 0 for no handling, 1 for auto insertion.
4978 */
4979
4980int
4981htmlHandleOmittedElem(int val) {
4982 int old = htmlOmittedDefaultValue;
4983
4984 htmlOmittedDefaultValue = val;
4985 return(old);
4986}
4987
4988#endif /* LIBXML_HTML_ENABLED */