blob: f263dbf66b19cc9954c6b6ae35000fb7e52725c3 [file] [log] [blame]
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
43
/*
 * Size limits of the HTML parser (the code using them is outside this
 * part of the file).
 */
#define HTML_MAX_NAMELEN                1000
#define HTML_PARSER_BIG_BUFFER_SIZE     1000
#define HTML_PARSER_BUFFER_SIZE         100

/* Uncomment to get traces on the generic error channel. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000052
Daniel Veillard56a4cb82001-03-24 17:00:36 +000053xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
54 xmlChar end, xmlChar end2, xmlChar end3);
55
56/************************************************************************
57 * *
Owen Taylor3473f882001-02-23 17:55:21 +000058 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
66#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
68 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
70 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
71 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
73 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
75 return(0); \
76 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
82scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
83 type ret; \
84 if (ctxt->name##Nr < 0) return(0); \
85 ctxt->name##Nr--; \
86 if (ctxt->name##Nr < 0) return(0); \
87 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillard56a4cb82001-03-24 17:00:36 +000096/* PUSH_AND_POP(static, xmlNodePtr, node) */
97PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000098
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt' to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they look at raw bytes and must only be used to compare against or
 * skip over ASCII:
 *
 *   CUR_PTR    the current pointer to the xmlChar to be parsed.
 *   CUR        the xmlChar at the current pointer, as an int.
 *   CURRENT    same value as CUR.
 *   RAW        -1 when ctxt->token is set, the current xmlChar otherwise.
 *   NXT(n)     the n'th next xmlChar.
 *   UPPER      the current xmlChar converted to uppercase.
 *   UPP(n)     the n'th next xmlChar converted to uppercase.
 *   SKIP(n)    skip n xmlChar, also adding n to ctxt->nbChars.
 *
 * Input management:
 *
 *   SHRINK         calls xmlParserInputShrink() on the current input.
 *   GROW           calls xmlParserInputGrow() with INPUT_CHUNK.
 *   SKIP_BLANKS    calls htmlSkipBlankChars().
 *
 * Character level macros:
 *
 *   NEXT               calls xmlNextChar() and counts one char in nbChars.
 *   NEXTL(l)           moves the current pointer forward by l, keeping
 *                      input->line / input->col up to date ('\n' starts a
 *                      new line), clears ctxt->token and counts one char.
 *   CUR_CHAR(l)        htmlCurrentChar(); the length is stored in l.
 *   CUR_SCHAR(s, l)    xmlStringCurrentChar() on s; the length is stored
 *                      in l.
 *   COPY_BUF(l,b,i,v)  appends the char v of length l to buffer b at index
 *                      i and advances i: a direct store when l is 1,
 *                      through xmlCopyChar() otherwise.
 *
 * Every expansion is a single parenthesized expression or a
 * do { } while (0) statement, and macro arguments are parenthesized, so the
 * macros are safe inside larger expressions and unbraced if/else bodies.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/************
    Reference handling that used to follow the NEXTL body, kept disabled:

    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000187 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000188 */
189
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000190static int
Owen Taylor3473f882001-02-23 17:55:21 +0000191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000269 * Assume it's a fixed length encoding (1) with
Owen Taylor3473f882001-02-23 17:55:21 +0000270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
306/**
Owen Taylor3473f882001-02-23 17:55:21 +0000307 * htmlSkipBlankChars:
308 * @ctxt: the HTML parser context
309 *
310 * skip all blanks character found at that point in the input streams.
311 *
312 * Returns the number of space chars skipped
313 */
314
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000315static int
Owen Taylor3473f882001-02-23 17:55:21 +0000316htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
317 int res = 0;
318
319 while (IS_BLANK(*(ctxt->input->cur))) {
320 if ((*ctxt->input->cur == 0) &&
321 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
322 xmlPopInput(ctxt);
323 } else {
324 if (*(ctxt->input->cur) == '\n') {
325 ctxt->input->line++; ctxt->input->col = 1;
326 } else ctxt->input->col++;
327 ctxt->input->cur++;
328 ctxt->nbChars++;
329 if (*ctxt->input->cur == 0)
330 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
331 }
332 res++;
333 }
334 return(res);
335}
336
337
338
339/************************************************************************
340 * *
341 * The list of HTML elements and their properties *
342 * *
343 ************************************************************************/
344
345/*
346 * Start Tag: 1 means the start tag can be ommited
347 * End Tag: 1 means the end tag can be ommited
348 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000349 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000350 * Depr: this element is deprecated
351 * DTD: 1 means that this element is valid only in the Loose DTD
352 * 2 means that this element is valid only in the Frameset DTD
353 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000354 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000355 */
Daniel Veillard22090732001-07-16 00:06:07 +0000356static const htmlElemDesc
357html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000358{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
359{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
360{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
361{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
362{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
363{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
364{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
365{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
366{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
367{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
368{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
369{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
370{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
371{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
372{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
373{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
374{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
375{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
376{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
377{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
378{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
379{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
380{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
381{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
382{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
383{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
384{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
385{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
386{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
387{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
388{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
389{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
390{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
391{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
392{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
393{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
394{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
399{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
400{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
401{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
402{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
403{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
404{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
405{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
406{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
407{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
408{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
409{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
410{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
411{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
412{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
413{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
414{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
415{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
416{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
417{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
418{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
419{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
420{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
421{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
422{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
423{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
424{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
425{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
426{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
427{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
428{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
429{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
430{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
431{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
432{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
433{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
434{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
435{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
436{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
437{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
438{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
439{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
440{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
441{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
442{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
443{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
444{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
445{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
446{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
447{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
448{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000449};
450
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL terminated groups, closed by one more
 * NULL. In each group the first string is the start tag being opened and
 * the strings after it are the currently open elements it closes.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL
};
510
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
524
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event attributes of HTML 4). The array is not NULL
 * terminated; its users must rely on its size.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",          /* was misspelled "onrest", so it never matched */
    "onchange",
    "onselect"
};
550
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 *
 * The entry whose name is NULL ends the table and carries the priority
 * used for every element which is not listed.
 */

typedef struct {
    const char *name;           /* element name, NULL on the last entry */
    int priority;               /* its "endtag" priority */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",    150 },
    { "td",     160 },
    { "th",     160 },
    { "tr",     170 },
    { "thead",  180 },
    { "tbody",  180 },
    { "tfoot",  180 },
    { "table",  190 },
    { "head",   200 },
    { "body",   200 },
    { "html",   220 },
    { NULL,     100 }           /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000578
/*
 * Index over htmlStartClose: each used slot points at the first string of
 * one group. Filled once by htmlInitAutoClose(); unused slots are NULL.
 */
static const char **htmlStartCloseIndex[100];
/* non-zero once htmlStartCloseIndex has been filled */
static int htmlStartCloseIndexinitialized = 0;
581
582/************************************************************************
583 * *
584 * functions to handle HTML specific data *
585 * *
586 ************************************************************************/
587
588/**
589 * htmlInitAutoClose:
590 *
591 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
592 * This is not reentrant. Call xmlInitParser() once before processing in
593 * case of use in multithreaded programs.
594 */
595void
596htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000597 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000598
599 if (htmlStartCloseIndexinitialized) return;
600
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000601 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
602 indx = 0;
603 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
604 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000605 while (htmlStartClose[i] != NULL) i++;
606 i++;
607 }
608 htmlStartCloseIndexinitialized = 1;
609}
610
611/**
612 * htmlTagLookup:
613 * @tag: The tag name in lowercase
614 *
615 * Lookup the HTML tag in the ElementTable
616 *
617 * Returns the related htmlElemDescPtr or NULL if not found.
618 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000619const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000620htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000621 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000622
623 for (i = 0; i < (sizeof(html40ElementTable) /
624 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000625 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000626 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000627 }
628 return(NULL);
629}
630
631/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000632 * htmlGetEndPriority:
633 * @name: The name of the element to look up the priority for.
634 *
635 * Return value: The "endtag" priority.
636 **/
637static int
638htmlGetEndPriority (const xmlChar *name) {
639 int i = 0;
640
641 while ((htmlEndPriority[i].name != NULL) &&
642 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
643 i++;
644
645 return(htmlEndPriority[i].priority);
646}
647
648/**
Owen Taylor3473f882001-02-23 17:55:21 +0000649 * htmlCheckAutoClose:
650 * @newtag: The new tag name
651 * @oldtag: The old tag name
652 *
653 * Checks wether the new tag is one of the registered valid tags for closing old.
654 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
655 *
656 * Returns 0 if no, 1 if yes.
657 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000658static int
Owen Taylor3473f882001-02-23 17:55:21 +0000659htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000660 int i, indx;
661 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000662
663 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
664
665 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000666 for (indx = 0; indx < 100;indx++) {
667 closed = htmlStartCloseIndex[indx];
668 if (closed == NULL) return(0);
669 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000670 }
671
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000672 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000673 i++;
674 while (htmlStartClose[i] != NULL) {
675 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
676 return(1);
677 }
678 i++;
679 }
680 return(0);
681}
682
683/**
684 * htmlAutoCloseOnClose:
685 * @ctxt: an HTML parser context
686 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000687 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000688 *
689 * The HTmL DtD allows an ending tag to implicitely close other tags.
690 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000691static void
Owen Taylor3473f882001-02-23 17:55:21 +0000692htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000693 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000694 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000695 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000696
697#ifdef DEBUG
698 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
699 for (i = 0;i < ctxt->nameNr;i++)
700 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
701#endif
702
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000703 priority = htmlGetEndPriority (newtag);
704
Owen Taylor3473f882001-02-23 17:55:21 +0000705 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000706
Owen Taylor3473f882001-02-23 17:55:21 +0000707 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000708 /*
709 * A missplaced endtagad can only close elements with lower
710 * or equal priority, so if we find an element with higher
711 * priority before we find an element with
712 * matching name, we just ignore this endtag
713 */
714 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000715 }
716 if (i < 0) return;
717
718 while (!xmlStrEqual(newtag, ctxt->name)) {
719 info = htmlTagLookup(ctxt->name);
720 if ((info == NULL) || (info->endTag == 1)) {
721#ifdef DEBUG
722 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
723#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000724 } else if (info->endTag == 3) {
725#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000726 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000727
Daniel Veillard56098d42001-04-24 12:51:09 +0000728#endif
729 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
730 ctxt->sax->error(ctxt->userData,
731 "Opening and ending tag mismatch: %s and %s\n",
732 newtag, ctxt->name);
733 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000734 }
735 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
736 ctxt->sax->endElement(ctxt->userData, ctxt->name);
737 oldname = htmlnamePop(ctxt);
738 if (oldname != NULL) {
739#ifdef DEBUG
740 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
741#endif
742 xmlFree(oldname);
743 }
744 }
745}
746
747/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000748 * htmlAutoCloseOnEnd:
749 * @ctxt: an HTML parser context
750 *
751 * Close all remaining tags at the end of the stream
752 */
753static void
754htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
755 xmlChar *oldname;
756 int i;
757
758 if (ctxt->nameNr == 0)
759 return;
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
762#endif
763
764 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
765#ifdef DEBUG
766 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
767#endif
768 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
769 ctxt->sax->endElement(ctxt->userData, ctxt->name);
770 oldname = htmlnamePop(ctxt);
771 if (oldname != NULL) {
772#ifdef DEBUG
773 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
774#endif
775 xmlFree(oldname);
776 }
777 }
778}
779
780/**
Owen Taylor3473f882001-02-23 17:55:21 +0000781 * htmlAutoClose:
782 * @ctxt: an HTML parser context
783 * @newtag: The new tag name or NULL
784 *
785 * The HTmL DtD allows a tag to implicitely close other tags.
786 * The list is kept in htmlStartClose array. This function is
787 * called when a new tag has been detected and generates the
788 * appropriates closes if possible/needed.
789 * If newtag is NULL this mean we are at the end of the resource
790 * and we should check
791 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000792static void
Owen Taylor3473f882001-02-23 17:55:21 +0000793htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
794 xmlChar *oldname;
795 while ((newtag != NULL) && (ctxt->name != NULL) &&
796 (htmlCheckAutoClose(newtag, ctxt->name))) {
797#ifdef DEBUG
798 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
799#endif
800 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
801 ctxt->sax->endElement(ctxt->userData, ctxt->name);
802 oldname = htmlnamePop(ctxt);
803 if (oldname != NULL) {
804#ifdef DEBUG
805 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
806#endif
807 xmlFree(oldname);
808 }
809 }
810 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000811 htmlAutoCloseOnEnd(ctxt);
812 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000813 }
814 while ((newtag == NULL) && (ctxt->name != NULL) &&
815 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
816 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
817 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
818#ifdef DEBUG
819 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
820#endif
821 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
822 ctxt->sax->endElement(ctxt->userData, ctxt->name);
823 oldname = htmlnamePop(ctxt);
824 if (oldname != NULL) {
825#ifdef DEBUG
826 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
827#endif
828 xmlFree(oldname);
829 }
830 }
831
832}
833
834/**
835 * htmlAutoCloseTag:
836 * @doc: the HTML document
837 * @name: The tag name
838 * @elem: the HTML element
839 *
840 * The HTmL DtD allows a tag to implicitely close other tags.
841 * The list is kept in htmlStartClose array. This function checks
842 * if the element or one of it's children would autoclose the
843 * given tag.
844 *
845 * Returns 1 if autoclose, 0 otherwise
846 */
847int
848htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
849 htmlNodePtr child;
850
851 if (elem == NULL) return(1);
852 if (xmlStrEqual(name, elem->name)) return(0);
853 if (htmlCheckAutoClose(elem->name, name)) return(1);
854 child = elem->children;
855 while (child != NULL) {
856 if (htmlAutoCloseTag(doc, name, child)) return(1);
857 child = child->next;
858 }
859 return(0);
860}
861
862/**
863 * htmlIsAutoClosed:
864 * @doc: the HTML document
865 * @elem: the HTML element
866 *
867 * The HTmL DtD allows a tag to implicitely close other tags.
868 * The list is kept in htmlStartClose array. This function checks
869 * if a tag is autoclosed by one of it's child
870 *
871 * Returns 1 if autoclosed, 0 otherwise
872 */
873int
874htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
875 htmlNodePtr child;
876
877 if (elem == NULL) return(1);
878 child = elem->children;
879 while (child != NULL) {
880 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
881 child = child->next;
882 }
883 return(0);
884}
885
886/**
887 * htmlCheckImplied:
888 * @ctxt: an HTML parser context
889 * @newtag: The new tag name
890 *
891 * The HTML DtD allows a tag to exists only implicitely
892 * called when a new tag has been detected and generates the
893 * appropriates implicit tags if missing
894 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000895static void
Owen Taylor3473f882001-02-23 17:55:21 +0000896htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
897 if (!htmlOmittedDefaultValue)
898 return;
899 if (xmlStrEqual(newtag, BAD_CAST"html"))
900 return;
901 if (ctxt->nameNr <= 0) {
902#ifdef DEBUG
903 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
904#endif
905 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
906 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
907 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
908 }
909 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
910 return;
911 if ((ctxt->nameNr <= 1) &&
912 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
913 (xmlStrEqual(newtag, BAD_CAST"style")) ||
914 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
915 (xmlStrEqual(newtag, BAD_CAST"link")) ||
916 (xmlStrEqual(newtag, BAD_CAST"title")) ||
917 (xmlStrEqual(newtag, BAD_CAST"base")))) {
918 /*
919 * dropped OBJECT ... i you put it first BODY will be
920 * assumed !
921 */
922#ifdef DEBUG
923 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
924#endif
925 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
926 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
927 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
928 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
929 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
930 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
931 int i;
932 for (i = 0;i < ctxt->nameNr;i++) {
933 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
934 return;
935 }
936 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
937 return;
938 }
939 }
940
941#ifdef DEBUG
942 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
943#endif
944 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
945 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
946 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
947 }
948}
949
950/**
951 * htmlCheckParagraph
952 * @ctxt: an HTML parser context
953 *
954 * Check whether a p element need to be implied before inserting
955 * characters in the current element.
956 *
957 * Returns 1 if a paragraph has been inserted, 0 if not and -1
958 * in case of error.
959 */
960
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000961static int
Owen Taylor3473f882001-02-23 17:55:21 +0000962htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
963 const xmlChar *tag;
964 int i;
965
966 if (ctxt == NULL)
967 return(-1);
968 tag = ctxt->name;
969 if (tag == NULL) {
970 htmlAutoClose(ctxt, BAD_CAST"p");
971 htmlCheckImplied(ctxt, BAD_CAST"p");
972 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
973 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
974 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
975 return(1);
976 }
977 if (!htmlOmittedDefaultValue)
978 return(0);
979 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
980 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
981#ifdef DEBUG
982 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
983#endif
984 htmlAutoClose(ctxt, BAD_CAST"p");
985 htmlCheckImplied(ctxt, BAD_CAST"p");
986 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
987 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
988 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
989 return(1);
990 }
991 }
992 return(0);
993}
994
995/**
996 * htmlIsScriptAttribute:
997 * @name: an attribute name
998 *
999 * Check if an attribute is of content type Script
1000 *
1001 * Returns 1 is the attribute is a script 0 otherwise
1002 */
1003int
1004htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001005 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001006
1007 if (name == NULL)
1008 return(0);
1009 /*
1010 * all script attributes start with 'on'
1011 */
1012 if ((name[0] != 'o') || (name[1] != 'n'))
1013 return(0);
1014 for (i = 0;
1015 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1016 i++) {
1017 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1018 return(1);
1019 }
1020 return(0);
1021}
1022
1023/************************************************************************
1024 * *
1025 * The list of HTML predefined entities *
1026 * *
1027 ************************************************************************/
1028
1029
Daniel Veillard22090732001-07-16 00:06:07 +00001030static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001031/*
1032 * the 4 absolute ones, plus apostrophe.
1033 */
1034{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1035{ 38, "amp", "ampersand, U+0026 ISOnum" },
1036{ 39, "apos", "single quote" },
1037{ 60, "lt", "less-than sign, U+003C ISOnum" },
1038{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1039
1040/*
1041 * A bunch still in the 128-255 range
1042 * Replacing them depend really on the charset used.
1043 */
1044{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1045{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1046{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1047{ 163, "pound","pound sign, U+00A3 ISOnum" },
1048{ 164, "curren","currency sign, U+00A4 ISOnum" },
1049{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1050{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1051{ 167, "sect", "section sign, U+00A7 ISOnum" },
1052{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1053{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1054{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1055{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1056{ 172, "not", "not sign, U+00AC ISOnum" },
1057{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1058{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1059{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1060{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1061{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1062{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1063{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1064{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1065{ 181, "micro","micro sign, U+00B5 ISOnum" },
1066{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1067{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1068{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1069{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1070{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1071{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1072{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1073{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1074{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1075{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1076{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1077{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1078{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1079{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1080{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1081{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1082{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1083{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1084{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1085{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1086{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1087{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1088{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1089{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1090{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1091{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1092{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1093{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1094{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1095{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1096{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1097{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1098{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1099{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1100{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1101{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1102{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1103{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1104{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1105{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1106{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1107{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1108{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1109{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1110{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1111{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1112{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1113{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1114{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1115{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1116{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1117{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1118{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1119{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1120{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1121{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1122{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1123{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1124{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1125{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1126{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1127{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1128{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1129{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1130{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1131{ 247, "divide","division sign, U+00F7 ISOnum" },
1132{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1133{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1134{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1135{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1136{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1137{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1138{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1139{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1140
1141{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1142{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1143{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1144{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1145{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1146
1147/*
1148 * Anything below should really be kept as entities references
1149 */
1150{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1151
1152{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1153{ 732, "tilde","small tilde, U+02DC ISOdia" },
1154
1155{ 913, "Alpha","greek capital letter alpha, U+0391" },
1156{ 914, "Beta", "greek capital letter beta, U+0392" },
1157{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1158{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1159{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1160{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1161{ 919, "Eta", "greek capital letter eta, U+0397" },
1162{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1163{ 921, "Iota", "greek capital letter iota, U+0399" },
1164{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001165{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001166{ 924, "Mu", "greek capital letter mu, U+039C" },
1167{ 925, "Nu", "greek capital letter nu, U+039D" },
1168{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1169{ 927, "Omicron","greek capital letter omicron, U+039F" },
1170{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1171{ 929, "Rho", "greek capital letter rho, U+03A1" },
1172{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1173{ 932, "Tau", "greek capital letter tau, U+03A4" },
1174{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1175{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1176{ 935, "Chi", "greek capital letter chi, U+03A7" },
1177{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1178{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1179
1180{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1181{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1182{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1183{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1184{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1185{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1186{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1187{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1188{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1189{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1190{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1191{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1192{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1193{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1194{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1195{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1196{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1197{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1198{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1199{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1200{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1201{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1202{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1203{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1204{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1205{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1206{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1207{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1208
1209{ 8194, "ensp", "en space, U+2002 ISOpub" },
1210{ 8195, "emsp", "em space, U+2003 ISOpub" },
1211{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1212{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1213{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1214{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1215{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1216{ 8211, "ndash","en dash, U+2013 ISOpub" },
1217{ 8212, "mdash","em dash, U+2014 ISOpub" },
1218{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1219{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1220{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1221{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1222{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1223{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1224{ 8224, "dagger","dagger, U+2020 ISOpub" },
1225{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1226
1227{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1228{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1229
1230{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1231
1232{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1233{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1234
1235{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1236{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1237
1238{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1239{ 8260, "frasl","fraction slash, U+2044 NEW" },
1240
1241{ 8364, "euro", "euro sign, U+20AC NEW" },
1242
1243{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1244{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1245{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1246{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1247{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1248{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1249{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1250{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1251{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1252{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1253{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1254{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1255{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1256{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1257{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1258{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1259
1260{ 8704, "forall","for all, U+2200 ISOtech" },
1261{ 8706, "part", "partial differential, U+2202 ISOtech" },
1262{ 8707, "exist","there exists, U+2203 ISOtech" },
1263{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1264{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1265{ 8712, "isin", "element of, U+2208 ISOtech" },
1266{ 8713, "notin","not an element of, U+2209 ISOtech" },
1267{ 8715, "ni", "contains as member, U+220B ISOtech" },
1268{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1269{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1270{ 8722, "minus","minus sign, U+2212 ISOtech" },
1271{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1272{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1273{ 8733, "prop", "proportional to, U+221D ISOtech" },
1274{ 8734, "infin","infinity, U+221E ISOtech" },
1275{ 8736, "ang", "angle, U+2220 ISOamso" },
1276{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1277{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1278{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1279{ 8746, "cup", "union = cup, U+222A ISOtech" },
1280{ 8747, "int", "integral, U+222B ISOtech" },
1281{ 8756, "there4","therefore, U+2234 ISOtech" },
1282{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1283{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1284{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1285{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1286{ 8801, "equiv","identical to, U+2261 ISOtech" },
1287{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1288{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1289{ 8834, "sub", "subset of, U+2282 ISOtech" },
1290{ 8835, "sup", "superset of, U+2283 ISOtech" },
1291{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1292{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1293{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1294{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1295{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1296{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1297{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1298{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1299{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1300{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1301{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1302{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1303{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1304{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1305
1306{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1307{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1308{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1309{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1310
1311};
1312
1313/************************************************************************
1314 * *
1315 * Commodity functions to handle entities *
1316 * *
1317 ************************************************************************/
1318
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> accordingly. It must be
 * expanded inside a function returning a pointer: when the reallocation
 * fails, the old block is released, <buffer> is reset to NULL and the
 * enclosing function returns NULL.
 *
 * The result of xmlRealloc() goes through a temporary so that the old
 * block is still reachable, and can be freed, on failure.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,                     \
                                 buffer##_size * sizeof(xmlChar));      \
    if (growBuffer_tmp == NULL) {                                       \
        perror("realloc failed");                                       \
        if (buffer != NULL)                                             \
            xmlFree(buffer);                                            \
        buffer = NULL;                                                  \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
1330
1331/**
1332 * htmlEntityLookup:
1333 * @name: the entity name
1334 *
1335 * Lookup the given entity in EntitiesTable
1336 *
1337 * TODO: the linear scan is really ugly, an hash table is really needed.
1338 *
1339 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1340 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001341const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001342htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001343 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001344
1345 for (i = 0;i < (sizeof(html40EntitiesTable)/
1346 sizeof(html40EntitiesTable[0]));i++) {
1347 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1348#ifdef DEBUG
1349 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1350#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001351 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001352 }
1353 }
1354 return(NULL);
1355}
1356
1357/**
1358 * htmlEntityValueLookup:
1359 * @value: the entity's unicode value
1360 *
1361 * Lookup the given entity in EntitiesTable
1362 *
1363 * TODO: the linear scan is really ugly, an hash table is really needed.
1364 *
1365 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1366 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001367const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001368htmlEntityValueLookup(unsigned int value) {
1369 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001370#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001371 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001372#endif
1373
1374 for (i = 0;i < (sizeof(html40EntitiesTable)/
1375 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001376 if (html40EntitiesTable[i].value >= value) {
1377 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001378 break;
1379#ifdef DEBUG
1380 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1381#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001382 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001383 }
1384#ifdef DEBUG
1385 if (lv > html40EntitiesTable[i].value) {
1386 xmlGenericError(xmlGenericErrorContext,
1387 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1388 lv, html40EntitiesTable[i].value);
1389 }
1390 lv = html40EntitiesTable[i].value;
1391#endif
1392 }
1393 return(NULL);
1394}
1395
1396/**
1397 * UTF8ToHtml:
1398 * @out: a pointer to an array of bytes to store the result
1399 * @outlen: the length of @out
1400 * @in: a pointer to an array of UTF-8 chars
1401 * @inlen: the length of @in
1402 *
1403 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1404 * plus HTML entities block of chars out.
1405 *
1406 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1407 * The value of @inlen after return is the number of octets consumed
1408 * as the return value is positive, else unpredictiable.
1409 * The value of @outlen after return is the number of octets consumed.
1410 */
1411int
1412UTF8ToHtml(unsigned char* out, int *outlen,
1413 const unsigned char* in, int *inlen) {
1414 const unsigned char* processed = in;
1415 const unsigned char* outend;
1416 const unsigned char* outstart = out;
1417 const unsigned char* instart = in;
1418 const unsigned char* inend;
1419 unsigned int c, d;
1420 int trailing;
1421
1422 if (in == NULL) {
1423 /*
1424 * initialization nothing to do
1425 */
1426 *outlen = 0;
1427 *inlen = 0;
1428 return(0);
1429 }
1430 inend = in + (*inlen);
1431 outend = out + (*outlen);
1432 while (in < inend) {
1433 d = *in++;
1434 if (d < 0x80) { c= d; trailing= 0; }
1435 else if (d < 0xC0) {
1436 /* trailing byte in leading position */
1437 *outlen = out - outstart;
1438 *inlen = processed - instart;
1439 return(-2);
1440 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1441 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1442 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1443 else {
1444 /* no chance for this in Ascii */
1445 *outlen = out - outstart;
1446 *inlen = processed - instart;
1447 return(-2);
1448 }
1449
1450 if (inend - in < trailing) {
1451 break;
1452 }
1453
1454 for ( ; trailing; trailing--) {
1455 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1456 break;
1457 c <<= 6;
1458 c |= d & 0x3F;
1459 }
1460
1461 /* assertion: c is a single UTF-4 value */
1462 if (c < 0x80) {
1463 if (out + 1 >= outend)
1464 break;
1465 *out++ = c;
1466 } else {
1467 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001468 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001469
1470 /*
1471 * Try to lookup a predefined HTML entity for it
1472 */
1473
1474 ent = htmlEntityValueLookup(c);
1475 if (ent == NULL) {
1476 /* no chance for this in Ascii */
1477 *outlen = out - outstart;
1478 *inlen = processed - instart;
1479 return(-2);
1480 }
1481 len = strlen(ent->name);
1482 if (out + 2 + len >= outend)
1483 break;
1484 *out++ = '&';
1485 memcpy(out, ent->name, len);
1486 out += len;
1487 *out++ = ';';
1488 }
1489 processed = in;
1490 }
1491 *outlen = out - outstart;
1492 *inlen = processed - instart;
1493 return(0);
1494}
1495
1496/**
1497 * htmlEncodeEntities:
1498 * @out: a pointer to an array of bytes to store the result
1499 * @outlen: the length of @out
1500 * @in: a pointer to an array of UTF-8 chars
1501 * @inlen: the length of @in
1502 * @quoteChar: the quote character to escape (' or ") or zero.
1503 *
1504 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1505 * plus HTML entities block of chars out.
1506 *
1507 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1508 * The value of @inlen after return is the number of octets consumed
1509 * as the return value is positive, else unpredictiable.
1510 * The value of @outlen after return is the number of octets consumed.
1511 */
1512int
1513htmlEncodeEntities(unsigned char* out, int *outlen,
1514 const unsigned char* in, int *inlen, int quoteChar) {
1515 const unsigned char* processed = in;
1516 const unsigned char* outend = out + (*outlen);
1517 const unsigned char* outstart = out;
1518 const unsigned char* instart = in;
1519 const unsigned char* inend = in + (*inlen);
1520 unsigned int c, d;
1521 int trailing;
1522
1523 while (in < inend) {
1524 d = *in++;
1525 if (d < 0x80) { c= d; trailing= 0; }
1526 else if (d < 0xC0) {
1527 /* trailing byte in leading position */
1528 *outlen = out - outstart;
1529 *inlen = processed - instart;
1530 return(-2);
1531 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1532 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1533 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1534 else {
1535 /* no chance for this in Ascii */
1536 *outlen = out - outstart;
1537 *inlen = processed - instart;
1538 return(-2);
1539 }
1540
1541 if (inend - in < trailing)
1542 break;
1543
1544 while (trailing--) {
1545 if (((d= *in++) & 0xC0) != 0x80) {
1546 *outlen = out - outstart;
1547 *inlen = processed - instart;
1548 return(-2);
1549 }
1550 c <<= 6;
1551 c |= d & 0x3F;
1552 }
1553
1554 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001555 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1556 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001557 if (out >= outend)
1558 break;
1559 *out++ = c;
1560 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001561 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001562 const char *cp;
1563 char nbuf[16];
1564 int len;
1565
1566 /*
1567 * Try to lookup a predefined HTML entity for it
1568 */
1569 ent = htmlEntityValueLookup(c);
1570 if (ent == NULL) {
1571 sprintf(nbuf, "#%u", c);
1572 cp = nbuf;
1573 }
1574 else
1575 cp = ent->name;
1576 len = strlen(cp);
1577 if (out + 2 + len > outend)
1578 break;
1579 *out++ = '&';
1580 memcpy(out, cp, len);
1581 out += len;
1582 *out++ = ';';
1583 }
1584 processed = in;
1585 }
1586 *outlen = out - outstart;
1587 *inlen = processed - instart;
1588 return(0);
1589}
1590
1591/**
1592 * htmlDecodeEntities:
1593 * @ctxt: the parser context
1594 * @len: the len to decode (in bytes !), -1 for no size limit
1595 * @end: an end marker xmlChar, 0 if none
1596 * @end2: an end marker xmlChar, 0 if none
1597 * @end3: an end marker xmlChar, 0 if none
1598 *
1599 * Subtitute the HTML entities by their value
1600 *
1601 * DEPRECATED !!!!
1602 *
1603 * Returns A newly allocated string with the substitution done. The caller
1604 * must deallocate it !
1605 */
1606xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001607htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1608 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001609 static int deprecated = 0;
1610 if (!deprecated) {
1611 xmlGenericError(xmlGenericErrorContext,
1612 "htmlDecodeEntities() deprecated function reached\n");
1613 deprecated = 1;
1614 }
1615 return(NULL);
1616#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001617 xmlChar *name = NULL;
1618 xmlChar *buffer = NULL;
1619 unsigned int buffer_size = 0;
1620 unsigned int nbchars = 0;
1621 htmlEntityDescPtr ent;
1622 unsigned int max = (unsigned int) len;
1623 int c,l;
1624
1625 if (ctxt->depth > 40) {
1626 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1627 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1628 ctxt->sax->error(ctxt->userData,
1629 "Detected entity reference loop\n");
1630 ctxt->wellFormed = 0;
1631 ctxt->disableSAX = 1;
1632 return(NULL);
1633 }
1634
1635 /*
1636 * allocate a translation buffer.
1637 */
1638 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1639 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1640 if (buffer == NULL) {
1641 perror("xmlDecodeEntities: malloc failed");
1642 return(NULL);
1643 }
1644
1645 /*
1646 * Ok loop until we reach one of the ending char or a size limit.
1647 */
1648 c = CUR_CHAR(l);
1649 while ((nbchars < max) && (c != end) &&
1650 (c != end2) && (c != end3)) {
1651
1652 if (c == 0) break;
1653 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1654 int val = htmlParseCharRef(ctxt);
1655 COPY_BUF(0,buffer,nbchars,val);
1656 NEXTL(l);
1657 } else if ((c == '&') && (ctxt->token != '&')) {
1658 ent = htmlParseEntityRef(ctxt, &name);
1659 if (name != NULL) {
1660 if (ent != NULL) {
1661 int val = ent->value;
1662 COPY_BUF(0,buffer,nbchars,val);
1663 NEXTL(l);
1664 } else {
1665 const xmlChar *cur = name;
1666
1667 buffer[nbchars++] = '&';
1668 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1669 growBuffer(buffer);
1670 }
1671 while (*cur != 0) {
1672 buffer[nbchars++] = *cur++;
1673 }
1674 buffer[nbchars++] = ';';
1675 }
1676 }
1677 } else {
1678 COPY_BUF(l,buffer,nbchars,c);
1679 NEXTL(l);
1680 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1681 growBuffer(buffer);
1682 }
1683 }
1684 c = CUR_CHAR(l);
1685 }
1686 buffer[nbchars++] = 0;
1687 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001688#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001689}
1690
1691/************************************************************************
1692 * *
1693 * Commodity functions to handle streams *
1694 * *
1695 ************************************************************************/
1696
1697/**
Owen Taylor3473f882001-02-23 17:55:21 +00001698 * htmlNewInputStream:
1699 * @ctxt: an HTML parser context
1700 *
1701 * Create a new input stream structure
1702 * Returns the new input stream or NULL
1703 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001704static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001705htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1706 htmlParserInputPtr input;
1707
1708 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1709 if (input == NULL) {
1710 ctxt->errNo = XML_ERR_NO_MEMORY;
1711 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1712 ctxt->sax->error(ctxt->userData,
1713 "malloc: couldn't allocate a new input stream\n");
1714 return(NULL);
1715 }
1716 memset(input, 0, sizeof(htmlParserInput));
1717 input->filename = NULL;
1718 input->directory = NULL;
1719 input->base = NULL;
1720 input->cur = NULL;
1721 input->buf = NULL;
1722 input->line = 1;
1723 input->col = 1;
1724 input->buf = NULL;
1725 input->free = NULL;
1726 input->version = NULL;
1727 input->consumed = 0;
1728 input->length = 0;
1729 return(input);
1730}
1731
1732
1733/************************************************************************
1734 * *
1735 * Commodity functions, cleanup needed ? *
1736 * *
1737 ************************************************************************/
1738
1739/**
1740 * areBlanks:
1741 * @ctxt: an HTML parser context
1742 * @str: a xmlChar *
1743 * @len: the size of @str
1744 *
1745 * Is this a sequence of blank chars that one can ignore ?
1746 *
1747 * Returns 1 if ignorable 0 otherwise.
1748 */
1749
1750static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1751 int i;
1752 xmlNodePtr lastChild;
1753
1754 for (i = 0;i < len;i++)
1755 if (!(IS_BLANK(str[i]))) return(0);
1756
1757 if (CUR == 0) return(1);
1758 if (CUR != '<') return(0);
1759 if (ctxt->name == NULL)
1760 return(1);
1761 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1762 return(1);
1763 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1764 return(1);
1765 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1766 return(1);
1767 if (ctxt->node == NULL) return(0);
1768 lastChild = xmlGetLastChild(ctxt->node);
1769 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001770 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1771 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001772 } else if (xmlNodeIsText(lastChild)) {
1773 return(0);
1774 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1775 return(0);
1776 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1777 return(0);
1778 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1779 return(0);
1780 }
1781 return(1);
1782}
1783
1784/**
Owen Taylor3473f882001-02-23 17:55:21 +00001785 * htmlNewDocNoDtD:
1786 * @URI: URI for the dtd, or NULL
1787 * @ExternalID: the external ID of the DTD, or NULL
1788 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001789 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1790 * are NULL
1791 *
Owen Taylor3473f882001-02-23 17:55:21 +00001792 * Returns a new document, do not intialize the DTD if not provided
1793 */
1794htmlDocPtr
1795htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1796 xmlDocPtr cur;
1797
1798 /*
1799 * Allocate a new document and fill the fields.
1800 */
1801 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1802 if (cur == NULL) {
1803 xmlGenericError(xmlGenericErrorContext,
1804 "xmlNewDoc : malloc failed\n");
1805 return(NULL);
1806 }
1807 memset(cur, 0, sizeof(xmlDoc));
1808
1809 cur->type = XML_HTML_DOCUMENT_NODE;
1810 cur->version = NULL;
1811 cur->intSubset = NULL;
1812 if ((ExternalID != NULL) ||
1813 (URI != NULL))
1814 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1815 cur->doc = cur;
1816 cur->name = NULL;
1817 cur->children = NULL;
1818 cur->extSubset = NULL;
1819 cur->oldNs = NULL;
1820 cur->encoding = NULL;
1821 cur->standalone = 1;
1822 cur->compression = 0;
1823 cur->ids = NULL;
1824 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001825 cur->_private = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001826 return(cur);
1827}
1828
1829/**
1830 * htmlNewDoc:
1831 * @URI: URI for the dtd, or NULL
1832 * @ExternalID: the external ID of the DTD, or NULL
1833 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001834 * Creates a new HTML document
1835 *
Owen Taylor3473f882001-02-23 17:55:21 +00001836 * Returns a new document
1837 */
1838htmlDocPtr
1839htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1840 if ((URI == NULL) && (ExternalID == NULL))
1841 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001842 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1843 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001844
1845 return(htmlNewDocNoDtD(URI, ExternalID));
1846}
1847
1848
1849/************************************************************************
1850 * *
1851 * The parser itself *
1852 * Relates to http://www.w3.org/TR/html40 *
1853 * *
1854 ************************************************************************/
1855
1856/************************************************************************
1857 * *
1858 * The parser itself *
1859 * *
1860 ************************************************************************/
1861
1862/**
1863 * htmlParseHTMLName:
1864 * @ctxt: an HTML parser context
1865 *
1866 * parse an HTML tag or attribute name, note that we convert it to lowercase
1867 * since HTML names are not case-sensitive.
1868 *
1869 * Returns the Tag Name parsed or NULL
1870 */
1871
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001872static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001873htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1874 xmlChar *ret = NULL;
1875 int i = 0;
1876 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1877
1878 if (!IS_LETTER(CUR) && (CUR != '_') &&
1879 (CUR != ':')) return(NULL);
1880
1881 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1882 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1883 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1884 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1885 else loc[i] = CUR;
1886 i++;
1887
1888 NEXT;
1889 }
1890
1891 ret = xmlStrndup(loc, i);
1892
1893 return(ret);
1894}
1895
1896/**
1897 * htmlParseName:
1898 * @ctxt: an HTML parser context
1899 *
1900 * parse an HTML name, this routine is case sensistive.
1901 *
1902 * Returns the Name parsed or NULL
1903 */
1904
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001905static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001906htmlParseName(htmlParserCtxtPtr ctxt) {
1907 xmlChar buf[HTML_MAX_NAMELEN];
1908 int len = 0;
1909
1910 GROW;
1911 if (!IS_LETTER(CUR) && (CUR != '_')) {
1912 return(NULL);
1913 }
1914
1915 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1916 (CUR == '.') || (CUR == '-') ||
1917 (CUR == '_') || (CUR == ':') ||
1918 (IS_COMBINING(CUR)) ||
1919 (IS_EXTENDER(CUR))) {
1920 buf[len++] = CUR;
1921 NEXT;
1922 if (len >= HTML_MAX_NAMELEN) {
1923 xmlGenericError(xmlGenericErrorContext,
1924 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1925 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1926 (CUR == '.') || (CUR == '-') ||
1927 (CUR == '_') || (CUR == ':') ||
1928 (IS_COMBINING(CUR)) ||
1929 (IS_EXTENDER(CUR)))
1930 NEXT;
1931 break;
1932 }
1933 }
1934 return(xmlStrndup(buf, len));
1935}
1936
1937/**
1938 * htmlParseHTMLAttribute:
1939 * @ctxt: an HTML parser context
1940 * @stop: a char stop value
1941 *
1942 * parse an HTML attribute value till the stop (quote), if
1943 * stop is 0 then it stops at the first space
1944 *
1945 * Returns the attribute parsed or NULL
1946 */
1947
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001948static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001949htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1950 xmlChar *buffer = NULL;
1951 int buffer_size = 0;
1952 xmlChar *out = NULL;
1953 xmlChar *name = NULL;
1954
1955 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001956 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001957
1958 /*
1959 * allocate a translation buffer.
1960 */
1961 buffer_size = HTML_PARSER_BUFFER_SIZE;
1962 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1963 if (buffer == NULL) {
1964 perror("htmlParseHTMLAttribute: malloc failed");
1965 return(NULL);
1966 }
1967 out = buffer;
1968
1969 /*
1970 * Ok loop until we reach one of the ending chars
1971 */
1972 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1973 if ((stop == 0) && (IS_BLANK(CUR))) break;
1974 if (CUR == '&') {
1975 if (NXT(1) == '#') {
1976 unsigned int c;
1977 int bits;
1978
1979 c = htmlParseCharRef(ctxt);
1980 if (c < 0x80)
1981 { *out++ = c; bits= -6; }
1982 else if (c < 0x800)
1983 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1984 else if (c < 0x10000)
1985 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1986 else
1987 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1988
1989 for ( ; bits >= 0; bits-= 6) {
1990 *out++ = ((c >> bits) & 0x3F) | 0x80;
1991 }
1992 } else {
1993 ent = htmlParseEntityRef(ctxt, &name);
1994 if (name == NULL) {
1995 *out++ = '&';
1996 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001997 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001998
1999 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002000 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002001 }
2002 } else if (ent == NULL) {
2003 *out++ = '&';
2004 cur = name;
2005 while (*cur != 0) {
2006 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002007 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002008
2009 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002010 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002011 }
2012 *out++ = *cur++;
2013 }
2014 xmlFree(name);
2015 } else {
2016 unsigned int c;
2017 int bits;
2018
2019 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002020 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002021
2022 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002023 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002024 }
2025 c = (xmlChar)ent->value;
2026 if (c < 0x80)
2027 { *out++ = c; bits= -6; }
2028 else if (c < 0x800)
2029 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2030 else if (c < 0x10000)
2031 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2032 else
2033 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2034
2035 for ( ; bits >= 0; bits-= 6) {
2036 *out++ = ((c >> bits) & 0x3F) | 0x80;
2037 }
2038 xmlFree(name);
2039 }
2040 }
2041 } else {
2042 unsigned int c;
2043 int bits, l;
2044
2045 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002046 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002047
2048 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002049 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002050 }
2051 c = CUR_CHAR(l);
2052 if (c < 0x80)
2053 { *out++ = c; bits= -6; }
2054 else if (c < 0x800)
2055 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2056 else if (c < 0x10000)
2057 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2058 else
2059 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2060
2061 for ( ; bits >= 0; bits-= 6) {
2062 *out++ = ((c >> bits) & 0x3F) | 0x80;
2063 }
2064 NEXT;
2065 }
2066 }
2067 *out++ = 0;
2068 return(buffer);
2069}
2070
2071/**
Owen Taylor3473f882001-02-23 17:55:21 +00002072 * htmlParseEntityRef:
2073 * @ctxt: an HTML parser context
2074 * @str: location to store the entity name
2075 *
2076 * parse an HTML ENTITY references
2077 *
2078 * [68] EntityRef ::= '&' Name ';'
2079 *
2080 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2081 * if non-NULL *str will have to be freed by the caller.
2082 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002083const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002084htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2085 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002086 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002087 *str = NULL;
2088
2089 if (CUR == '&') {
2090 NEXT;
2091 name = htmlParseName(ctxt);
2092 if (name == NULL) {
2093 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2094 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2095 ctxt->wellFormed = 0;
2096 } else {
2097 GROW;
2098 if (CUR == ';') {
2099 *str = name;
2100
2101 /*
2102 * Lookup the entity in the table.
2103 */
2104 ent = htmlEntityLookup(name);
2105 if (ent != NULL) /* OK that's ugly !!! */
2106 NEXT;
2107 } else {
2108 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2109 ctxt->sax->error(ctxt->userData,
2110 "htmlParseEntityRef: expecting ';'\n");
2111 *str = name;
2112 }
2113 }
2114 }
2115 return(ent);
2116}
2117
2118/**
2119 * htmlParseAttValue:
2120 * @ctxt: an HTML parser context
2121 *
2122 * parse a value for an attribute
2123 * Note: the parser won't do substitution of entities here, this
2124 * will be handled later in xmlStringGetNodeList, unless it was
2125 * asked for ctxt->replaceEntities != 0
2126 *
2127 * Returns the AttValue parsed or NULL.
2128 */
2129
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002130static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002131htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2132 xmlChar *ret = NULL;
2133
2134 if (CUR == '"') {
2135 NEXT;
2136 ret = htmlParseHTMLAttribute(ctxt, '"');
2137 if (CUR != '"') {
2138 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2139 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2140 ctxt->wellFormed = 0;
2141 } else
2142 NEXT;
2143 } else if (CUR == '\'') {
2144 NEXT;
2145 ret = htmlParseHTMLAttribute(ctxt, '\'');
2146 if (CUR != '\'') {
2147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2148 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2149 ctxt->wellFormed = 0;
2150 } else
2151 NEXT;
2152 } else {
2153 /*
2154 * That's an HTMLism, the attribute value may not be quoted
2155 */
2156 ret = htmlParseHTMLAttribute(ctxt, 0);
2157 if (ret == NULL) {
2158 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2159 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2160 ctxt->wellFormed = 0;
2161 }
2162 }
2163 return(ret);
2164}
2165
2166/**
2167 * htmlParseSystemLiteral:
2168 * @ctxt: an HTML parser context
2169 *
2170 * parse an HTML Literal
2171 *
2172 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2173 *
2174 * Returns the SystemLiteral parsed or NULL
2175 */
2176
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002177static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002178htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2179 const xmlChar *q;
2180 xmlChar *ret = NULL;
2181
2182 if (CUR == '"') {
2183 NEXT;
2184 q = CUR_PTR;
2185 while ((IS_CHAR(CUR)) && (CUR != '"'))
2186 NEXT;
2187 if (!IS_CHAR(CUR)) {
2188 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2189 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2190 ctxt->wellFormed = 0;
2191 } else {
2192 ret = xmlStrndup(q, CUR_PTR - q);
2193 NEXT;
2194 }
2195 } else if (CUR == '\'') {
2196 NEXT;
2197 q = CUR_PTR;
2198 while ((IS_CHAR(CUR)) && (CUR != '\''))
2199 NEXT;
2200 if (!IS_CHAR(CUR)) {
2201 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2202 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2203 ctxt->wellFormed = 0;
2204 } else {
2205 ret = xmlStrndup(q, CUR_PTR - q);
2206 NEXT;
2207 }
2208 } else {
2209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2210 ctxt->sax->error(ctxt->userData,
2211 "SystemLiteral \" or ' expected\n");
2212 ctxt->wellFormed = 0;
2213 }
2214
2215 return(ret);
2216}
2217
2218/**
2219 * htmlParsePubidLiteral:
2220 * @ctxt: an HTML parser context
2221 *
2222 * parse an HTML public literal
2223 *
2224 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2225 *
2226 * Returns the PubidLiteral parsed or NULL.
2227 */
2228
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002229static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002230htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2231 const xmlChar *q;
2232 xmlChar *ret = NULL;
2233 /*
2234 * Name ::= (Letter | '_') (NameChar)*
2235 */
2236 if (CUR == '"') {
2237 NEXT;
2238 q = CUR_PTR;
2239 while (IS_PUBIDCHAR(CUR)) NEXT;
2240 if (CUR != '"') {
2241 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2242 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2243 ctxt->wellFormed = 0;
2244 } else {
2245 ret = xmlStrndup(q, CUR_PTR - q);
2246 NEXT;
2247 }
2248 } else if (CUR == '\'') {
2249 NEXT;
2250 q = CUR_PTR;
2251 while ((IS_LETTER(CUR)) && (CUR != '\''))
2252 NEXT;
2253 if (!IS_LETTER(CUR)) {
2254 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2255 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2256 ctxt->wellFormed = 0;
2257 } else {
2258 ret = xmlStrndup(q, CUR_PTR - q);
2259 NEXT;
2260 }
2261 } else {
2262 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2263 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2264 ctxt->wellFormed = 0;
2265 }
2266
2267 return(ret);
2268}
2269
2270/**
2271 * htmlParseScript:
2272 * @ctxt: an HTML parser context
2273 *
2274 * parse the content of an HTML SCRIPT or STYLE element
2275 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2276 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2277 * http://www.w3.org/TR/html4/types.html#type-script
2278 * http://www.w3.org/TR/html4/types.html#h-6.15
2279 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2280 *
2281 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2282 * element and the value of intrinsic event attributes. User agents must
2283 * not evaluate script data as HTML markup but instead must pass it on as
2284 * data to a script engine.
2285 * NOTES:
2286 * - The content is passed like CDATA
2287 * - the attributes for style and scripting "onXXX" are also described
2288 * as CDATA but SGML allows entities references in attributes so their
2289 * processing is identical as other attributes
2290 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002291static void
Owen Taylor3473f882001-02-23 17:55:21 +00002292htmlParseScript(htmlParserCtxtPtr ctxt) {
2293 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2294 int nbchar = 0;
2295 xmlChar cur;
2296
2297 SHRINK;
2298 cur = CUR;
2299 while (IS_CHAR(cur)) {
2300 if ((cur == '<') && (NXT(1) == '/')) {
2301 /*
2302 * One should break here, the specification is clear:
2303 * Authors should therefore escape "</" within the content.
2304 * Escape mechanisms are specific to each scripting or
2305 * style sheet language.
2306 */
2307 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2308 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2309 break; /* while */
2310 }
2311 buf[nbchar++] = cur;
2312 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2313 if (ctxt->sax->cdataBlock!= NULL) {
2314 /*
2315 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2316 */
2317 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2318 }
2319 nbchar = 0;
2320 }
2321 NEXT;
2322 cur = CUR;
2323 }
2324 if (!(IS_CHAR(cur))) {
2325 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2326 ctxt->sax->error(ctxt->userData,
2327 "Invalid char in CDATA 0x%X\n", cur);
2328 ctxt->wellFormed = 0;
2329 NEXT;
2330 }
2331
2332 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2333 if (ctxt->sax->cdataBlock!= NULL) {
2334 /*
2335 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2336 */
2337 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2338 }
2339 }
2340}
2341
2342
2343/**
2344 * htmlParseCharData:
2345 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002346 *
2347 * parse a CharData section.
2348 * if we are within a CDATA section ']]>' marks an end of section.
2349 *
2350 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2351 */
2352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353static void
2354htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002355 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2356 int nbchar = 0;
2357 int cur, l;
2358
2359 SHRINK;
2360 cur = CUR_CHAR(l);
2361 while (((cur != '<') || (ctxt->token == '<')) &&
2362 ((cur != '&') || (ctxt->token == '&')) &&
2363 (IS_CHAR(cur))) {
2364 COPY_BUF(l,buf,nbchar,cur);
2365 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2366 /*
2367 * Ok the segment is to be consumed as chars.
2368 */
2369 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2370 if (areBlanks(ctxt, buf, nbchar)) {
2371 if (ctxt->sax->ignorableWhitespace != NULL)
2372 ctxt->sax->ignorableWhitespace(ctxt->userData,
2373 buf, nbchar);
2374 } else {
2375 htmlCheckParagraph(ctxt);
2376 if (ctxt->sax->characters != NULL)
2377 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2378 }
2379 }
2380 nbchar = 0;
2381 }
2382 NEXTL(l);
2383 cur = CUR_CHAR(l);
2384 }
2385 if (nbchar != 0) {
2386 /*
2387 * Ok the segment is to be consumed as chars.
2388 */
2389 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2390 if (areBlanks(ctxt, buf, nbchar)) {
2391 if (ctxt->sax->ignorableWhitespace != NULL)
2392 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2393 } else {
2394 htmlCheckParagraph(ctxt);
2395 if (ctxt->sax->characters != NULL)
2396 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2397 }
2398 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002399 } else {
2400 /*
2401 * Loop detection
2402 */
2403 if (cur == 0)
2404 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002405 }
2406}
2407
2408/**
2409 * htmlParseExternalID:
2410 * @ctxt: an HTML parser context
2411 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002412 *
2413 * Parse an External ID or a Public ID
2414 *
Owen Taylor3473f882001-02-23 17:55:21 +00002415 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2416 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2417 *
2418 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2419 *
2420 * Returns the function returns SystemLiteral and in the second
2421 * case publicID receives PubidLiteral, is strict is off
2422 * it is possible to return NULL and have publicID set.
2423 */
2424
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002425static xmlChar *
2426htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002427 xmlChar *URI = NULL;
2428
2429 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2430 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2431 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2432 SKIP(6);
2433 if (!IS_BLANK(CUR)) {
2434 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2435 ctxt->sax->error(ctxt->userData,
2436 "Space required after 'SYSTEM'\n");
2437 ctxt->wellFormed = 0;
2438 }
2439 SKIP_BLANKS;
2440 URI = htmlParseSystemLiteral(ctxt);
2441 if (URI == NULL) {
2442 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2443 ctxt->sax->error(ctxt->userData,
2444 "htmlParseExternalID: SYSTEM, no URI\n");
2445 ctxt->wellFormed = 0;
2446 }
2447 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2448 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2449 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2450 SKIP(6);
2451 if (!IS_BLANK(CUR)) {
2452 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2453 ctxt->sax->error(ctxt->userData,
2454 "Space required after 'PUBLIC'\n");
2455 ctxt->wellFormed = 0;
2456 }
2457 SKIP_BLANKS;
2458 *publicID = htmlParsePubidLiteral(ctxt);
2459 if (*publicID == NULL) {
2460 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2461 ctxt->sax->error(ctxt->userData,
2462 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2463 ctxt->wellFormed = 0;
2464 }
2465 SKIP_BLANKS;
2466 if ((CUR == '"') || (CUR == '\'')) {
2467 URI = htmlParseSystemLiteral(ctxt);
2468 }
2469 }
2470 return(URI);
2471}
2472
2473/**
2474 * htmlParseComment:
2475 * @ctxt: an HTML parser context
2476 *
2477 * Parse an XML (SGML) comment <!-- .... -->
2478 *
2479 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2480 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002481static void
Owen Taylor3473f882001-02-23 17:55:21 +00002482htmlParseComment(htmlParserCtxtPtr ctxt) {
2483 xmlChar *buf = NULL;
2484 int len;
2485 int size = HTML_PARSER_BUFFER_SIZE;
2486 int q, ql;
2487 int r, rl;
2488 int cur, l;
2489 xmlParserInputState state;
2490
2491 /*
2492 * Check that there is a comment right here.
2493 */
2494 if ((RAW != '<') || (NXT(1) != '!') ||
2495 (NXT(2) != '-') || (NXT(3) != '-')) return;
2496
2497 state = ctxt->instate;
2498 ctxt->instate = XML_PARSER_COMMENT;
2499 SHRINK;
2500 SKIP(4);
2501 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2502 if (buf == NULL) {
2503 xmlGenericError(xmlGenericErrorContext,
2504 "malloc of %d byte failed\n", size);
2505 ctxt->instate = state;
2506 return;
2507 }
2508 q = CUR_CHAR(ql);
2509 NEXTL(ql);
2510 r = CUR_CHAR(rl);
2511 NEXTL(rl);
2512 cur = CUR_CHAR(l);
2513 len = 0;
2514 while (IS_CHAR(cur) &&
2515 ((cur != '>') ||
2516 (r != '-') || (q != '-'))) {
2517 if (len + 5 >= size) {
2518 size *= 2;
2519 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2520 if (buf == NULL) {
2521 xmlGenericError(xmlGenericErrorContext,
2522 "realloc of %d byte failed\n", size);
2523 ctxt->instate = state;
2524 return;
2525 }
2526 }
2527 COPY_BUF(ql,buf,len,q);
2528 q = r;
2529 ql = rl;
2530 r = cur;
2531 rl = l;
2532 NEXTL(l);
2533 cur = CUR_CHAR(l);
2534 if (cur == 0) {
2535 SHRINK;
2536 GROW;
2537 cur = CUR_CHAR(l);
2538 }
2539 }
2540 buf[len] = 0;
2541 if (!IS_CHAR(cur)) {
2542 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2543 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2544 ctxt->sax->error(ctxt->userData,
2545 "Comment not terminated \n<!--%.50s\n", buf);
2546 ctxt->wellFormed = 0;
2547 xmlFree(buf);
2548 } else {
2549 NEXT;
2550 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2551 (!ctxt->disableSAX))
2552 ctxt->sax->comment(ctxt->userData, buf);
2553 xmlFree(buf);
2554 }
2555 ctxt->instate = state;
2556}
2557
2558/**
2559 * htmlParseCharRef:
2560 * @ctxt: an HTML parser context
2561 *
2562 * parse Reference declarations
2563 *
2564 * [66] CharRef ::= '&#' [0-9]+ ';' |
2565 * '&#x' [0-9a-fA-F]+ ';'
2566 *
2567 * Returns the value parsed (as an int)
2568 */
2569int
2570htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2571 int val = 0;
2572
2573 if ((CUR == '&') && (NXT(1) == '#') &&
2574 (NXT(2) == 'x')) {
2575 SKIP(3);
2576 while (CUR != ';') {
2577 if ((CUR >= '0') && (CUR <= '9'))
2578 val = val * 16 + (CUR - '0');
2579 else if ((CUR >= 'a') && (CUR <= 'f'))
2580 val = val * 16 + (CUR - 'a') + 10;
2581 else if ((CUR >= 'A') && (CUR <= 'F'))
2582 val = val * 16 + (CUR - 'A') + 10;
2583 else {
2584 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2585 ctxt->sax->error(ctxt->userData,
2586 "htmlParseCharRef: invalid hexadecimal value\n");
2587 ctxt->wellFormed = 0;
2588 return(0);
2589 }
2590 NEXT;
2591 }
2592 if (CUR == ';')
2593 NEXT;
2594 } else if ((CUR == '&') && (NXT(1) == '#')) {
2595 SKIP(2);
2596 while (CUR != ';') {
2597 if ((CUR >= '0') && (CUR <= '9'))
2598 val = val * 10 + (CUR - '0');
2599 else {
2600 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2601 ctxt->sax->error(ctxt->userData,
2602 "htmlParseCharRef: invalid decimal value\n");
2603 ctxt->wellFormed = 0;
2604 return(0);
2605 }
2606 NEXT;
2607 }
2608 if (CUR == ';')
2609 NEXT;
2610 } else {
2611 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2612 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2613 ctxt->wellFormed = 0;
2614 }
2615 /*
2616 * Check the value IS_CHAR ...
2617 */
2618 if (IS_CHAR(val)) {
2619 return(val);
2620 } else {
2621 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2622 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2623 val);
2624 ctxt->wellFormed = 0;
2625 }
2626 return(0);
2627}
2628
2629
2630/**
2631 * htmlParseDocTypeDecl :
2632 * @ctxt: an HTML parser context
2633 *
2634 * parse a DOCTYPE declaration
2635 *
2636 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2637 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2638 */
2639
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002640static void
Owen Taylor3473f882001-02-23 17:55:21 +00002641htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2642 xmlChar *name;
2643 xmlChar *ExternalID = NULL;
2644 xmlChar *URI = NULL;
2645
2646 /*
2647 * We know that '<!DOCTYPE' has been detected.
2648 */
2649 SKIP(9);
2650
2651 SKIP_BLANKS;
2652
2653 /*
2654 * Parse the DOCTYPE name.
2655 */
2656 name = htmlParseName(ctxt);
2657 if (name == NULL) {
2658 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2659 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2660 ctxt->wellFormed = 0;
2661 }
2662 /*
2663 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2664 */
2665
2666 SKIP_BLANKS;
2667
2668 /*
2669 * Check for SystemID and ExternalID
2670 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002671 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002672 SKIP_BLANKS;
2673
2674 /*
2675 * We should be at the end of the DOCTYPE declaration.
2676 */
2677 if (CUR != '>') {
2678 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002679 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002680 ctxt->wellFormed = 0;
2681 /* We shouldn't try to resynchronize ... */
2682 }
2683 NEXT;
2684
2685 /*
2686 * Create or update the document accordingly to the DOCTYPE
2687 */
2688 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2689 (!ctxt->disableSAX))
2690 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2691
2692 /*
2693 * Cleanup, since we don't use all those identifiers
2694 */
2695 if (URI != NULL) xmlFree(URI);
2696 if (ExternalID != NULL) xmlFree(ExternalID);
2697 if (name != NULL) xmlFree(name);
2698}
2699
2700/**
2701 * htmlParseAttribute:
2702 * @ctxt: an HTML parser context
2703 * @value: a xmlChar ** used to store the value of the attribute
2704 *
2705 * parse an attribute
2706 *
2707 * [41] Attribute ::= Name Eq AttValue
2708 *
2709 * [25] Eq ::= S? '=' S?
2710 *
2711 * With namespace:
2712 *
2713 * [NS 11] Attribute ::= QName Eq AttValue
2714 *
2715 * Also the case QName == xmlns:??? is handled independently as a namespace
2716 * definition.
2717 *
2718 * Returns the attribute name, and the value in *value.
2719 */
2720
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002721static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002722htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2723 xmlChar *name, *val = NULL;
2724
2725 *value = NULL;
2726 name = htmlParseHTMLName(ctxt);
2727 if (name == NULL) {
2728 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2729 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2730 ctxt->wellFormed = 0;
2731 return(NULL);
2732 }
2733
2734 /*
2735 * read the value
2736 */
2737 SKIP_BLANKS;
2738 if (CUR == '=') {
2739 NEXT;
2740 SKIP_BLANKS;
2741 val = htmlParseAttValue(ctxt);
2742 /******
2743 } else {
2744 * TODO : some attribute must have values, some may not
2745 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2746 ctxt->sax->warning(ctxt->userData,
2747 "No value for attribute %s\n", name); */
2748 }
2749
2750 *value = val;
2751 return(name);
2752}
2753
2754/**
2755 * htmlCheckEncoding:
2756 * @ctxt: an HTML parser context
2757 * @attvalue: the attribute value
2758 *
2759 * Checks an http-equiv attribute from a Meta tag to detect
2760 * the encoding
2761 * If a new encoding is detected the parser is switched to decode
2762 * it and pass UTF8
2763 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002764static void
Owen Taylor3473f882001-02-23 17:55:21 +00002765htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2766 const xmlChar *encoding;
2767
2768 if ((ctxt == NULL) || (attvalue == NULL))
2769 return;
2770
2771 /* do not change encoding */
2772 if (ctxt->input->encoding != NULL)
2773 return;
2774
2775 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2776 if (encoding != NULL) {
2777 encoding += 8;
2778 } else {
2779 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2780 if (encoding != NULL)
2781 encoding += 9;
2782 }
2783 if (encoding != NULL) {
2784 xmlCharEncoding enc;
2785 xmlCharEncodingHandlerPtr handler;
2786
2787 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2788
2789 if (ctxt->input->encoding != NULL)
2790 xmlFree((xmlChar *) ctxt->input->encoding);
2791 ctxt->input->encoding = xmlStrdup(encoding);
2792
2793 enc = xmlParseCharEncoding((const char *) encoding);
2794 /*
2795 * registered set of known encodings
2796 */
2797 if (enc != XML_CHAR_ENCODING_ERROR) {
2798 xmlSwitchEncoding(ctxt, enc);
2799 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2800 } else {
2801 /*
2802 * fallback for unknown encodings
2803 */
2804 handler = xmlFindCharEncodingHandler((const char *) encoding);
2805 if (handler != NULL) {
2806 xmlSwitchToEncoding(ctxt, handler);
2807 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2808 } else {
2809 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2810 }
2811 }
2812
2813 if ((ctxt->input->buf != NULL) &&
2814 (ctxt->input->buf->encoder != NULL) &&
2815 (ctxt->input->buf->raw != NULL) &&
2816 (ctxt->input->buf->buffer != NULL)) {
2817 int nbchars;
2818 int processed;
2819
2820 /*
2821 * convert as much as possible to the parser reading buffer.
2822 */
2823 processed = ctxt->input->cur - ctxt->input->base;
2824 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2825 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2826 ctxt->input->buf->buffer,
2827 ctxt->input->buf->raw);
2828 if (nbchars < 0) {
2829 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2830 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2831 ctxt->sax->error(ctxt->userData,
2832 "htmlCheckEncoding: encoder error\n");
2833 }
2834 ctxt->input->base =
2835 ctxt->input->cur = ctxt->input->buf->buffer->content;
2836 }
2837 }
2838}
2839
2840/**
2841 * htmlCheckMeta:
2842 * @ctxt: an HTML parser context
2843 * @atts: the attributes values
2844 *
2845 * Checks an attributes from a Meta tag
2846 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002847static void
Owen Taylor3473f882001-02-23 17:55:21 +00002848htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2849 int i;
2850 const xmlChar *att, *value;
2851 int http = 0;
2852 const xmlChar *content = NULL;
2853
2854 if ((ctxt == NULL) || (atts == NULL))
2855 return;
2856
2857 i = 0;
2858 att = atts[i++];
2859 while (att != NULL) {
2860 value = atts[i++];
2861 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2862 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2863 http = 1;
2864 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2865 content = value;
2866 att = atts[i++];
2867 }
2868 if ((http) && (content != NULL))
2869 htmlCheckEncoding(ctxt, content);
2870
2871}
2872
2873/**
2874 * htmlParseStartTag:
2875 * @ctxt: an HTML parser context
2876 *
2877 * parse a start of tag either for rule element or
2878 * EmptyElement. In both case we don't parse the tag closing chars.
2879 *
2880 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2881 *
2882 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2883 *
2884 * With namespace:
2885 *
2886 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2887 *
2888 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2889 *
2890 */
2891
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002892static void
Owen Taylor3473f882001-02-23 17:55:21 +00002893htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2894 xmlChar *name;
2895 xmlChar *attname;
2896 xmlChar *attvalue;
2897 const xmlChar **atts = NULL;
2898 int nbatts = 0;
2899 int maxatts = 0;
2900 int meta = 0;
2901 int i;
2902
2903 if (CUR != '<') return;
2904 NEXT;
2905
2906 GROW;
2907 name = htmlParseHTMLName(ctxt);
2908 if (name == NULL) {
2909 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2910 ctxt->sax->error(ctxt->userData,
2911 "htmlParseStartTag: invalid element name\n");
2912 ctxt->wellFormed = 0;
2913 /* Dump the bogus tag like browsers do */
2914 while ((IS_CHAR(CUR)) && (CUR != '>'))
2915 NEXT;
2916 return;
2917 }
2918 if (xmlStrEqual(name, BAD_CAST"meta"))
2919 meta = 1;
2920
2921 /*
2922 * Check for auto-closure of HTML elements.
2923 */
2924 htmlAutoClose(ctxt, name);
2925
2926 /*
2927 * Check for implied HTML elements.
2928 */
2929 htmlCheckImplied(ctxt, name);
2930
2931 /*
2932 * Avoid html at any level > 0, head at any level != 1
2933 * or any attempt to recurse body
2934 */
2935 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2936 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2937 ctxt->sax->error(ctxt->userData,
2938 "htmlParseStartTag: misplaced <html> tag\n");
2939 ctxt->wellFormed = 0;
2940 xmlFree(name);
2941 return;
2942 }
2943 if ((ctxt->nameNr != 1) &&
2944 (xmlStrEqual(name, BAD_CAST"head"))) {
2945 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2946 ctxt->sax->error(ctxt->userData,
2947 "htmlParseStartTag: misplaced <head> tag\n");
2948 ctxt->wellFormed = 0;
2949 xmlFree(name);
2950 return;
2951 }
2952 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002953 int indx;
2954 for (indx = 0;indx < ctxt->nameNr;indx++) {
2955 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002956 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2957 ctxt->sax->error(ctxt->userData,
2958 "htmlParseStartTag: misplaced <body> tag\n");
2959 ctxt->wellFormed = 0;
2960 xmlFree(name);
2961 return;
2962 }
2963 }
2964 }
2965
2966 /*
2967 * Now parse the attributes, it ends up with the ending
2968 *
2969 * (S Attribute)* S?
2970 */
2971 SKIP_BLANKS;
2972 while ((IS_CHAR(CUR)) &&
2973 (CUR != '>') &&
2974 ((CUR != '/') || (NXT(1) != '>'))) {
2975 long cons = ctxt->nbChars;
2976
2977 GROW;
2978 attname = htmlParseAttribute(ctxt, &attvalue);
2979 if (attname != NULL) {
2980
2981 /*
2982 * Well formedness requires at most one declaration of an attribute
2983 */
2984 for (i = 0; i < nbatts;i += 2) {
2985 if (xmlStrEqual(atts[i], attname)) {
2986 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2987 ctxt->sax->error(ctxt->userData,
2988 "Attribute %s redefined\n",
2989 attname);
2990 ctxt->wellFormed = 0;
2991 xmlFree(attname);
2992 if (attvalue != NULL)
2993 xmlFree(attvalue);
2994 goto failed;
2995 }
2996 }
2997
2998 /*
2999 * Add the pair to atts
3000 */
3001 if (atts == NULL) {
3002 maxatts = 10;
3003 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3004 if (atts == NULL) {
3005 xmlGenericError(xmlGenericErrorContext,
3006 "malloc of %ld byte failed\n",
3007 maxatts * (long)sizeof(xmlChar *));
3008 if (name != NULL) xmlFree(name);
3009 return;
3010 }
3011 } else if (nbatts + 4 > maxatts) {
3012 maxatts *= 2;
3013 atts = (const xmlChar **) xmlRealloc((void *) atts,
3014 maxatts * sizeof(xmlChar *));
3015 if (atts == NULL) {
3016 xmlGenericError(xmlGenericErrorContext,
3017 "realloc of %ld byte failed\n",
3018 maxatts * (long)sizeof(xmlChar *));
3019 if (name != NULL) xmlFree(name);
3020 return;
3021 }
3022 }
3023 atts[nbatts++] = attname;
3024 atts[nbatts++] = attvalue;
3025 atts[nbatts] = NULL;
3026 atts[nbatts + 1] = NULL;
3027 }
3028 else {
3029 /* Dump the bogus attribute string up to the next blank or
3030 * the end of the tag. */
3031 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3032 && ((CUR != '/') || (NXT(1) != '>')))
3033 NEXT;
3034 }
3035
3036failed:
3037 SKIP_BLANKS;
3038 if (cons == ctxt->nbChars) {
3039 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3040 ctxt->sax->error(ctxt->userData,
3041 "htmlParseStartTag: problem parsing attributes\n");
3042 ctxt->wellFormed = 0;
3043 break;
3044 }
3045 }
3046
3047 /*
3048 * Handle specific association to the META tag
3049 */
3050 if (meta)
3051 htmlCheckMeta(ctxt, atts);
3052
3053 /*
3054 * SAX: Start of Element !
3055 */
3056 htmlnamePush(ctxt, xmlStrdup(name));
3057#ifdef DEBUG
3058 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3059#endif
3060 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3061 ctxt->sax->startElement(ctxt->userData, name, atts);
3062
3063 if (atts != NULL) {
3064 for (i = 0;i < nbatts;i++) {
3065 if (atts[i] != NULL)
3066 xmlFree((xmlChar *) atts[i]);
3067 }
3068 xmlFree((void *) atts);
3069 }
3070 if (name != NULL) xmlFree(name);
3071}
3072
3073/**
3074 * htmlParseEndTag:
3075 * @ctxt: an HTML parser context
3076 *
3077 * parse an end of tag
3078 *
3079 * [42] ETag ::= '</' Name S? '>'
3080 *
3081 * With namespace
3082 *
3083 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003084 *
3085 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003086 */
3087
Daniel Veillardf420ac52001-07-04 16:04:09 +00003088static int
Owen Taylor3473f882001-02-23 17:55:21 +00003089htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3090 xmlChar *name;
3091 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003092 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003093
3094 if ((CUR != '<') || (NXT(1) != '/')) {
3095 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3096 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3097 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003098 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003099 }
3100 SKIP(2);
3101
3102 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003103 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003104
3105 /*
3106 * We should definitely be at the ending "S? '>'" part
3107 */
3108 SKIP_BLANKS;
3109 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3110 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3111 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3112 ctxt->wellFormed = 0;
3113 } else
3114 NEXT;
3115
3116 /*
3117 * If the name read is not one of the element in the parsing stack
3118 * then return, it's just an error.
3119 */
3120 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3121 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3122 }
3123 if (i < 0) {
3124 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3125 ctxt->sax->error(ctxt->userData,
3126 "Unexpected end tag : %s\n", name);
3127 xmlFree(name);
3128 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003129 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003130 }
3131
3132
3133 /*
3134 * Check for auto-closure of HTML elements.
3135 */
3136
3137 htmlAutoCloseOnClose(ctxt, name);
3138
3139 /*
3140 * Well formedness constraints, opening and closing must match.
3141 * With the exception that the autoclose may have popped stuff out
3142 * of the stack.
3143 */
3144 if (!xmlStrEqual(name, ctxt->name)) {
3145#ifdef DEBUG
3146 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3147#endif
3148 if ((ctxt->name != NULL) &&
3149 (!xmlStrEqual(ctxt->name, name))) {
3150 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3151 ctxt->sax->error(ctxt->userData,
3152 "Opening and ending tag mismatch: %s and %s\n",
3153 name, ctxt->name);
3154 ctxt->wellFormed = 0;
3155 }
3156 }
3157
3158 /*
3159 * SAX: End of Tag
3160 */
3161 oldname = ctxt->name;
3162 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3163 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3164 ctxt->sax->endElement(ctxt->userData, name);
3165 oldname = htmlnamePop(ctxt);
3166 if (oldname != NULL) {
3167#ifdef DEBUG
3168 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3169#endif
3170 xmlFree(oldname);
3171#ifdef DEBUG
3172 } else {
3173 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3174#endif
3175 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003176 ret = 1;
3177 } else {
3178 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003179 }
3180
3181 if (name != NULL)
3182 xmlFree(name);
3183
Daniel Veillardf420ac52001-07-04 16:04:09 +00003184 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003185}
3186
3187
3188/**
3189 * htmlParseReference:
3190 * @ctxt: an HTML parser context
3191 *
3192 * parse and handle entity references in content,
3193 * this will end-up in a call to character() since this is either a
3194 * CharRef, or a predefined entity.
3195 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003196static void
Owen Taylor3473f882001-02-23 17:55:21 +00003197htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003198 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003199 xmlChar out[6];
3200 xmlChar *name;
3201 if (CUR != '&') return;
3202
3203 if (NXT(1) == '#') {
3204 unsigned int c;
3205 int bits, i = 0;
3206
3207 c = htmlParseCharRef(ctxt);
3208 if (c == 0)
3209 return;
3210
3211 if (c < 0x80) { out[i++]= c; bits= -6; }
3212 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3213 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3214 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3215
3216 for ( ; bits >= 0; bits-= 6) {
3217 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3218 }
3219 out[i] = 0;
3220
3221 htmlCheckParagraph(ctxt);
3222 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3223 ctxt->sax->characters(ctxt->userData, out, i);
3224 } else {
3225 ent = htmlParseEntityRef(ctxt, &name);
3226 if (name == NULL) {
3227 htmlCheckParagraph(ctxt);
3228 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3229 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3230 return;
3231 }
3232 if ((ent == NULL) || (ent->value <= 0)) {
3233 htmlCheckParagraph(ctxt);
3234 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3235 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3236 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3237 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3238 }
3239 } else {
3240 unsigned int c;
3241 int bits, i = 0;
3242
3243 c = ent->value;
3244 if (c < 0x80)
3245 { out[i++]= c; bits= -6; }
3246 else if (c < 0x800)
3247 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3248 else if (c < 0x10000)
3249 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3250 else
3251 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3252
3253 for ( ; bits >= 0; bits-= 6) {
3254 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3255 }
3256 out[i] = 0;
3257
3258 htmlCheckParagraph(ctxt);
3259 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3260 ctxt->sax->characters(ctxt->userData, out, i);
3261 }
3262 xmlFree(name);
3263 }
3264}
3265
3266/**
3267 * htmlParseContent:
3268 * @ctxt: an HTML parser context
3269 * @name: the node name
3270 *
3271 * Parse a content: comment, sub-element, reference or text.
3272 *
3273 */
3274
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003275static void
Owen Taylor3473f882001-02-23 17:55:21 +00003276htmlParseContent(htmlParserCtxtPtr ctxt) {
3277 xmlChar *currentNode;
3278 int depth;
3279
3280 currentNode = xmlStrdup(ctxt->name);
3281 depth = ctxt->nameNr;
3282 while (1) {
3283 long cons = ctxt->nbChars;
3284
3285 GROW;
3286 /*
3287 * Our tag or one of it's parent or children is ending.
3288 */
3289 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003290 if (htmlParseEndTag(ctxt) &&
3291 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3292 if (currentNode != NULL)
3293 xmlFree(currentNode);
3294 return;
3295 }
3296 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003297 }
3298
3299 /*
3300 * Has this node been popped out during parsing of
3301 * the next element
3302 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003303 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3304 (!xmlStrEqual(currentNode, ctxt->name)))
3305 {
Owen Taylor3473f882001-02-23 17:55:21 +00003306 if (currentNode != NULL) xmlFree(currentNode);
3307 return;
3308 }
3309
Daniel Veillardf9533d12001-03-03 10:04:57 +00003310 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3311 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003312 /*
3313 * Handle SCRIPT/STYLE separately
3314 */
3315 htmlParseScript(ctxt);
3316 } else {
3317 /*
3318 * Sometimes DOCTYPE arrives in the middle of the document
3319 */
3320 if ((CUR == '<') && (NXT(1) == '!') &&
3321 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3322 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3323 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3324 (UPP(8) == 'E')) {
3325 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3326 ctxt->sax->error(ctxt->userData,
3327 "Misplaced DOCTYPE declaration\n");
3328 ctxt->wellFormed = 0;
3329 htmlParseDocTypeDecl(ctxt);
3330 }
3331
3332 /*
3333 * First case : a comment
3334 */
3335 if ((CUR == '<') && (NXT(1) == '!') &&
3336 (NXT(2) == '-') && (NXT(3) == '-')) {
3337 htmlParseComment(ctxt);
3338 }
3339
3340 /*
3341 * Second case : a sub-element.
3342 */
3343 else if (CUR == '<') {
3344 htmlParseElement(ctxt);
3345 }
3346
3347 /*
3348 * Third case : a reference. If if has not been resolved,
3349 * parsing returns it's Name, create the node
3350 */
3351 else if (CUR == '&') {
3352 htmlParseReference(ctxt);
3353 }
3354
3355 /*
3356 * Fourth : end of the resource
3357 */
3358 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003359 htmlAutoCloseOnEnd(ctxt);
3360 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003361 }
3362
3363 /*
3364 * Last case, text. Note that References are handled directly.
3365 */
3366 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003367 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003368 }
3369
3370 if (cons == ctxt->nbChars) {
3371 if (ctxt->node != NULL) {
3372 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3373 ctxt->sax->error(ctxt->userData,
3374 "detected an error in element content\n");
3375 ctxt->wellFormed = 0;
3376 }
3377 break;
3378 }
3379 }
3380 GROW;
3381 }
3382 if (currentNode != NULL) xmlFree(currentNode);
3383}
3384
3385/**
3386 * htmlParseElement:
3387 * @ctxt: an HTML parser context
3388 *
3389 * parse an HTML element, this is highly recursive
3390 *
3391 * [39] element ::= EmptyElemTag | STag content ETag
3392 *
3393 * [41] Attribute ::= Name Eq AttValue
3394 */
3395
3396void
3397htmlParseElement(htmlParserCtxtPtr ctxt) {
3398 xmlChar *name;
3399 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003400 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003401 htmlParserNodeInfo node_info;
3402 xmlChar *oldname;
3403 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003404 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003405
3406 /* Capture start position */
3407 if (ctxt->record_info) {
3408 node_info.begin_pos = ctxt->input->consumed +
3409 (CUR_PTR - ctxt->input->base);
3410 node_info.begin_line = ctxt->input->line;
3411 }
3412
3413 oldname = xmlStrdup(ctxt->name);
3414 htmlParseStartTag(ctxt);
3415 name = ctxt->name;
3416#ifdef DEBUG
3417 if (oldname == NULL)
3418 xmlGenericError(xmlGenericErrorContext,
3419 "Start of element %s\n", name);
3420 else if (name == NULL)
3421 xmlGenericError(xmlGenericErrorContext,
3422 "Start of element failed, was %s\n", oldname);
3423 else
3424 xmlGenericError(xmlGenericErrorContext,
3425 "Start of element %s, was %s\n", name, oldname);
3426#endif
3427 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3428 (name == NULL)) {
3429 if (CUR == '>')
3430 NEXT;
3431 if (oldname != NULL)
3432 xmlFree(oldname);
3433 return;
3434 }
3435 if (oldname != NULL)
3436 xmlFree(oldname);
3437
3438 /*
3439 * Lookup the info for that element.
3440 */
3441 info = htmlTagLookup(name);
3442 if (info == NULL) {
3443 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3444 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3445 name);
3446 ctxt->wellFormed = 0;
3447 } else if (info->depr) {
3448/***************************
3449 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3450 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3451 name);
3452 ***************************/
3453 }
3454
3455 /*
3456 * Check for an Empty Element labelled the XML/SGML way
3457 */
3458 if ((CUR == '/') && (NXT(1) == '>')) {
3459 SKIP(2);
3460 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3461 ctxt->sax->endElement(ctxt->userData, name);
3462 oldname = htmlnamePop(ctxt);
3463#ifdef DEBUG
3464 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3465#endif
3466 if (oldname != NULL)
3467 xmlFree(oldname);
3468 return;
3469 }
3470
3471 if (CUR == '>') {
3472 NEXT;
3473 } else {
3474 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3475 ctxt->sax->error(ctxt->userData,
3476 "Couldn't find end of Start Tag %s\n",
3477 name);
3478 ctxt->wellFormed = 0;
3479
3480 /*
3481 * end of parsing of this node.
3482 */
3483 if (xmlStrEqual(name, ctxt->name)) {
3484 nodePop(ctxt);
3485 oldname = htmlnamePop(ctxt);
3486#ifdef DEBUG
3487 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3488#endif
3489 if (oldname != NULL)
3490 xmlFree(oldname);
3491 }
3492
3493 /*
3494 * Capture end position and add node
3495 */
3496 if ( currentNode != NULL && ctxt->record_info ) {
3497 node_info.end_pos = ctxt->input->consumed +
3498 (CUR_PTR - ctxt->input->base);
3499 node_info.end_line = ctxt->input->line;
3500 node_info.node = ctxt->node;
3501 xmlParserAddNodeInfo(ctxt, &node_info);
3502 }
3503 return;
3504 }
3505
3506 /*
3507 * Check for an Empty Element from DTD definition
3508 */
3509 if ((info != NULL) && (info->empty)) {
3510 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3511 ctxt->sax->endElement(ctxt->userData, name);
3512 oldname = htmlnamePop(ctxt);
3513#ifdef DEBUG
3514 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3515#endif
3516 if (oldname != NULL)
3517 xmlFree(oldname);
3518 return;
3519 }
3520
3521 /*
3522 * Parse the content of the element:
3523 */
3524 currentNode = xmlStrdup(ctxt->name);
3525 depth = ctxt->nameNr;
3526 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003527 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003528 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003529 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003530 if (ctxt->nameNr < depth) break;
3531 }
3532
Owen Taylor3473f882001-02-23 17:55:21 +00003533 /*
3534 * Capture end position and add node
3535 */
3536 if ( currentNode != NULL && ctxt->record_info ) {
3537 node_info.end_pos = ctxt->input->consumed +
3538 (CUR_PTR - ctxt->input->base);
3539 node_info.end_line = ctxt->input->line;
3540 node_info.node = ctxt->node;
3541 xmlParserAddNodeInfo(ctxt, &node_info);
3542 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003543 if (!IS_CHAR(CUR)) {
3544 htmlAutoCloseOnEnd(ctxt);
3545 }
3546
Owen Taylor3473f882001-02-23 17:55:21 +00003547 if (currentNode != NULL)
3548 xmlFree(currentNode);
3549}
3550
3551/**
3552 * htmlParseDocument :
3553 * @ctxt: an HTML parser context
3554 *
3555 * parse an HTML document (and build a tree if using the standard SAX
3556 * interface).
3557 *
3558 * Returns 0, -1 in case of error. the parser context is augmented
3559 * as a result of the parsing.
3560 */
3561
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003562static int
Owen Taylor3473f882001-02-23 17:55:21 +00003563htmlParseDocument(htmlParserCtxtPtr ctxt) {
3564 xmlDtdPtr dtd;
3565
Daniel Veillardd0463562001-10-13 09:15:48 +00003566 xmlInitParser();
3567
Owen Taylor3473f882001-02-23 17:55:21 +00003568 htmlDefaultSAXHandlerInit();
3569 ctxt->html = 1;
3570
3571 GROW;
3572 /*
3573 * SAX: beginning of the document processing.
3574 */
3575 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3576 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3577
3578 /*
3579 * Wipe out everything which is before the first '<'
3580 */
3581 SKIP_BLANKS;
3582 if (CUR == 0) {
3583 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3584 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3585 ctxt->wellFormed = 0;
3586 }
3587
3588 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3589 ctxt->sax->startDocument(ctxt->userData);
3590
3591
3592 /*
3593 * Parse possible comments before any content
3594 */
3595 while ((CUR == '<') && (NXT(1) == '!') &&
3596 (NXT(2) == '-') && (NXT(3) == '-')) {
3597 htmlParseComment(ctxt);
3598 SKIP_BLANKS;
3599 }
3600
3601
3602 /*
3603 * Then possibly doc type declaration(s) and more Misc
3604 * (doctypedecl Misc*)?
3605 */
3606 if ((CUR == '<') && (NXT(1) == '!') &&
3607 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3608 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3609 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3610 (UPP(8) == 'E')) {
3611 htmlParseDocTypeDecl(ctxt);
3612 }
3613 SKIP_BLANKS;
3614
3615 /*
3616 * Parse possible comments before any content
3617 */
3618 while ((CUR == '<') && (NXT(1) == '!') &&
3619 (NXT(2) == '-') && (NXT(3) == '-')) {
3620 htmlParseComment(ctxt);
3621 SKIP_BLANKS;
3622 }
3623
3624 /*
3625 * Time to start parsing the tree itself
3626 */
3627 htmlParseContent(ctxt);
3628
3629 /*
3630 * autoclose
3631 */
3632 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003633 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003634
3635
3636 /*
3637 * SAX: end of the document processing.
3638 */
3639 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3640 ctxt->sax->endDocument(ctxt->userData);
3641
3642 if (ctxt->myDoc != NULL) {
3643 dtd = xmlGetIntSubset(ctxt->myDoc);
3644 if (dtd == NULL)
3645 ctxt->myDoc->intSubset =
3646 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3647 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3648 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3649 }
3650 if (! ctxt->wellFormed) return(-1);
3651 return(0);
3652}
3653
3654
3655/************************************************************************
3656 * *
3657 * Parser contexts handling *
3658 * *
3659 ************************************************************************/
3660
3661/**
3662 * xmlInitParserCtxt:
3663 * @ctxt: an HTML parser context
3664 *
3665 * Initialize a parser context
3666 */
3667
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003668static void
Owen Taylor3473f882001-02-23 17:55:21 +00003669htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3670{
3671 htmlSAXHandler *sax;
3672
3673 if (ctxt == NULL) return;
3674 memset(ctxt, 0, sizeof(htmlParserCtxt));
3675
3676 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3677 if (sax == NULL) {
3678 xmlGenericError(xmlGenericErrorContext,
3679 "htmlInitParserCtxt: out of memory\n");
3680 }
3681 else
3682 memset(sax, 0, sizeof(htmlSAXHandler));
3683
3684 /* Allocate the Input stack */
3685 ctxt->inputTab = (htmlParserInputPtr *)
3686 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3687 if (ctxt->inputTab == NULL) {
3688 xmlGenericError(xmlGenericErrorContext,
3689 "htmlInitParserCtxt: out of memory\n");
3690 ctxt->inputNr = 0;
3691 ctxt->inputMax = 0;
3692 ctxt->input = NULL;
3693 return;
3694 }
3695 ctxt->inputNr = 0;
3696 ctxt->inputMax = 5;
3697 ctxt->input = NULL;
3698 ctxt->version = NULL;
3699 ctxt->encoding = NULL;
3700 ctxt->standalone = -1;
3701 ctxt->instate = XML_PARSER_START;
3702
3703 /* Allocate the Node stack */
3704 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3705 if (ctxt->nodeTab == NULL) {
3706 xmlGenericError(xmlGenericErrorContext,
3707 "htmlInitParserCtxt: out of memory\n");
3708 ctxt->nodeNr = 0;
3709 ctxt->nodeMax = 0;
3710 ctxt->node = NULL;
3711 ctxt->inputNr = 0;
3712 ctxt->inputMax = 0;
3713 ctxt->input = NULL;
3714 return;
3715 }
3716 ctxt->nodeNr = 0;
3717 ctxt->nodeMax = 10;
3718 ctxt->node = NULL;
3719
3720 /* Allocate the Name stack */
3721 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3722 if (ctxt->nameTab == NULL) {
3723 xmlGenericError(xmlGenericErrorContext,
3724 "htmlInitParserCtxt: out of memory\n");
3725 ctxt->nameNr = 0;
3726 ctxt->nameMax = 10;
3727 ctxt->name = NULL;
3728 ctxt->nodeNr = 0;
3729 ctxt->nodeMax = 0;
3730 ctxt->node = NULL;
3731 ctxt->inputNr = 0;
3732 ctxt->inputMax = 0;
3733 ctxt->input = NULL;
3734 return;
3735 }
3736 ctxt->nameNr = 0;
3737 ctxt->nameMax = 10;
3738 ctxt->name = NULL;
3739
3740 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3741 else {
3742 ctxt->sax = sax;
3743 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3744 }
3745 ctxt->userData = ctxt;
3746 ctxt->myDoc = NULL;
3747 ctxt->wellFormed = 1;
3748 ctxt->replaceEntities = 0;
3749 ctxt->html = 1;
3750 ctxt->record_info = 0;
3751 ctxt->validate = 0;
3752 ctxt->nbChars = 0;
3753 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003754 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003755 xmlInitNodeInfoSeq(&ctxt->node_seq);
3756}
3757
3758/**
3759 * htmlFreeParserCtxt:
3760 * @ctxt: an HTML parser context
3761 *
3762 * Free all the memory used by a parser context. However the parsed
3763 * document in ctxt->myDoc is not freed.
3764 */
3765
3766void
3767htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3768{
3769 xmlFreeParserCtxt(ctxt);
3770}
3771
3772/**
3773 * htmlCreateDocParserCtxt :
3774 * @cur: a pointer to an array of xmlChar
3775 * @encoding: a free form C string describing the HTML document encoding, or NULL
3776 *
3777 * Create a parser context for an HTML document.
3778 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003779 * TODO: check the need to add encoding handling there
3780 *
Owen Taylor3473f882001-02-23 17:55:21 +00003781 * Returns the new parser context or NULL
3782 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003783static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003784htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003785 htmlParserCtxtPtr ctxt;
3786 htmlParserInputPtr input;
3787 /* htmlCharEncoding enc; */
3788
3789 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3790 if (ctxt == NULL) {
3791 perror("malloc");
3792 return(NULL);
3793 }
3794 htmlInitParserCtxt(ctxt);
3795 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3796 if (input == NULL) {
3797 perror("malloc");
3798 xmlFree(ctxt);
3799 return(NULL);
3800 }
3801 memset(input, 0, sizeof(htmlParserInput));
3802
3803 input->line = 1;
3804 input->col = 1;
3805 input->base = cur;
3806 input->cur = cur;
3807
3808 inputPush(ctxt, input);
3809 return(ctxt);
3810}
3811
3812/************************************************************************
3813 * *
3814 * Progressive parsing interfaces *
3815 * *
3816 ************************************************************************/
3817
3818/**
3819 * htmlParseLookupSequence:
3820 * @ctxt: an HTML parser context
3821 * @first: the first char to lookup
3822 * @next: the next char to lookup or zero
3823 * @third: the next char to lookup or zero
3824 *
3825 * Try to find if a sequence (first, next, third) or just (first next) or
3826 * (first) is available in the input stream.
3827 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3828 * to avoid rescanning sequences of bytes, it DOES change the state of the
3829 * parser, do not use liberally.
3830 * This is basically similar to xmlParseLookupSequence()
3831 *
3832 * Returns the index to the current parsing point if the full sequence
3833 * is available, -1 otherwise.
3834 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003835static int
Owen Taylor3473f882001-02-23 17:55:21 +00003836htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3837 xmlChar next, xmlChar third) {
3838 int base, len;
3839 htmlParserInputPtr in;
3840 const xmlChar *buf;
3841
3842 in = ctxt->input;
3843 if (in == NULL) return(-1);
3844 base = in->cur - in->base;
3845 if (base < 0) return(-1);
3846 if (ctxt->checkIndex > base)
3847 base = ctxt->checkIndex;
3848 if (in->buf == NULL) {
3849 buf = in->base;
3850 len = in->length;
3851 } else {
3852 buf = in->buf->buffer->content;
3853 len = in->buf->buffer->use;
3854 }
3855 /* take into account the sequence length */
3856 if (third) len -= 2;
3857 else if (next) len --;
3858 for (;base < len;base++) {
3859 if (buf[base] == first) {
3860 if (third != 0) {
3861 if ((buf[base + 1] != next) ||
3862 (buf[base + 2] != third)) continue;
3863 } else if (next != 0) {
3864 if (buf[base + 1] != next) continue;
3865 }
3866 ctxt->checkIndex = 0;
3867#ifdef DEBUG_PUSH
3868 if (next == 0)
3869 xmlGenericError(xmlGenericErrorContext,
3870 "HPP: lookup '%c' found at %d\n",
3871 first, base);
3872 else if (third == 0)
3873 xmlGenericError(xmlGenericErrorContext,
3874 "HPP: lookup '%c%c' found at %d\n",
3875 first, next, base);
3876 else
3877 xmlGenericError(xmlGenericErrorContext,
3878 "HPP: lookup '%c%c%c' found at %d\n",
3879 first, next, third, base);
3880#endif
3881 return(base - (in->cur - in->base));
3882 }
3883 }
3884 ctxt->checkIndex = base;
3885#ifdef DEBUG_PUSH
3886 if (next == 0)
3887 xmlGenericError(xmlGenericErrorContext,
3888 "HPP: lookup '%c' failed\n", first);
3889 else if (third == 0)
3890 xmlGenericError(xmlGenericErrorContext,
3891 "HPP: lookup '%c%c' failed\n", first, next);
3892 else
3893 xmlGenericError(xmlGenericErrorContext,
3894 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3895#endif
3896 return(-1);
3897}
3898
3899/**
3900 * htmlParseTryOrFinish:
3901 * @ctxt: an HTML parser context
3902 * @terminate: last chunk indicator
3903 *
3904 * Try to progress on parsing
3905 *
3906 * Returns zero if no parsing was possible
3907 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003908static int
Owen Taylor3473f882001-02-23 17:55:21 +00003909htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3910 int ret = 0;
3911 htmlParserInputPtr in;
3912 int avail = 0;
3913 xmlChar cur, next;
3914
3915#ifdef DEBUG_PUSH
3916 switch (ctxt->instate) {
3917 case XML_PARSER_EOF:
3918 xmlGenericError(xmlGenericErrorContext,
3919 "HPP: try EOF\n"); break;
3920 case XML_PARSER_START:
3921 xmlGenericError(xmlGenericErrorContext,
3922 "HPP: try START\n"); break;
3923 case XML_PARSER_MISC:
3924 xmlGenericError(xmlGenericErrorContext,
3925 "HPP: try MISC\n");break;
3926 case XML_PARSER_COMMENT:
3927 xmlGenericError(xmlGenericErrorContext,
3928 "HPP: try COMMENT\n");break;
3929 case XML_PARSER_PROLOG:
3930 xmlGenericError(xmlGenericErrorContext,
3931 "HPP: try PROLOG\n");break;
3932 case XML_PARSER_START_TAG:
3933 xmlGenericError(xmlGenericErrorContext,
3934 "HPP: try START_TAG\n");break;
3935 case XML_PARSER_CONTENT:
3936 xmlGenericError(xmlGenericErrorContext,
3937 "HPP: try CONTENT\n");break;
3938 case XML_PARSER_CDATA_SECTION:
3939 xmlGenericError(xmlGenericErrorContext,
3940 "HPP: try CDATA_SECTION\n");break;
3941 case XML_PARSER_END_TAG:
3942 xmlGenericError(xmlGenericErrorContext,
3943 "HPP: try END_TAG\n");break;
3944 case XML_PARSER_ENTITY_DECL:
3945 xmlGenericError(xmlGenericErrorContext,
3946 "HPP: try ENTITY_DECL\n");break;
3947 case XML_PARSER_ENTITY_VALUE:
3948 xmlGenericError(xmlGenericErrorContext,
3949 "HPP: try ENTITY_VALUE\n");break;
3950 case XML_PARSER_ATTRIBUTE_VALUE:
3951 xmlGenericError(xmlGenericErrorContext,
3952 "HPP: try ATTRIBUTE_VALUE\n");break;
3953 case XML_PARSER_DTD:
3954 xmlGenericError(xmlGenericErrorContext,
3955 "HPP: try DTD\n");break;
3956 case XML_PARSER_EPILOG:
3957 xmlGenericError(xmlGenericErrorContext,
3958 "HPP: try EPILOG\n");break;
3959 case XML_PARSER_PI:
3960 xmlGenericError(xmlGenericErrorContext,
3961 "HPP: try PI\n");break;
3962 case XML_PARSER_SYSTEM_LITERAL:
3963 xmlGenericError(xmlGenericErrorContext,
3964 "HPP: try SYSTEM_LITERAL\n");break;
3965 }
3966#endif
3967
3968 while (1) {
3969
3970 in = ctxt->input;
3971 if (in == NULL) break;
3972 if (in->buf == NULL)
3973 avail = in->length - (in->cur - in->base);
3974 else
3975 avail = in->buf->buffer->use - (in->cur - in->base);
3976 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003977 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003978 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3979 /*
3980 * SAX: end of the document processing.
3981 */
3982 ctxt->instate = XML_PARSER_EOF;
3983 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3984 ctxt->sax->endDocument(ctxt->userData);
3985 }
3986 }
3987 if (avail < 1)
3988 goto done;
3989 switch (ctxt->instate) {
3990 case XML_PARSER_EOF:
3991 /*
3992 * Document parsing is done !
3993 */
3994 goto done;
3995 case XML_PARSER_START:
3996 /*
3997 * Very first chars read from the document flow.
3998 */
3999 cur = in->cur[0];
4000 if (IS_BLANK(cur)) {
4001 SKIP_BLANKS;
4002 if (in->buf == NULL)
4003 avail = in->length - (in->cur - in->base);
4004 else
4005 avail = in->buf->buffer->use - (in->cur - in->base);
4006 }
4007 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4008 ctxt->sax->setDocumentLocator(ctxt->userData,
4009 &xmlDefaultSAXLocator);
4010 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4011 (!ctxt->disableSAX))
4012 ctxt->sax->startDocument(ctxt->userData);
4013
4014 cur = in->cur[0];
4015 next = in->cur[1];
4016 if ((cur == '<') && (next == '!') &&
4017 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4018 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4019 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4020 (UPP(8) == 'E')) {
4021 if ((!terminate) &&
4022 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4023 goto done;
4024#ifdef DEBUG_PUSH
4025 xmlGenericError(xmlGenericErrorContext,
4026 "HPP: Parsing internal subset\n");
4027#endif
4028 htmlParseDocTypeDecl(ctxt);
4029 ctxt->instate = XML_PARSER_PROLOG;
4030#ifdef DEBUG_PUSH
4031 xmlGenericError(xmlGenericErrorContext,
4032 "HPP: entering PROLOG\n");
4033#endif
4034 } else {
4035 ctxt->instate = XML_PARSER_MISC;
4036 }
4037#ifdef DEBUG_PUSH
4038 xmlGenericError(xmlGenericErrorContext,
4039 "HPP: entering MISC\n");
4040#endif
4041 break;
4042 case XML_PARSER_MISC:
4043 SKIP_BLANKS;
4044 if (in->buf == NULL)
4045 avail = in->length - (in->cur - in->base);
4046 else
4047 avail = in->buf->buffer->use - (in->cur - in->base);
4048 if (avail < 2)
4049 goto done;
4050 cur = in->cur[0];
4051 next = in->cur[1];
4052 if ((cur == '<') && (next == '!') &&
4053 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4054 if ((!terminate) &&
4055 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4056 goto done;
4057#ifdef DEBUG_PUSH
4058 xmlGenericError(xmlGenericErrorContext,
4059 "HPP: Parsing Comment\n");
4060#endif
4061 htmlParseComment(ctxt);
4062 ctxt->instate = XML_PARSER_MISC;
4063 } else if ((cur == '<') && (next == '!') &&
4064 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4065 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4066 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4067 (UPP(8) == 'E')) {
4068 if ((!terminate) &&
4069 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4070 goto done;
4071#ifdef DEBUG_PUSH
4072 xmlGenericError(xmlGenericErrorContext,
4073 "HPP: Parsing internal subset\n");
4074#endif
4075 htmlParseDocTypeDecl(ctxt);
4076 ctxt->instate = XML_PARSER_PROLOG;
4077#ifdef DEBUG_PUSH
4078 xmlGenericError(xmlGenericErrorContext,
4079 "HPP: entering PROLOG\n");
4080#endif
4081 } else if ((cur == '<') && (next == '!') &&
4082 (avail < 9)) {
4083 goto done;
4084 } else {
4085 ctxt->instate = XML_PARSER_START_TAG;
4086#ifdef DEBUG_PUSH
4087 xmlGenericError(xmlGenericErrorContext,
4088 "HPP: entering START_TAG\n");
4089#endif
4090 }
4091 break;
4092 case XML_PARSER_PROLOG:
4093 SKIP_BLANKS;
4094 if (in->buf == NULL)
4095 avail = in->length - (in->cur - in->base);
4096 else
4097 avail = in->buf->buffer->use - (in->cur - in->base);
4098 if (avail < 2)
4099 goto done;
4100 cur = in->cur[0];
4101 next = in->cur[1];
4102 if ((cur == '<') && (next == '!') &&
4103 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4104 if ((!terminate) &&
4105 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4106 goto done;
4107#ifdef DEBUG_PUSH
4108 xmlGenericError(xmlGenericErrorContext,
4109 "HPP: Parsing Comment\n");
4110#endif
4111 htmlParseComment(ctxt);
4112 ctxt->instate = XML_PARSER_PROLOG;
4113 } else if ((cur == '<') && (next == '!') &&
4114 (avail < 4)) {
4115 goto done;
4116 } else {
4117 ctxt->instate = XML_PARSER_START_TAG;
4118#ifdef DEBUG_PUSH
4119 xmlGenericError(xmlGenericErrorContext,
4120 "HPP: entering START_TAG\n");
4121#endif
4122 }
4123 break;
4124 case XML_PARSER_EPILOG:
4125 if (in->buf == NULL)
4126 avail = in->length - (in->cur - in->base);
4127 else
4128 avail = in->buf->buffer->use - (in->cur - in->base);
4129 if (avail < 1)
4130 goto done;
4131 cur = in->cur[0];
4132 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004133 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004134 goto done;
4135 }
4136 if (avail < 2)
4137 goto done;
4138 next = in->cur[1];
4139 if ((cur == '<') && (next == '!') &&
4140 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4141 if ((!terminate) &&
4142 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4143 goto done;
4144#ifdef DEBUG_PUSH
4145 xmlGenericError(xmlGenericErrorContext,
4146 "HPP: Parsing Comment\n");
4147#endif
4148 htmlParseComment(ctxt);
4149 ctxt->instate = XML_PARSER_EPILOG;
4150 } else if ((cur == '<') && (next == '!') &&
4151 (avail < 4)) {
4152 goto done;
4153 } else {
4154 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004155 ctxt->wellFormed = 0;
4156 ctxt->instate = XML_PARSER_EOF;
4157#ifdef DEBUG_PUSH
4158 xmlGenericError(xmlGenericErrorContext,
4159 "HPP: entering EOF\n");
4160#endif
4161 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4162 ctxt->sax->endDocument(ctxt->userData);
4163 goto done;
4164 }
4165 break;
4166 case XML_PARSER_START_TAG: {
4167 xmlChar *name, *oldname;
4168 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004169 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004170
4171 if (avail < 2)
4172 goto done;
4173 cur = in->cur[0];
4174 if (cur != '<') {
4175 ctxt->instate = XML_PARSER_CONTENT;
4176#ifdef DEBUG_PUSH
4177 xmlGenericError(xmlGenericErrorContext,
4178 "HPP: entering CONTENT\n");
4179#endif
4180 break;
4181 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004182 if (in->cur[1] == '/') {
4183 ctxt->instate = XML_PARSER_END_TAG;
4184 ctxt->checkIndex = 0;
4185#ifdef DEBUG_PUSH
4186 xmlGenericError(xmlGenericErrorContext,
4187 "HPP: entering END_TAG\n");
4188#endif
4189 break;
4190 }
Owen Taylor3473f882001-02-23 17:55:21 +00004191 if ((!terminate) &&
4192 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4193 goto done;
4194
4195 oldname = xmlStrdup(ctxt->name);
4196 htmlParseStartTag(ctxt);
4197 name = ctxt->name;
4198#ifdef DEBUG
4199 if (oldname == NULL)
4200 xmlGenericError(xmlGenericErrorContext,
4201 "Start of element %s\n", name);
4202 else if (name == NULL)
4203 xmlGenericError(xmlGenericErrorContext,
4204 "Start of element failed, was %s\n",
4205 oldname);
4206 else
4207 xmlGenericError(xmlGenericErrorContext,
4208 "Start of element %s, was %s\n",
4209 name, oldname);
4210#endif
4211 if (((depth == ctxt->nameNr) &&
4212 (xmlStrEqual(oldname, ctxt->name))) ||
4213 (name == NULL)) {
4214 if (CUR == '>')
4215 NEXT;
4216 if (oldname != NULL)
4217 xmlFree(oldname);
4218 break;
4219 }
4220 if (oldname != NULL)
4221 xmlFree(oldname);
4222
4223 /*
4224 * Lookup the info for that element.
4225 */
4226 info = htmlTagLookup(name);
4227 if (info == NULL) {
4228 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4229 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4230 name);
4231 ctxt->wellFormed = 0;
4232 } else if (info->depr) {
4233 /***************************
4234 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4235 ctxt->sax->warning(ctxt->userData,
4236 "Tag %s is deprecated\n",
4237 name);
4238 ***************************/
4239 }
4240
4241 /*
4242 * Check for an Empty Element labelled the XML/SGML way
4243 */
4244 if ((CUR == '/') && (NXT(1) == '>')) {
4245 SKIP(2);
4246 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4247 ctxt->sax->endElement(ctxt->userData, name);
4248 oldname = htmlnamePop(ctxt);
4249#ifdef DEBUG
4250 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4251 oldname);
4252#endif
4253 if (oldname != NULL)
4254 xmlFree(oldname);
4255 ctxt->instate = XML_PARSER_CONTENT;
4256#ifdef DEBUG_PUSH
4257 xmlGenericError(xmlGenericErrorContext,
4258 "HPP: entering CONTENT\n");
4259#endif
4260 break;
4261 }
4262
4263 if (CUR == '>') {
4264 NEXT;
4265 } else {
4266 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4267 ctxt->sax->error(ctxt->userData,
4268 "Couldn't find end of Start Tag %s\n",
4269 name);
4270 ctxt->wellFormed = 0;
4271
4272 /*
4273 * end of parsing of this node.
4274 */
4275 if (xmlStrEqual(name, ctxt->name)) {
4276 nodePop(ctxt);
4277 oldname = htmlnamePop(ctxt);
4278#ifdef DEBUG
4279 xmlGenericError(xmlGenericErrorContext,
4280 "End of start tag problem: popping out %s\n", oldname);
4281#endif
4282 if (oldname != NULL)
4283 xmlFree(oldname);
4284 }
4285
4286 ctxt->instate = XML_PARSER_CONTENT;
4287#ifdef DEBUG_PUSH
4288 xmlGenericError(xmlGenericErrorContext,
4289 "HPP: entering CONTENT\n");
4290#endif
4291 break;
4292 }
4293
4294 /*
4295 * Check for an Empty Element from DTD definition
4296 */
4297 if ((info != NULL) && (info->empty)) {
4298 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4299 ctxt->sax->endElement(ctxt->userData, name);
4300 oldname = htmlnamePop(ctxt);
4301#ifdef DEBUG
4302 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4303#endif
4304 if (oldname != NULL)
4305 xmlFree(oldname);
4306 }
4307 ctxt->instate = XML_PARSER_CONTENT;
4308#ifdef DEBUG_PUSH
4309 xmlGenericError(xmlGenericErrorContext,
4310 "HPP: entering CONTENT\n");
4311#endif
4312 break;
4313 }
4314 case XML_PARSER_CONTENT: {
4315 long cons;
4316 /*
4317 * Handle preparsed entities and charRef
4318 */
4319 if (ctxt->token != 0) {
4320 xmlChar chr[2] = { 0 , 0 } ;
4321
4322 chr[0] = (xmlChar) ctxt->token;
4323 htmlCheckParagraph(ctxt);
4324 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4325 ctxt->sax->characters(ctxt->userData, chr, 1);
4326 ctxt->token = 0;
4327 ctxt->checkIndex = 0;
4328 }
4329 if ((avail == 1) && (terminate)) {
4330 cur = in->cur[0];
4331 if ((cur != '<') && (cur != '&')) {
4332 if (ctxt->sax != NULL) {
4333 if (IS_BLANK(cur)) {
4334 if (ctxt->sax->ignorableWhitespace != NULL)
4335 ctxt->sax->ignorableWhitespace(
4336 ctxt->userData, &cur, 1);
4337 } else {
4338 htmlCheckParagraph(ctxt);
4339 if (ctxt->sax->characters != NULL)
4340 ctxt->sax->characters(
4341 ctxt->userData, &cur, 1);
4342 }
4343 }
4344 ctxt->token = 0;
4345 ctxt->checkIndex = 0;
4346 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004347 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004348 }
Owen Taylor3473f882001-02-23 17:55:21 +00004349 }
4350 if (avail < 2)
4351 goto done;
4352 cur = in->cur[0];
4353 next = in->cur[1];
4354 cons = ctxt->nbChars;
4355 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4356 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4357 /*
4358 * Handle SCRIPT/STYLE separately
4359 */
4360 if ((!terminate) &&
4361 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4362 goto done;
4363 htmlParseScript(ctxt);
4364 if ((cur == '<') && (next == '/')) {
4365 ctxt->instate = XML_PARSER_END_TAG;
4366 ctxt->checkIndex = 0;
4367#ifdef DEBUG_PUSH
4368 xmlGenericError(xmlGenericErrorContext,
4369 "HPP: entering END_TAG\n");
4370#endif
4371 break;
4372 }
4373 } else {
4374 /*
4375 * Sometimes DOCTYPE arrives in the middle of the document
4376 */
4377 if ((cur == '<') && (next == '!') &&
4378 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4379 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4380 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4381 (UPP(8) == 'E')) {
4382 if ((!terminate) &&
4383 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4384 goto done;
4385 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4386 ctxt->sax->error(ctxt->userData,
4387 "Misplaced DOCTYPE declaration\n");
4388 ctxt->wellFormed = 0;
4389 htmlParseDocTypeDecl(ctxt);
4390 } else if ((cur == '<') && (next == '!') &&
4391 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4392 if ((!terminate) &&
4393 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4394 goto done;
4395#ifdef DEBUG_PUSH
4396 xmlGenericError(xmlGenericErrorContext,
4397 "HPP: Parsing Comment\n");
4398#endif
4399 htmlParseComment(ctxt);
4400 ctxt->instate = XML_PARSER_CONTENT;
4401 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4402 goto done;
4403 } else if ((cur == '<') && (next == '/')) {
4404 ctxt->instate = XML_PARSER_END_TAG;
4405 ctxt->checkIndex = 0;
4406#ifdef DEBUG_PUSH
4407 xmlGenericError(xmlGenericErrorContext,
4408 "HPP: entering END_TAG\n");
4409#endif
4410 break;
4411 } else if (cur == '<') {
4412 ctxt->instate = XML_PARSER_START_TAG;
4413 ctxt->checkIndex = 0;
4414#ifdef DEBUG_PUSH
4415 xmlGenericError(xmlGenericErrorContext,
4416 "HPP: entering START_TAG\n");
4417#endif
4418 break;
4419 } else if (cur == '&') {
4420 if ((!terminate) &&
4421 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4422 goto done;
4423#ifdef DEBUG_PUSH
4424 xmlGenericError(xmlGenericErrorContext,
4425 "HPP: Parsing Reference\n");
4426#endif
4427 /* TODO: check generation of subtrees if noent !!! */
4428 htmlParseReference(ctxt);
4429 } else {
4430 /* TODO Avoid the extra copy, handle directly !!!!!! */
4431 /*
4432 * Goal of the following test is :
4433 * - minimize calls to the SAX 'character' callback
4434 * when they are mergeable
4435 */
4436 if ((ctxt->inputNr == 1) &&
4437 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4438 if ((!terminate) &&
4439 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4440 goto done;
4441 }
4442 ctxt->checkIndex = 0;
4443#ifdef DEBUG_PUSH
4444 xmlGenericError(xmlGenericErrorContext,
4445 "HPP: Parsing char data\n");
4446#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004447 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004448 }
4449 }
4450 if (cons == ctxt->nbChars) {
4451 if (ctxt->node != NULL) {
4452 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4453 ctxt->sax->error(ctxt->userData,
4454 "detected an error in element content\n");
4455 ctxt->wellFormed = 0;
4456 }
4457 NEXT;
4458 break;
4459 }
4460
4461 break;
4462 }
4463 case XML_PARSER_END_TAG:
4464 if (avail < 2)
4465 goto done;
4466 if ((!terminate) &&
4467 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4468 goto done;
4469 htmlParseEndTag(ctxt);
4470 if (ctxt->nameNr == 0) {
4471 ctxt->instate = XML_PARSER_EPILOG;
4472 } else {
4473 ctxt->instate = XML_PARSER_CONTENT;
4474 }
4475 ctxt->checkIndex = 0;
4476#ifdef DEBUG_PUSH
4477 xmlGenericError(xmlGenericErrorContext,
4478 "HPP: entering CONTENT\n");
4479#endif
4480 break;
4481 case XML_PARSER_CDATA_SECTION:
4482 xmlGenericError(xmlGenericErrorContext,
4483 "HPP: internal error, state == CDATA\n");
4484 ctxt->instate = XML_PARSER_CONTENT;
4485 ctxt->checkIndex = 0;
4486#ifdef DEBUG_PUSH
4487 xmlGenericError(xmlGenericErrorContext,
4488 "HPP: entering CONTENT\n");
4489#endif
4490 break;
4491 case XML_PARSER_DTD:
4492 xmlGenericError(xmlGenericErrorContext,
4493 "HPP: internal error, state == DTD\n");
4494 ctxt->instate = XML_PARSER_CONTENT;
4495 ctxt->checkIndex = 0;
4496#ifdef DEBUG_PUSH
4497 xmlGenericError(xmlGenericErrorContext,
4498 "HPP: entering CONTENT\n");
4499#endif
4500 break;
4501 case XML_PARSER_COMMENT:
4502 xmlGenericError(xmlGenericErrorContext,
4503 "HPP: internal error, state == COMMENT\n");
4504 ctxt->instate = XML_PARSER_CONTENT;
4505 ctxt->checkIndex = 0;
4506#ifdef DEBUG_PUSH
4507 xmlGenericError(xmlGenericErrorContext,
4508 "HPP: entering CONTENT\n");
4509#endif
4510 break;
4511 case XML_PARSER_PI:
4512 xmlGenericError(xmlGenericErrorContext,
4513 "HPP: internal error, state == PI\n");
4514 ctxt->instate = XML_PARSER_CONTENT;
4515 ctxt->checkIndex = 0;
4516#ifdef DEBUG_PUSH
4517 xmlGenericError(xmlGenericErrorContext,
4518 "HPP: entering CONTENT\n");
4519#endif
4520 break;
4521 case XML_PARSER_ENTITY_DECL:
4522 xmlGenericError(xmlGenericErrorContext,
4523 "HPP: internal error, state == ENTITY_DECL\n");
4524 ctxt->instate = XML_PARSER_CONTENT;
4525 ctxt->checkIndex = 0;
4526#ifdef DEBUG_PUSH
4527 xmlGenericError(xmlGenericErrorContext,
4528 "HPP: entering CONTENT\n");
4529#endif
4530 break;
4531 case XML_PARSER_ENTITY_VALUE:
4532 xmlGenericError(xmlGenericErrorContext,
4533 "HPP: internal error, state == ENTITY_VALUE\n");
4534 ctxt->instate = XML_PARSER_CONTENT;
4535 ctxt->checkIndex = 0;
4536#ifdef DEBUG_PUSH
4537 xmlGenericError(xmlGenericErrorContext,
4538 "HPP: entering DTD\n");
4539#endif
4540 break;
4541 case XML_PARSER_ATTRIBUTE_VALUE:
4542 xmlGenericError(xmlGenericErrorContext,
4543 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4544 ctxt->instate = XML_PARSER_START_TAG;
4545 ctxt->checkIndex = 0;
4546#ifdef DEBUG_PUSH
4547 xmlGenericError(xmlGenericErrorContext,
4548 "HPP: entering START_TAG\n");
4549#endif
4550 break;
4551 case XML_PARSER_SYSTEM_LITERAL:
4552 xmlGenericError(xmlGenericErrorContext,
4553 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4554 ctxt->instate = XML_PARSER_CONTENT;
4555 ctxt->checkIndex = 0;
4556#ifdef DEBUG_PUSH
4557 xmlGenericError(xmlGenericErrorContext,
4558 "HPP: entering CONTENT\n");
4559#endif
4560 break;
4561 case XML_PARSER_IGNORE:
4562 xmlGenericError(xmlGenericErrorContext,
4563 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4564 ctxt->instate = XML_PARSER_CONTENT;
4565 ctxt->checkIndex = 0;
4566#ifdef DEBUG_PUSH
4567 xmlGenericError(xmlGenericErrorContext,
4568 "HPP: entering CONTENT\n");
4569#endif
4570 break;
4571 }
4572 }
4573done:
4574 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004575 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004576 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4577 /*
4578 * SAX: end of the document processing.
4579 */
4580 ctxt->instate = XML_PARSER_EOF;
4581 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4582 ctxt->sax->endDocument(ctxt->userData);
4583 }
4584 }
4585 if ((ctxt->myDoc != NULL) &&
4586 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4587 (ctxt->instate == XML_PARSER_EPILOG))) {
4588 xmlDtdPtr dtd;
4589 dtd = xmlGetIntSubset(ctxt->myDoc);
4590 if (dtd == NULL)
4591 ctxt->myDoc->intSubset =
4592 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4593 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4594 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4595 }
4596#ifdef DEBUG_PUSH
4597 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4598#endif
4599 return(ret);
4600}
4601
4602/**
Owen Taylor3473f882001-02-23 17:55:21 +00004603 * htmlParseChunk:
4604 * @ctxt: an XML parser context
4605 * @chunk: an char array
4606 * @size: the size in byte of the chunk
4607 * @terminate: last chunk indicator
4608 *
4609 * Parse a Chunk of memory
4610 *
4611 * Returns zero if no error, the xmlParserErrors otherwise.
4612 */
4613int
4614htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4615 int terminate) {
4616 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4617 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4618 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4619 int cur = ctxt->input->cur - ctxt->input->base;
4620
4621 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4622 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4623 ctxt->input->cur = ctxt->input->base + cur;
4624#ifdef DEBUG_PUSH
4625 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4626#endif
4627
4628 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4629 htmlParseTryOrFinish(ctxt, terminate);
4630 } else if (ctxt->instate != XML_PARSER_EOF) {
4631 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4632 htmlParseTryOrFinish(ctxt, terminate);
4633 }
4634 if (terminate) {
4635 if ((ctxt->instate != XML_PARSER_EOF) &&
4636 (ctxt->instate != XML_PARSER_EPILOG) &&
4637 (ctxt->instate != XML_PARSER_MISC)) {
4638 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004639 ctxt->wellFormed = 0;
4640 }
4641 if (ctxt->instate != XML_PARSER_EOF) {
4642 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4643 ctxt->sax->endDocument(ctxt->userData);
4644 }
4645 ctxt->instate = XML_PARSER_EOF;
4646 }
4647 return((xmlParserErrors) ctxt->errNo);
4648}
4649
4650/************************************************************************
4651 * *
4652 * User entry points *
4653 * *
4654 ************************************************************************/
4655
4656/**
4657 * htmlCreatePushParserCtxt :
4658 * @sax: a SAX handler
4659 * @user_data: The user data returned on SAX callbacks
4660 * @chunk: a pointer to an array of chars
4661 * @size: number of chars in the array
4662 * @filename: an optional file name or URI
4663 * @enc: an optional encoding
4664 *
4665 * Create a parser context for using the HTML parser in push mode
4666 * To allow content encoding detection, @size should be >= 4
4667 * The value of @filename is used for fetching external entities
4668 * and error/warning reports.
4669 *
4670 * Returns the new parser context or NULL
4671 */
4672htmlParserCtxtPtr
4673htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4674 const char *chunk, int size, const char *filename,
4675 xmlCharEncoding enc) {
4676 htmlParserCtxtPtr ctxt;
4677 htmlParserInputPtr inputStream;
4678 xmlParserInputBufferPtr buf;
4679
Daniel Veillardd0463562001-10-13 09:15:48 +00004680 xmlInitParser();
4681
Owen Taylor3473f882001-02-23 17:55:21 +00004682 buf = xmlAllocParserInputBuffer(enc);
4683 if (buf == NULL) return(NULL);
4684
4685 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4686 if (ctxt == NULL) {
4687 xmlFree(buf);
4688 return(NULL);
4689 }
4690 memset(ctxt, 0, sizeof(htmlParserCtxt));
4691 htmlInitParserCtxt(ctxt);
4692 if (sax != NULL) {
4693 if (ctxt->sax != &htmlDefaultSAXHandler)
4694 xmlFree(ctxt->sax);
4695 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4696 if (ctxt->sax == NULL) {
4697 xmlFree(buf);
4698 xmlFree(ctxt);
4699 return(NULL);
4700 }
4701 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4702 if (user_data != NULL)
4703 ctxt->userData = user_data;
4704 }
4705 if (filename == NULL) {
4706 ctxt->directory = NULL;
4707 } else {
4708 ctxt->directory = xmlParserGetDirectory(filename);
4709 }
4710
4711 inputStream = htmlNewInputStream(ctxt);
4712 if (inputStream == NULL) {
4713 xmlFreeParserCtxt(ctxt);
4714 return(NULL);
4715 }
4716
4717 if (filename == NULL)
4718 inputStream->filename = NULL;
4719 else
4720 inputStream->filename = xmlMemStrdup(filename);
4721 inputStream->buf = buf;
4722 inputStream->base = inputStream->buf->buffer->content;
4723 inputStream->cur = inputStream->buf->buffer->content;
4724
4725 inputPush(ctxt, inputStream);
4726
4727 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4728 (ctxt->input->buf != NULL)) {
4729 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4730#ifdef DEBUG_PUSH
4731 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4732#endif
4733 }
4734
4735 return(ctxt);
4736}
4737
4738/**
4739 * htmlSAXParseDoc :
4740 * @cur: a pointer to an array of xmlChar
4741 * @encoding: a free form C string describing the HTML document encoding, or NULL
4742 * @sax: the SAX handler block
4743 * @userData: if using SAX, this pointer will be provided on callbacks.
4744 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004745 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4746 * to handle parse events. If sax is NULL, fallback to the default DOM
4747 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004748 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004749 * Returns the resulting document tree unless SAX is NULL or the document is
4750 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004751 */
4752
4753htmlDocPtr
4754htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4755 htmlDocPtr ret;
4756 htmlParserCtxtPtr ctxt;
4757
Daniel Veillardd0463562001-10-13 09:15:48 +00004758 xmlInitParser();
4759
Owen Taylor3473f882001-02-23 17:55:21 +00004760 if (cur == NULL) return(NULL);
4761
4762
4763 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4764 if (ctxt == NULL) return(NULL);
4765 if (sax != NULL) {
4766 ctxt->sax = sax;
4767 ctxt->userData = userData;
4768 }
4769
4770 htmlParseDocument(ctxt);
4771 ret = ctxt->myDoc;
4772 if (sax != NULL) {
4773 ctxt->sax = NULL;
4774 ctxt->userData = NULL;
4775 }
4776 htmlFreeParserCtxt(ctxt);
4777
4778 return(ret);
4779}
4780
4781/**
4782 * htmlParseDoc :
4783 * @cur: a pointer to an array of xmlChar
4784 * @encoding: a free form C string describing the HTML document encoding, or NULL
4785 *
4786 * parse an HTML in-memory document and build a tree.
4787 *
4788 * Returns the resulting document tree
4789 */
4790
4791htmlDocPtr
4792htmlParseDoc(xmlChar *cur, const char *encoding) {
4793 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4794}
4795
4796
4797/**
4798 * htmlCreateFileParserCtxt :
4799 * @filename: the filename
4800 * @encoding: a free form C string describing the HTML document encoding, or NULL
4801 *
4802 * Create a parser context for a file content.
4803 * Automatic support for ZLIB/Compress compressed document is provided
4804 * by default if found at compile-time.
4805 *
4806 * Returns the new parser context or NULL
4807 */
4808htmlParserCtxtPtr
4809htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4810{
4811 htmlParserCtxtPtr ctxt;
4812 htmlParserInputPtr inputStream;
4813 xmlParserInputBufferPtr buf;
4814 /* htmlCharEncoding enc; */
4815 xmlChar *content, *content_line = (xmlChar *) "charset=";
4816
4817 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4818 if (buf == NULL) return(NULL);
4819
4820 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4821 if (ctxt == NULL) {
4822 perror("malloc");
4823 return(NULL);
4824 }
4825 memset(ctxt, 0, sizeof(htmlParserCtxt));
4826 htmlInitParserCtxt(ctxt);
4827 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4828 if (inputStream == NULL) {
4829 perror("malloc");
4830 xmlFree(ctxt);
4831 return(NULL);
4832 }
4833 memset(inputStream, 0, sizeof(htmlParserInput));
4834
4835 inputStream->filename = xmlMemStrdup(filename);
4836 inputStream->line = 1;
4837 inputStream->col = 1;
4838 inputStream->buf = buf;
4839 inputStream->directory = NULL;
4840
4841 inputStream->base = inputStream->buf->buffer->content;
4842 inputStream->cur = inputStream->buf->buffer->content;
4843 inputStream->free = NULL;
4844
4845 inputPush(ctxt, inputStream);
4846
4847 /* set encoding */
4848 if (encoding) {
4849 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4850 if (content) {
4851 strcpy ((char *)content, (char *)content_line);
4852 strcat ((char *)content, (char *)encoding);
4853 htmlCheckEncoding (ctxt, content);
4854 xmlFree (content);
4855 }
4856 }
4857
4858 return(ctxt);
4859}
4860
4861/**
4862 * htmlSAXParseFile :
4863 * @filename: the filename
4864 * @encoding: a free form C string describing the HTML document encoding, or NULL
4865 * @sax: the SAX handler block
4866 * @userData: if using SAX, this pointer will be provided on callbacks.
4867 *
4868 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4869 * compressed document is provided by default if found at compile-time.
4870 * It use the given SAX function block to handle the parsing callback.
4871 * If sax is NULL, fallback to the default DOM tree building routines.
4872 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004873 * Returns the resulting document tree unless SAX is NULL or the document is
4874 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004875 */
4876
4877htmlDocPtr
4878htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4879 void *userData) {
4880 htmlDocPtr ret;
4881 htmlParserCtxtPtr ctxt;
4882 htmlSAXHandlerPtr oldsax = NULL;
4883
Daniel Veillardd0463562001-10-13 09:15:48 +00004884 xmlInitParser();
4885
Owen Taylor3473f882001-02-23 17:55:21 +00004886 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4887 if (ctxt == NULL) return(NULL);
4888 if (sax != NULL) {
4889 oldsax = ctxt->sax;
4890 ctxt->sax = sax;
4891 ctxt->userData = userData;
4892 }
4893
4894 htmlParseDocument(ctxt);
4895
4896 ret = ctxt->myDoc;
4897 if (sax != NULL) {
4898 ctxt->sax = oldsax;
4899 ctxt->userData = NULL;
4900 }
4901 htmlFreeParserCtxt(ctxt);
4902
4903 return(ret);
4904}
4905
4906/**
4907 * htmlParseFile :
4908 * @filename: the filename
4909 * @encoding: a free form C string describing the HTML document encoding, or NULL
4910 *
4911 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4912 * compressed document is provided by default if found at compile-time.
4913 *
4914 * Returns the resulting document tree
4915 */
4916
4917htmlDocPtr
4918htmlParseFile(const char *filename, const char *encoding) {
4919 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4920}
4921
4922/**
4923 * htmlHandleOmittedElem:
4924 * @val: int 0 or 1
4925 *
4926 * Set and return the previous value for handling HTML omitted tags.
4927 *
4928 * Returns the last value for 0 for no handling, 1 for auto insertion.
4929 */
4930
4931int
4932htmlHandleOmittedElem(int val) {
4933 int old = htmlOmittedDefaultValue;
4934
4935 htmlOmittedDefaultValue = val;
4936 return(old);
4937}
4938
4939#endif /* LIBXML_HTML_ENABLED */