blob: 4b0193ea01618daa88a02d6333506d54287d04fd [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
43
44#define HTML_MAX_NAMELEN 1000
45#define HTML_PARSER_BIG_BUFFER_SIZE 1000
46#define HTML_PARSER_BUFFER_SIZE 100
47
48/* #define DEBUG */
49/* #define DEBUG_PUSH */
50
Daniel Veillard22090732001-07-16 00:06:07 +000051static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000052
Daniel Veillard56a4cb82001-03-24 17:00:36 +000053xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
54 xmlChar end, xmlChar end2, xmlChar end3);
55
56/************************************************************************
57 * *
Owen Taylor3473f882001-02-23 17:55:21 +000058 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
66#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
68 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
70 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
71 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
73 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
75 return(0); \
76 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
82scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
83 type ret; \
84 if (ctxt->name##Nr < 0) return(0); \
85 ctxt->name##Nr--; \
86 if (ctxt->name##Nr < 0) return(0); \
87 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillard56a4cb82001-03-24 17:00:36 +000096/* PUSH_AND_POP(static, xmlNodePtr, node) */
97PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000098
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
126
/*
 * Every macro below implicitly uses a variable named `ctxt` that must be
 * in scope at the point of use.
 *
 * Expression macros are fully parenthesized so they keep their meaning
 * inside larger expressions; macro arguments are parenthesized as well.
 * SKIP() and NEXTL() use their argument more than once or after other
 * side effects, so do not pass expressions with side effects.
 */

/* Upper-cased current byte. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes and account for them in nbChars. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* Byte located val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Upper-cased byte located val positions after the current one. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Current input pointer (usable as an lvalue). */
#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current byte as an int; no multi-byte decoding is done here. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Consume one character that is l bytes long, keeping the line/column
 * counters up to date and clearing any pending token.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* l must be an int lvalue; it receives the length of the character. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append the character v (l bytes long once encoded) to buffer b at index
 * i and advance i. Wrapped in do/while so that it behaves as a single
 * statement, including inside an unbraced if/else.
 */
#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));                         \
  } while (0)
175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000187 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000188 */
189
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000190static int
Owen Taylor3473f882001-02-23 17:55:21 +0000191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000269 * Assume it's a fixed length encoding (1) with
Owen Taylor3473f882001-02-23 17:55:21 +0000270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
306/**
Owen Taylor3473f882001-02-23 17:55:21 +0000307 * htmlSkipBlankChars:
308 * @ctxt: the HTML parser context
309 *
310 * skip all blanks character found at that point in the input streams.
311 *
312 * Returns the number of space chars skipped
313 */
314
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000315static int
Owen Taylor3473f882001-02-23 17:55:21 +0000316htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
317 int res = 0;
318
319 while (IS_BLANK(*(ctxt->input->cur))) {
320 if ((*ctxt->input->cur == 0) &&
321 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
322 xmlPopInput(ctxt);
323 } else {
324 if (*(ctxt->input->cur) == '\n') {
325 ctxt->input->line++; ctxt->input->col = 1;
326 } else ctxt->input->col++;
327 ctxt->input->cur++;
328 ctxt->nbChars++;
329 if (*ctxt->input->cur == 0)
330 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
331 }
332 res++;
333 }
334 return(res);
335}
336
337
338
339/************************************************************************
340 * *
341 * The list of HTML elements and their properties *
342 * *
343 ************************************************************************/
344
345/*
346 * Start Tag: 1 means the start tag can be ommited
347 * End Tag: 1 means the end tag can be ommited
348 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000349 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000350 * Depr: this element is deprecated
351 * DTD: 1 means that this element is valid only in the Loose DTD
352 * 2 means that this element is valid only in the Frameset DTD
353 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000354 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000355 */
Daniel Veillard22090732001-07-16 00:06:07 +0000356static const htmlElemDesc
357html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000358{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
359{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
360{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
361{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
362{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
363{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
364{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
365{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
366{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
367{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
368{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
369{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
370{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
371{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
372{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
373{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
374{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
375{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
376{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
377{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
378{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
379{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
380{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
381{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
382{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
383{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
384{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
385{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
386{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
387{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
388{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
389{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
390{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
391{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
392{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
393{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
394{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
399{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
400{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
401{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
402{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
403{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
404{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
405{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
406{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
407{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
408{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
409{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
410{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
411{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
412{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
413{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
414{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
415{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
416{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
417{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
418{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
419{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
420{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
421{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
422{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
423{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
424{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
425{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
426{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
427{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
428{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
429{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
430{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
431{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
432{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
433{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
434{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
435{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
436{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
437{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
438{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
439{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
440{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
441{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
442{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
443{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
444{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
445{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
446{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
447{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
448{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000449};
450
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups, itself closed by a
 * final NULL. In each group the first name is a start tag and the names
 * following it are the open elements that this start tag closes.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
510
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
524
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The array has no NULL terminator; its length is its sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", which never matched */
    "onchange",
    "onselect"
};
550
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The entry with a NULL name ends the table; its priority is the one
 * used for every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000578
/*
 * Lookup index over htmlStartClose: once htmlInitAutoClose() has run, each
 * used slot points at the first entry of one group and the unused slots
 * are NULL.
 */
static const char **htmlStartCloseIndex[100];

/* Non-zero once htmlStartCloseIndex has been filled in. */
static int htmlStartCloseIndexinitialized = 0;
581
582/************************************************************************
583 * *
584 * functions to handle HTML specific data *
585 * *
586 ************************************************************************/
587
588/**
589 * htmlInitAutoClose:
590 *
591 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
592 * This is not reentrant. Call xmlInitParser() once before processing in
593 * case of use in multithreaded programs.
594 */
595void
596htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000597 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000598
599 if (htmlStartCloseIndexinitialized) return;
600
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000601 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
602 indx = 0;
603 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
604 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000605 while (htmlStartClose[i] != NULL) i++;
606 i++;
607 }
608 htmlStartCloseIndexinitialized = 1;
609}
610
611/**
612 * htmlTagLookup:
613 * @tag: The tag name in lowercase
614 *
615 * Lookup the HTML tag in the ElementTable
616 *
617 * Returns the related htmlElemDescPtr or NULL if not found.
618 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000619const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000620htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000621 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000622
623 for (i = 0; i < (sizeof(html40ElementTable) /
624 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000625 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000626 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000627 }
628 return(NULL);
629}
630
631/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000632 * htmlGetEndPriority:
633 * @name: The name of the element to look up the priority for.
634 *
635 * Return value: The "endtag" priority.
636 **/
637static int
638htmlGetEndPriority (const xmlChar *name) {
639 int i = 0;
640
641 while ((htmlEndPriority[i].name != NULL) &&
642 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
643 i++;
644
645 return(htmlEndPriority[i].priority);
646}
647
648/**
Owen Taylor3473f882001-02-23 17:55:21 +0000649 * htmlCheckAutoClose:
650 * @newtag: The new tag name
651 * @oldtag: The old tag name
652 *
653 * Checks wether the new tag is one of the registered valid tags for closing old.
654 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
655 *
656 * Returns 0 if no, 1 if yes.
657 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000658static int
Owen Taylor3473f882001-02-23 17:55:21 +0000659htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000660 int i, indx;
661 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000662
663 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
664
665 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000666 for (indx = 0; indx < 100;indx++) {
667 closed = htmlStartCloseIndex[indx];
668 if (closed == NULL) return(0);
669 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000670 }
671
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000672 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000673 i++;
674 while (htmlStartClose[i] != NULL) {
675 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
676 return(1);
677 }
678 i++;
679 }
680 return(0);
681}
682
683/**
684 * htmlAutoCloseOnClose:
685 * @ctxt: an HTML parser context
686 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000687 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000688 *
689 * The HTmL DtD allows an ending tag to implicitely close other tags.
690 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000691static void
Owen Taylor3473f882001-02-23 17:55:21 +0000692htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000693 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000694 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000695 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000696
697#ifdef DEBUG
698 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
699 for (i = 0;i < ctxt->nameNr;i++)
700 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
701#endif
702
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000703 priority = htmlGetEndPriority (newtag);
704
Owen Taylor3473f882001-02-23 17:55:21 +0000705 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000706
Owen Taylor3473f882001-02-23 17:55:21 +0000707 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000708 /*
709 * A missplaced endtagad can only close elements with lower
710 * or equal priority, so if we find an element with higher
711 * priority before we find an element with
712 * matching name, we just ignore this endtag
713 */
714 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000715 }
716 if (i < 0) return;
717
718 while (!xmlStrEqual(newtag, ctxt->name)) {
719 info = htmlTagLookup(ctxt->name);
720 if ((info == NULL) || (info->endTag == 1)) {
721#ifdef DEBUG
722 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
723#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000724 } else if (info->endTag == 3) {
725#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000726 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000727
Daniel Veillard56098d42001-04-24 12:51:09 +0000728#endif
729 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
730 ctxt->sax->error(ctxt->userData,
731 "Opening and ending tag mismatch: %s and %s\n",
732 newtag, ctxt->name);
733 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000734 }
735 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
736 ctxt->sax->endElement(ctxt->userData, ctxt->name);
737 oldname = htmlnamePop(ctxt);
738 if (oldname != NULL) {
739#ifdef DEBUG
740 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
741#endif
742 xmlFree(oldname);
743 }
744 }
745}
746
747/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000748 * htmlAutoCloseOnEnd:
749 * @ctxt: an HTML parser context
750 *
751 * Close all remaining tags at the end of the stream
752 */
753static void
754htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
755 xmlChar *oldname;
756 int i;
757
758 if (ctxt->nameNr == 0)
759 return;
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
762#endif
763
764 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
765#ifdef DEBUG
766 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
767#endif
768 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
769 ctxt->sax->endElement(ctxt->userData, ctxt->name);
770 oldname = htmlnamePop(ctxt);
771 if (oldname != NULL) {
772#ifdef DEBUG
773 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
774#endif
775 xmlFree(oldname);
776 }
777 }
778}
779
780/**
Owen Taylor3473f882001-02-23 17:55:21 +0000781 * htmlAutoClose:
782 * @ctxt: an HTML parser context
783 * @newtag: The new tag name or NULL
784 *
785 * The HTmL DtD allows a tag to implicitely close other tags.
786 * The list is kept in htmlStartClose array. This function is
787 * called when a new tag has been detected and generates the
788 * appropriates closes if possible/needed.
789 * If newtag is NULL this mean we are at the end of the resource
790 * and we should check
791 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000792static void
Owen Taylor3473f882001-02-23 17:55:21 +0000793htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
794 xmlChar *oldname;
795 while ((newtag != NULL) && (ctxt->name != NULL) &&
796 (htmlCheckAutoClose(newtag, ctxt->name))) {
797#ifdef DEBUG
798 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
799#endif
800 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
801 ctxt->sax->endElement(ctxt->userData, ctxt->name);
802 oldname = htmlnamePop(ctxt);
803 if (oldname != NULL) {
804#ifdef DEBUG
805 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
806#endif
807 xmlFree(oldname);
808 }
809 }
810 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000811 htmlAutoCloseOnEnd(ctxt);
812 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000813 }
814 while ((newtag == NULL) && (ctxt->name != NULL) &&
815 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
816 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
817 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
818#ifdef DEBUG
819 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
820#endif
821 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
822 ctxt->sax->endElement(ctxt->userData, ctxt->name);
823 oldname = htmlnamePop(ctxt);
824 if (oldname != NULL) {
825#ifdef DEBUG
826 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
827#endif
828 xmlFree(oldname);
829 }
830 }
831
832}
833
834/**
835 * htmlAutoCloseTag:
836 * @doc: the HTML document
837 * @name: The tag name
838 * @elem: the HTML element
839 *
840 * The HTmL DtD allows a tag to implicitely close other tags.
841 * The list is kept in htmlStartClose array. This function checks
842 * if the element or one of it's children would autoclose the
843 * given tag.
844 *
845 * Returns 1 if autoclose, 0 otherwise
846 */
847int
848htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
849 htmlNodePtr child;
850
851 if (elem == NULL) return(1);
852 if (xmlStrEqual(name, elem->name)) return(0);
853 if (htmlCheckAutoClose(elem->name, name)) return(1);
854 child = elem->children;
855 while (child != NULL) {
856 if (htmlAutoCloseTag(doc, name, child)) return(1);
857 child = child->next;
858 }
859 return(0);
860}
861
862/**
863 * htmlIsAutoClosed:
864 * @doc: the HTML document
865 * @elem: the HTML element
866 *
867 * The HTmL DtD allows a tag to implicitely close other tags.
868 * The list is kept in htmlStartClose array. This function checks
869 * if a tag is autoclosed by one of it's child
870 *
871 * Returns 1 if autoclosed, 0 otherwise
872 */
873int
874htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
875 htmlNodePtr child;
876
877 if (elem == NULL) return(1);
878 child = elem->children;
879 while (child != NULL) {
880 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
881 child = child->next;
882 }
883 return(0);
884}
885
886/**
887 * htmlCheckImplied:
888 * @ctxt: an HTML parser context
889 * @newtag: The new tag name
890 *
891 * The HTML DtD allows a tag to exists only implicitely
892 * called when a new tag has been detected and generates the
893 * appropriates implicit tags if missing
894 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000895static void
Owen Taylor3473f882001-02-23 17:55:21 +0000896htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
897 if (!htmlOmittedDefaultValue)
898 return;
899 if (xmlStrEqual(newtag, BAD_CAST"html"))
900 return;
901 if (ctxt->nameNr <= 0) {
902#ifdef DEBUG
903 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
904#endif
905 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
906 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
907 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
908 }
909 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
910 return;
911 if ((ctxt->nameNr <= 1) &&
912 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
913 (xmlStrEqual(newtag, BAD_CAST"style")) ||
914 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
915 (xmlStrEqual(newtag, BAD_CAST"link")) ||
916 (xmlStrEqual(newtag, BAD_CAST"title")) ||
917 (xmlStrEqual(newtag, BAD_CAST"base")))) {
918 /*
919 * dropped OBJECT ... i you put it first BODY will be
920 * assumed !
921 */
922#ifdef DEBUG
923 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
924#endif
925 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
926 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
927 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
928 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
929 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
930 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
931 int i;
932 for (i = 0;i < ctxt->nameNr;i++) {
933 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
934 return;
935 }
936 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
937 return;
938 }
939 }
940
941#ifdef DEBUG
942 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
943#endif
944 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
945 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
946 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
947 }
948}
949
950/**
951 * htmlCheckParagraph
952 * @ctxt: an HTML parser context
953 *
954 * Check whether a p element need to be implied before inserting
955 * characters in the current element.
956 *
957 * Returns 1 if a paragraph has been inserted, 0 if not and -1
958 * in case of error.
959 */
960
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000961static int
Owen Taylor3473f882001-02-23 17:55:21 +0000962htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
963 const xmlChar *tag;
964 int i;
965
966 if (ctxt == NULL)
967 return(-1);
968 tag = ctxt->name;
969 if (tag == NULL) {
970 htmlAutoClose(ctxt, BAD_CAST"p");
971 htmlCheckImplied(ctxt, BAD_CAST"p");
972 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
973 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
974 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
975 return(1);
976 }
977 if (!htmlOmittedDefaultValue)
978 return(0);
979 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
980 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
981#ifdef DEBUG
982 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
983#endif
984 htmlAutoClose(ctxt, BAD_CAST"p");
985 htmlCheckImplied(ctxt, BAD_CAST"p");
986 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
987 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
988 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
989 return(1);
990 }
991 }
992 return(0);
993}
994
995/**
996 * htmlIsScriptAttribute:
997 * @name: an attribute name
998 *
999 * Check if an attribute is of content type Script
1000 *
1001 * Returns 1 is the attribute is a script 0 otherwise
1002 */
1003int
1004htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001005 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001006
1007 if (name == NULL)
1008 return(0);
1009 /*
1010 * all script attributes start with 'on'
1011 */
1012 if ((name[0] != 'o') || (name[1] != 'n'))
1013 return(0);
1014 for (i = 0;
1015 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1016 i++) {
1017 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1018 return(1);
1019 }
1020 return(0);
1021}
1022
1023/************************************************************************
1024 * *
1025 * The list of HTML predefined entities *
1026 * *
1027 ************************************************************************/
1028
1029
Daniel Veillard22090732001-07-16 00:06:07 +00001030static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001031/*
1032 * the 4 absolute ones, plus apostrophe.
1033 */
1034{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1035{ 38, "amp", "ampersand, U+0026 ISOnum" },
1036{ 39, "apos", "single quote" },
1037{ 60, "lt", "less-than sign, U+003C ISOnum" },
1038{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1039
1040/*
1041 * A bunch still in the 128-255 range
1042 * Replacing them depend really on the charset used.
1043 */
1044{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1045{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1046{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1047{ 163, "pound","pound sign, U+00A3 ISOnum" },
1048{ 164, "curren","currency sign, U+00A4 ISOnum" },
1049{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1050{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1051{ 167, "sect", "section sign, U+00A7 ISOnum" },
1052{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1053{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1054{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1055{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1056{ 172, "not", "not sign, U+00AC ISOnum" },
1057{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1058{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1059{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1060{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1061{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1062{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1063{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1064{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1065{ 181, "micro","micro sign, U+00B5 ISOnum" },
1066{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1067{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1068{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1069{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1070{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1071{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1072{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1073{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1074{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1075{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1076{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1077{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1078{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1079{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1080{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1081{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1082{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1083{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1084{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1085{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1086{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1087{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1088{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1089{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1090{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1091{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1092{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1093{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1094{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1095{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1096{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1097{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1098{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1099{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1100{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1101{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1102{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1103{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1104{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1105{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1106{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1107{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1108{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1109{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1110{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1111{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1112{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1113{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1114{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1115{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1116{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1117{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1118{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1119{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1120{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1121{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1122{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1123{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1124{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1125{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1126{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1127{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1128{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1129{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1130{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1131{ 247, "divide","division sign, U+00F7 ISOnum" },
1132{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1133{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1134{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1135{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1136{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1137{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1138{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1139{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1140
1141{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1142{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1143{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1144{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1145{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1146
1147/*
1148 * Anything below should really be kept as entities references
1149 */
1150{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1151
1152{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1153{ 732, "tilde","small tilde, U+02DC ISOdia" },
1154
1155{ 913, "Alpha","greek capital letter alpha, U+0391" },
1156{ 914, "Beta", "greek capital letter beta, U+0392" },
1157{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1158{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1159{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1160{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1161{ 919, "Eta", "greek capital letter eta, U+0397" },
1162{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1163{ 921, "Iota", "greek capital letter iota, U+0399" },
1164{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001165{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001166{ 924, "Mu", "greek capital letter mu, U+039C" },
1167{ 925, "Nu", "greek capital letter nu, U+039D" },
1168{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1169{ 927, "Omicron","greek capital letter omicron, U+039F" },
1170{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1171{ 929, "Rho", "greek capital letter rho, U+03A1" },
1172{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1173{ 932, "Tau", "greek capital letter tau, U+03A4" },
1174{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1175{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1176{ 935, "Chi", "greek capital letter chi, U+03A7" },
1177{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1178{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1179
1180{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1181{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1182{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1183{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1184{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1185{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1186{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1187{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1188{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1189{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1190{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1191{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1192{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1193{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1194{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1195{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1196{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1197{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1198{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1199{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1200{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1201{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1202{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1203{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1204{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1205{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1206{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1207{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1208
1209{ 8194, "ensp", "en space, U+2002 ISOpub" },
1210{ 8195, "emsp", "em space, U+2003 ISOpub" },
1211{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1212{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1213{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1214{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1215{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1216{ 8211, "ndash","en dash, U+2013 ISOpub" },
1217{ 8212, "mdash","em dash, U+2014 ISOpub" },
1218{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1219{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1220{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1221{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1222{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1223{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1224{ 8224, "dagger","dagger, U+2020 ISOpub" },
1225{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1226
1227{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1228{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1229
1230{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1231
1232{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1233{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1234
1235{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1236{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1237
1238{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1239{ 8260, "frasl","fraction slash, U+2044 NEW" },
1240
1241{ 8364, "euro", "euro sign, U+20AC NEW" },
1242
1243{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1244{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1245{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1246{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1247{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1248{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1249{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1250{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1251{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1252{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1253{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1254{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1255{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1256{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1257{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1258{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1259
1260{ 8704, "forall","for all, U+2200 ISOtech" },
1261{ 8706, "part", "partial differential, U+2202 ISOtech" },
1262{ 8707, "exist","there exists, U+2203 ISOtech" },
1263{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1264{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1265{ 8712, "isin", "element of, U+2208 ISOtech" },
1266{ 8713, "notin","not an element of, U+2209 ISOtech" },
1267{ 8715, "ni", "contains as member, U+220B ISOtech" },
1268{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1269{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1270{ 8722, "minus","minus sign, U+2212 ISOtech" },
1271{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1272{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1273{ 8733, "prop", "proportional to, U+221D ISOtech" },
1274{ 8734, "infin","infinity, U+221E ISOtech" },
1275{ 8736, "ang", "angle, U+2220 ISOamso" },
1276{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1277{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1278{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1279{ 8746, "cup", "union = cup, U+222A ISOtech" },
1280{ 8747, "int", "integral, U+222B ISOtech" },
1281{ 8756, "there4","therefore, U+2234 ISOtech" },
1282{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1283{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1284{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1285{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1286{ 8801, "equiv","identical to, U+2261 ISOtech" },
1287{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1288{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1289{ 8834, "sub", "subset of, U+2282 ISOtech" },
1290{ 8835, "sup", "superset of, U+2283 ISOtech" },
1291{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1292{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1293{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1294{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1295{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1296{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1297{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1298{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1299{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1300{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1301{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1302{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1303{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1304{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1305
1306{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1307{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1308{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1309{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1310
1311};
1312
1313/************************************************************************
1314 * *
1315 * Commodity functions to handle entities *
1316 * *
1317 ************************************************************************/
1318
/*
 * Macro used to grow the current buffer.
 *
 * Expects two variables in the calling scope: the xmlChar pointer
 * `buffer' and its capacity `buffer_size' (built by token pasting).
 * The capacity is doubled and the buffer reallocated accordingly.
 *
 * The result of xmlRealloc() goes through a temporary so that the
 * previous block is not lost on failure: in that case the old block
 * is released, `buffer' is reset to NULL and the ENCLOSING function
 * returns NULL. The macro can therefore only be used in functions
 * returning a pointer.
 *
 * The macro expands to a plain brace block, as it always did, so that
 * existing call sites keep compiling whether or not they are followed
 * by a semicolon.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp_;						\
    growBuffer_tmp_ = (xmlChar *) xmlRealloc(buffer,			\
                          buffer##_size * 2 * sizeof(xmlChar));	\
    if (growBuffer_tmp_ == NULL) {					\
        perror("realloc failed");					\
        if (buffer != NULL)						\
            xmlFree(buffer);						\
        buffer = NULL;							\
        return(NULL);							\
    }									\
    buffer = growBuffer_tmp_;						\
    buffer##_size *= 2;							\
}
1330
1331/**
1332 * htmlEntityLookup:
1333 * @name: the entity name
1334 *
1335 * Lookup the given entity in EntitiesTable
1336 *
1337 * TODO: the linear scan is really ugly, an hash table is really needed.
1338 *
1339 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1340 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001341const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001342htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001343 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001344
1345 for (i = 0;i < (sizeof(html40EntitiesTable)/
1346 sizeof(html40EntitiesTable[0]));i++) {
1347 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1348#ifdef DEBUG
1349 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1350#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001351 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001352 }
1353 }
1354 return(NULL);
1355}
1356
1357/**
1358 * htmlEntityValueLookup:
1359 * @value: the entity's unicode value
1360 *
1361 * Lookup the given entity in EntitiesTable
1362 *
1363 * TODO: the linear scan is really ugly, an hash table is really needed.
1364 *
1365 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1366 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001367const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001368htmlEntityValueLookup(unsigned int value) {
1369 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001370#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001371 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001372#endif
1373
1374 for (i = 0;i < (sizeof(html40EntitiesTable)/
1375 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001376 if (html40EntitiesTable[i].value >= value) {
1377 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001378 break;
1379#ifdef DEBUG
1380 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1381#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001382 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001383 }
1384#ifdef DEBUG
1385 if (lv > html40EntitiesTable[i].value) {
1386 xmlGenericError(xmlGenericErrorContext,
1387 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1388 lv, html40EntitiesTable[i].value);
1389 }
1390 lv = html40EntitiesTable[i].value;
1391#endif
1392 }
1393 return(NULL);
1394}
1395
1396/**
1397 * UTF8ToHtml:
1398 * @out: a pointer to an array of bytes to store the result
1399 * @outlen: the length of @out
1400 * @in: a pointer to an array of UTF-8 chars
1401 * @inlen: the length of @in
1402 *
1403 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1404 * plus HTML entities block of chars out.
1405 *
1406 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1407 * The value of @inlen after return is the number of octets consumed
1408 * as the return value is positive, else unpredictiable.
1409 * The value of @outlen after return is the number of octets consumed.
1410 */
1411int
1412UTF8ToHtml(unsigned char* out, int *outlen,
1413 const unsigned char* in, int *inlen) {
1414 const unsigned char* processed = in;
1415 const unsigned char* outend;
1416 const unsigned char* outstart = out;
1417 const unsigned char* instart = in;
1418 const unsigned char* inend;
1419 unsigned int c, d;
1420 int trailing;
1421
1422 if (in == NULL) {
1423 /*
1424 * initialization nothing to do
1425 */
1426 *outlen = 0;
1427 *inlen = 0;
1428 return(0);
1429 }
1430 inend = in + (*inlen);
1431 outend = out + (*outlen);
1432 while (in < inend) {
1433 d = *in++;
1434 if (d < 0x80) { c= d; trailing= 0; }
1435 else if (d < 0xC0) {
1436 /* trailing byte in leading position */
1437 *outlen = out - outstart;
1438 *inlen = processed - instart;
1439 return(-2);
1440 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1441 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1442 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1443 else {
1444 /* no chance for this in Ascii */
1445 *outlen = out - outstart;
1446 *inlen = processed - instart;
1447 return(-2);
1448 }
1449
1450 if (inend - in < trailing) {
1451 break;
1452 }
1453
1454 for ( ; trailing; trailing--) {
1455 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1456 break;
1457 c <<= 6;
1458 c |= d & 0x3F;
1459 }
1460
1461 /* assertion: c is a single UTF-4 value */
1462 if (c < 0x80) {
1463 if (out + 1 >= outend)
1464 break;
1465 *out++ = c;
1466 } else {
1467 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001468 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001469
1470 /*
1471 * Try to lookup a predefined HTML entity for it
1472 */
1473
1474 ent = htmlEntityValueLookup(c);
1475 if (ent == NULL) {
1476 /* no chance for this in Ascii */
1477 *outlen = out - outstart;
1478 *inlen = processed - instart;
1479 return(-2);
1480 }
1481 len = strlen(ent->name);
1482 if (out + 2 + len >= outend)
1483 break;
1484 *out++ = '&';
1485 memcpy(out, ent->name, len);
1486 out += len;
1487 *out++ = ';';
1488 }
1489 processed = in;
1490 }
1491 *outlen = out - outstart;
1492 *inlen = processed - instart;
1493 return(0);
1494}
1495
1496/**
1497 * htmlEncodeEntities:
1498 * @out: a pointer to an array of bytes to store the result
1499 * @outlen: the length of @out
1500 * @in: a pointer to an array of UTF-8 chars
1501 * @inlen: the length of @in
1502 * @quoteChar: the quote character to escape (' or ") or zero.
1503 *
1504 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1505 * plus HTML entities block of chars out.
1506 *
1507 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1508 * The value of @inlen after return is the number of octets consumed
1509 * as the return value is positive, else unpredictiable.
1510 * The value of @outlen after return is the number of octets consumed.
1511 */
1512int
1513htmlEncodeEntities(unsigned char* out, int *outlen,
1514 const unsigned char* in, int *inlen, int quoteChar) {
1515 const unsigned char* processed = in;
1516 const unsigned char* outend = out + (*outlen);
1517 const unsigned char* outstart = out;
1518 const unsigned char* instart = in;
1519 const unsigned char* inend = in + (*inlen);
1520 unsigned int c, d;
1521 int trailing;
1522
1523 while (in < inend) {
1524 d = *in++;
1525 if (d < 0x80) { c= d; trailing= 0; }
1526 else if (d < 0xC0) {
1527 /* trailing byte in leading position */
1528 *outlen = out - outstart;
1529 *inlen = processed - instart;
1530 return(-2);
1531 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1532 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1533 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1534 else {
1535 /* no chance for this in Ascii */
1536 *outlen = out - outstart;
1537 *inlen = processed - instart;
1538 return(-2);
1539 }
1540
1541 if (inend - in < trailing)
1542 break;
1543
1544 while (trailing--) {
1545 if (((d= *in++) & 0xC0) != 0x80) {
1546 *outlen = out - outstart;
1547 *inlen = processed - instart;
1548 return(-2);
1549 }
1550 c <<= 6;
1551 c |= d & 0x3F;
1552 }
1553
1554 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001555 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1556 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001557 if (out >= outend)
1558 break;
1559 *out++ = c;
1560 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001561 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001562 const char *cp;
1563 char nbuf[16];
1564 int len;
1565
1566 /*
1567 * Try to lookup a predefined HTML entity for it
1568 */
1569 ent = htmlEntityValueLookup(c);
1570 if (ent == NULL) {
1571 sprintf(nbuf, "#%u", c);
1572 cp = nbuf;
1573 }
1574 else
1575 cp = ent->name;
1576 len = strlen(cp);
1577 if (out + 2 + len > outend)
1578 break;
1579 *out++ = '&';
1580 memcpy(out, cp, len);
1581 out += len;
1582 *out++ = ';';
1583 }
1584 processed = in;
1585 }
1586 *outlen = out - outstart;
1587 *inlen = processed - instart;
1588 return(0);
1589}
1590
1591/**
1592 * htmlDecodeEntities:
1593 * @ctxt: the parser context
1594 * @len: the len to decode (in bytes !), -1 for no size limit
1595 * @end: an end marker xmlChar, 0 if none
1596 * @end2: an end marker xmlChar, 0 if none
1597 * @end3: an end marker xmlChar, 0 if none
1598 *
1599 * Subtitute the HTML entities by their value
1600 *
1601 * DEPRECATED !!!!
1602 *
1603 * Returns A newly allocated string with the substitution done. The caller
1604 * must deallocate it !
1605 */
1606xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001607htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1608 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001609 static int deprecated = 0;
1610 if (!deprecated) {
1611 xmlGenericError(xmlGenericErrorContext,
1612 "htmlDecodeEntities() deprecated function reached\n");
1613 deprecated = 1;
1614 }
1615 return(NULL);
1616#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001617 xmlChar *name = NULL;
1618 xmlChar *buffer = NULL;
1619 unsigned int buffer_size = 0;
1620 unsigned int nbchars = 0;
1621 htmlEntityDescPtr ent;
1622 unsigned int max = (unsigned int) len;
1623 int c,l;
1624
1625 if (ctxt->depth > 40) {
1626 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1627 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1628 ctxt->sax->error(ctxt->userData,
1629 "Detected entity reference loop\n");
1630 ctxt->wellFormed = 0;
1631 ctxt->disableSAX = 1;
1632 return(NULL);
1633 }
1634
1635 /*
1636 * allocate a translation buffer.
1637 */
1638 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1639 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1640 if (buffer == NULL) {
1641 perror("xmlDecodeEntities: malloc failed");
1642 return(NULL);
1643 }
1644
1645 /*
1646 * Ok loop until we reach one of the ending char or a size limit.
1647 */
1648 c = CUR_CHAR(l);
1649 while ((nbchars < max) && (c != end) &&
1650 (c != end2) && (c != end3)) {
1651
1652 if (c == 0) break;
1653 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1654 int val = htmlParseCharRef(ctxt);
1655 COPY_BUF(0,buffer,nbchars,val);
1656 NEXTL(l);
1657 } else if ((c == '&') && (ctxt->token != '&')) {
1658 ent = htmlParseEntityRef(ctxt, &name);
1659 if (name != NULL) {
1660 if (ent != NULL) {
1661 int val = ent->value;
1662 COPY_BUF(0,buffer,nbchars,val);
1663 NEXTL(l);
1664 } else {
1665 const xmlChar *cur = name;
1666
1667 buffer[nbchars++] = '&';
1668 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1669 growBuffer(buffer);
1670 }
1671 while (*cur != 0) {
1672 buffer[nbchars++] = *cur++;
1673 }
1674 buffer[nbchars++] = ';';
1675 }
1676 }
1677 } else {
1678 COPY_BUF(l,buffer,nbchars,c);
1679 NEXTL(l);
1680 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1681 growBuffer(buffer);
1682 }
1683 }
1684 c = CUR_CHAR(l);
1685 }
1686 buffer[nbchars++] = 0;
1687 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001688#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001689}
1690
1691/************************************************************************
1692 * *
1693 * Commodity functions to handle streams *
1694 * *
1695 ************************************************************************/
1696
1697/**
Owen Taylor3473f882001-02-23 17:55:21 +00001698 * htmlNewInputStream:
1699 * @ctxt: an HTML parser context
1700 *
1701 * Create a new input stream structure
1702 * Returns the new input stream or NULL
1703 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001704static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001705htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1706 htmlParserInputPtr input;
1707
1708 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1709 if (input == NULL) {
1710 ctxt->errNo = XML_ERR_NO_MEMORY;
1711 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1712 ctxt->sax->error(ctxt->userData,
1713 "malloc: couldn't allocate a new input stream\n");
1714 return(NULL);
1715 }
1716 memset(input, 0, sizeof(htmlParserInput));
1717 input->filename = NULL;
1718 input->directory = NULL;
1719 input->base = NULL;
1720 input->cur = NULL;
1721 input->buf = NULL;
1722 input->line = 1;
1723 input->col = 1;
1724 input->buf = NULL;
1725 input->free = NULL;
1726 input->version = NULL;
1727 input->consumed = 0;
1728 input->length = 0;
1729 return(input);
1730}
1731
1732
1733/************************************************************************
1734 * *
1735 * Commodity functions, cleanup needed ? *
1736 * *
1737 ************************************************************************/
1738
1739/**
1740 * areBlanks:
1741 * @ctxt: an HTML parser context
1742 * @str: a xmlChar *
1743 * @len: the size of @str
1744 *
1745 * Is this a sequence of blank chars that one can ignore ?
1746 *
1747 * Returns 1 if ignorable 0 otherwise.
1748 */
1749
1750static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1751 int i;
1752 xmlNodePtr lastChild;
1753
1754 for (i = 0;i < len;i++)
1755 if (!(IS_BLANK(str[i]))) return(0);
1756
1757 if (CUR == 0) return(1);
1758 if (CUR != '<') return(0);
1759 if (ctxt->name == NULL)
1760 return(1);
1761 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1762 return(1);
1763 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1764 return(1);
1765 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1766 return(1);
1767 if (ctxt->node == NULL) return(0);
1768 lastChild = xmlGetLastChild(ctxt->node);
1769 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001770 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1771 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001772 } else if (xmlNodeIsText(lastChild)) {
1773 return(0);
1774 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1775 return(0);
1776 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1777 return(0);
1778 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1779 return(0);
1780 }
1781 return(1);
1782}
1783
1784/**
Owen Taylor3473f882001-02-23 17:55:21 +00001785 * htmlNewDocNoDtD:
1786 * @URI: URI for the dtd, or NULL
1787 * @ExternalID: the external ID of the DTD, or NULL
1788 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001789 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1790 * are NULL
1791 *
Owen Taylor3473f882001-02-23 17:55:21 +00001792 * Returns a new document, do not intialize the DTD if not provided
1793 */
1794htmlDocPtr
1795htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1796 xmlDocPtr cur;
1797
1798 /*
1799 * Allocate a new document and fill the fields.
1800 */
1801 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1802 if (cur == NULL) {
1803 xmlGenericError(xmlGenericErrorContext,
1804 "xmlNewDoc : malloc failed\n");
1805 return(NULL);
1806 }
1807 memset(cur, 0, sizeof(xmlDoc));
1808
1809 cur->type = XML_HTML_DOCUMENT_NODE;
1810 cur->version = NULL;
1811 cur->intSubset = NULL;
1812 if ((ExternalID != NULL) ||
1813 (URI != NULL))
1814 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1815 cur->doc = cur;
1816 cur->name = NULL;
1817 cur->children = NULL;
1818 cur->extSubset = NULL;
1819 cur->oldNs = NULL;
1820 cur->encoding = NULL;
1821 cur->standalone = 1;
1822 cur->compression = 0;
1823 cur->ids = NULL;
1824 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001825 cur->_private = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001826 return(cur);
1827}
1828
1829/**
1830 * htmlNewDoc:
1831 * @URI: URI for the dtd, or NULL
1832 * @ExternalID: the external ID of the DTD, or NULL
1833 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001834 * Creates a new HTML document
1835 *
Owen Taylor3473f882001-02-23 17:55:21 +00001836 * Returns a new document
1837 */
1838htmlDocPtr
1839htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1840 if ((URI == NULL) && (ExternalID == NULL))
1841 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001842 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1843 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001844
1845 return(htmlNewDocNoDtD(URI, ExternalID));
1846}
1847
1848
1849/************************************************************************
1850 * *
1851 * The parser itself *
1852 * Relates to http://www.w3.org/TR/html40 *
1853 * *
1854 ************************************************************************/
1855
1856/************************************************************************
1857 * *
1858 * The parser itself *
1859 * *
1860 ************************************************************************/
1861
1862/**
1863 * htmlParseHTMLName:
1864 * @ctxt: an HTML parser context
1865 *
1866 * parse an HTML tag or attribute name, note that we convert it to lowercase
1867 * since HTML names are not case-sensitive.
1868 *
1869 * Returns the Tag Name parsed or NULL
1870 */
1871
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001872static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001873htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1874 xmlChar *ret = NULL;
1875 int i = 0;
1876 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1877
1878 if (!IS_LETTER(CUR) && (CUR != '_') &&
1879 (CUR != ':')) return(NULL);
1880
1881 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1882 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1883 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1884 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1885 else loc[i] = CUR;
1886 i++;
1887
1888 NEXT;
1889 }
1890
1891 ret = xmlStrndup(loc, i);
1892
1893 return(ret);
1894}
1895
1896/**
1897 * htmlParseName:
1898 * @ctxt: an HTML parser context
1899 *
1900 * parse an HTML name, this routine is case sensistive.
1901 *
1902 * Returns the Name parsed or NULL
1903 */
1904
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001905static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001906htmlParseName(htmlParserCtxtPtr ctxt) {
1907 xmlChar buf[HTML_MAX_NAMELEN];
1908 int len = 0;
1909
1910 GROW;
1911 if (!IS_LETTER(CUR) && (CUR != '_')) {
1912 return(NULL);
1913 }
1914
1915 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1916 (CUR == '.') || (CUR == '-') ||
1917 (CUR == '_') || (CUR == ':') ||
1918 (IS_COMBINING(CUR)) ||
1919 (IS_EXTENDER(CUR))) {
1920 buf[len++] = CUR;
1921 NEXT;
1922 if (len >= HTML_MAX_NAMELEN) {
1923 xmlGenericError(xmlGenericErrorContext,
1924 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1925 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1926 (CUR == '.') || (CUR == '-') ||
1927 (CUR == '_') || (CUR == ':') ||
1928 (IS_COMBINING(CUR)) ||
1929 (IS_EXTENDER(CUR)))
1930 NEXT;
1931 break;
1932 }
1933 }
1934 return(xmlStrndup(buf, len));
1935}
1936
1937/**
1938 * htmlParseHTMLAttribute:
1939 * @ctxt: an HTML parser context
1940 * @stop: a char stop value
1941 *
1942 * parse an HTML attribute value till the stop (quote), if
1943 * stop is 0 then it stops at the first space
1944 *
1945 * Returns the attribute parsed or NULL
1946 */
1947
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001948static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001949htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1950 xmlChar *buffer = NULL;
1951 int buffer_size = 0;
1952 xmlChar *out = NULL;
1953 xmlChar *name = NULL;
1954
1955 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001956 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001957
1958 /*
1959 * allocate a translation buffer.
1960 */
1961 buffer_size = HTML_PARSER_BUFFER_SIZE;
1962 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1963 if (buffer == NULL) {
1964 perror("htmlParseHTMLAttribute: malloc failed");
1965 return(NULL);
1966 }
1967 out = buffer;
1968
1969 /*
1970 * Ok loop until we reach one of the ending chars
1971 */
1972 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1973 if ((stop == 0) && (IS_BLANK(CUR))) break;
1974 if (CUR == '&') {
1975 if (NXT(1) == '#') {
1976 unsigned int c;
1977 int bits;
1978
1979 c = htmlParseCharRef(ctxt);
1980 if (c < 0x80)
1981 { *out++ = c; bits= -6; }
1982 else if (c < 0x800)
1983 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1984 else if (c < 0x10000)
1985 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1986 else
1987 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1988
1989 for ( ; bits >= 0; bits-= 6) {
1990 *out++ = ((c >> bits) & 0x3F) | 0x80;
1991 }
1992 } else {
1993 ent = htmlParseEntityRef(ctxt, &name);
1994 if (name == NULL) {
1995 *out++ = '&';
1996 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001997 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001998
1999 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002000 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002001 }
2002 } else if (ent == NULL) {
2003 *out++ = '&';
2004 cur = name;
2005 while (*cur != 0) {
2006 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002007 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002008
2009 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002010 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002011 }
2012 *out++ = *cur++;
2013 }
2014 xmlFree(name);
2015 } else {
2016 unsigned int c;
2017 int bits;
2018
2019 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002020 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002021
2022 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002023 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002024 }
2025 c = (xmlChar)ent->value;
2026 if (c < 0x80)
2027 { *out++ = c; bits= -6; }
2028 else if (c < 0x800)
2029 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2030 else if (c < 0x10000)
2031 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2032 else
2033 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2034
2035 for ( ; bits >= 0; bits-= 6) {
2036 *out++ = ((c >> bits) & 0x3F) | 0x80;
2037 }
2038 xmlFree(name);
2039 }
2040 }
2041 } else {
2042 unsigned int c;
2043 int bits, l;
2044
2045 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002046 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002047
2048 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002049 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002050 }
2051 c = CUR_CHAR(l);
2052 if (c < 0x80)
2053 { *out++ = c; bits= -6; }
2054 else if (c < 0x800)
2055 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2056 else if (c < 0x10000)
2057 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2058 else
2059 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2060
2061 for ( ; bits >= 0; bits-= 6) {
2062 *out++ = ((c >> bits) & 0x3F) | 0x80;
2063 }
2064 NEXT;
2065 }
2066 }
2067 *out++ = 0;
2068 return(buffer);
2069}
2070
2071/**
Owen Taylor3473f882001-02-23 17:55:21 +00002072 * htmlParseEntityRef:
2073 * @ctxt: an HTML parser context
2074 * @str: location to store the entity name
2075 *
2076 * parse an HTML ENTITY references
2077 *
2078 * [68] EntityRef ::= '&' Name ';'
2079 *
2080 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2081 * if non-NULL *str will have to be freed by the caller.
2082 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002083const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002084htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2085 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002086 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002087 *str = NULL;
2088
2089 if (CUR == '&') {
2090 NEXT;
2091 name = htmlParseName(ctxt);
2092 if (name == NULL) {
2093 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2094 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2095 ctxt->wellFormed = 0;
2096 } else {
2097 GROW;
2098 if (CUR == ';') {
2099 *str = name;
2100
2101 /*
2102 * Lookup the entity in the table.
2103 */
2104 ent = htmlEntityLookup(name);
2105 if (ent != NULL) /* OK that's ugly !!! */
2106 NEXT;
2107 } else {
2108 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2109 ctxt->sax->error(ctxt->userData,
2110 "htmlParseEntityRef: expecting ';'\n");
2111 *str = name;
2112 }
2113 }
2114 }
2115 return(ent);
2116}
2117
2118/**
2119 * htmlParseAttValue:
2120 * @ctxt: an HTML parser context
2121 *
2122 * parse a value for an attribute
2123 * Note: the parser won't do substitution of entities here, this
2124 * will be handled later in xmlStringGetNodeList, unless it was
2125 * asked for ctxt->replaceEntities != 0
2126 *
2127 * Returns the AttValue parsed or NULL.
2128 */
2129
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002130static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002131htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2132 xmlChar *ret = NULL;
2133
2134 if (CUR == '"') {
2135 NEXT;
2136 ret = htmlParseHTMLAttribute(ctxt, '"');
2137 if (CUR != '"') {
2138 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2139 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2140 ctxt->wellFormed = 0;
2141 } else
2142 NEXT;
2143 } else if (CUR == '\'') {
2144 NEXT;
2145 ret = htmlParseHTMLAttribute(ctxt, '\'');
2146 if (CUR != '\'') {
2147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2148 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2149 ctxt->wellFormed = 0;
2150 } else
2151 NEXT;
2152 } else {
2153 /*
2154 * That's an HTMLism, the attribute value may not be quoted
2155 */
2156 ret = htmlParseHTMLAttribute(ctxt, 0);
2157 if (ret == NULL) {
2158 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2159 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2160 ctxt->wellFormed = 0;
2161 }
2162 }
2163 return(ret);
2164}
2165
2166/**
2167 * htmlParseSystemLiteral:
2168 * @ctxt: an HTML parser context
2169 *
2170 * parse an HTML Literal
2171 *
2172 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2173 *
2174 * Returns the SystemLiteral parsed or NULL
2175 */
2176
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002177static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002178htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2179 const xmlChar *q;
2180 xmlChar *ret = NULL;
2181
2182 if (CUR == '"') {
2183 NEXT;
2184 q = CUR_PTR;
2185 while ((IS_CHAR(CUR)) && (CUR != '"'))
2186 NEXT;
2187 if (!IS_CHAR(CUR)) {
2188 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2189 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2190 ctxt->wellFormed = 0;
2191 } else {
2192 ret = xmlStrndup(q, CUR_PTR - q);
2193 NEXT;
2194 }
2195 } else if (CUR == '\'') {
2196 NEXT;
2197 q = CUR_PTR;
2198 while ((IS_CHAR(CUR)) && (CUR != '\''))
2199 NEXT;
2200 if (!IS_CHAR(CUR)) {
2201 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2202 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2203 ctxt->wellFormed = 0;
2204 } else {
2205 ret = xmlStrndup(q, CUR_PTR - q);
2206 NEXT;
2207 }
2208 } else {
2209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2210 ctxt->sax->error(ctxt->userData,
2211 "SystemLiteral \" or ' expected\n");
2212 ctxt->wellFormed = 0;
2213 }
2214
2215 return(ret);
2216}
2217
2218/**
2219 * htmlParsePubidLiteral:
2220 * @ctxt: an HTML parser context
2221 *
2222 * parse an HTML public literal
2223 *
2224 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2225 *
2226 * Returns the PubidLiteral parsed or NULL.
2227 */
2228
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002229static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002230htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2231 const xmlChar *q;
2232 xmlChar *ret = NULL;
2233 /*
2234 * Name ::= (Letter | '_') (NameChar)*
2235 */
2236 if (CUR == '"') {
2237 NEXT;
2238 q = CUR_PTR;
2239 while (IS_PUBIDCHAR(CUR)) NEXT;
2240 if (CUR != '"') {
2241 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2242 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2243 ctxt->wellFormed = 0;
2244 } else {
2245 ret = xmlStrndup(q, CUR_PTR - q);
2246 NEXT;
2247 }
2248 } else if (CUR == '\'') {
2249 NEXT;
2250 q = CUR_PTR;
2251 while ((IS_LETTER(CUR)) && (CUR != '\''))
2252 NEXT;
2253 if (!IS_LETTER(CUR)) {
2254 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2255 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2256 ctxt->wellFormed = 0;
2257 } else {
2258 ret = xmlStrndup(q, CUR_PTR - q);
2259 NEXT;
2260 }
2261 } else {
2262 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2263 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2264 ctxt->wellFormed = 0;
2265 }
2266
2267 return(ret);
2268}
2269
2270/**
2271 * htmlParseScript:
2272 * @ctxt: an HTML parser context
2273 *
2274 * parse the content of an HTML SCRIPT or STYLE element
2275 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2276 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2277 * http://www.w3.org/TR/html4/types.html#type-script
2278 * http://www.w3.org/TR/html4/types.html#h-6.15
2279 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2280 *
2281 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2282 * element and the value of intrinsic event attributes. User agents must
2283 * not evaluate script data as HTML markup but instead must pass it on as
2284 * data to a script engine.
2285 * NOTES:
2286 * - The content is passed like CDATA
2287 * - the attributes for style and scripting "onXXX" are also described
2288 * as CDATA but SGML allows entities references in attributes so their
2289 * processing is identical as other attributes
2290 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002291static void
Owen Taylor3473f882001-02-23 17:55:21 +00002292htmlParseScript(htmlParserCtxtPtr ctxt) {
2293 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2294 int nbchar = 0;
2295 xmlChar cur;
2296
2297 SHRINK;
2298 cur = CUR;
2299 while (IS_CHAR(cur)) {
2300 if ((cur == '<') && (NXT(1) == '/')) {
2301 /*
2302 * One should break here, the specification is clear:
2303 * Authors should therefore escape "</" within the content.
2304 * Escape mechanisms are specific to each scripting or
2305 * style sheet language.
2306 */
2307 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2308 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2309 break; /* while */
2310 }
2311 buf[nbchar++] = cur;
2312 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2313 if (ctxt->sax->cdataBlock!= NULL) {
2314 /*
2315 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2316 */
2317 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2318 }
2319 nbchar = 0;
2320 }
2321 NEXT;
2322 cur = CUR;
2323 }
2324 if (!(IS_CHAR(cur))) {
2325 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2326 ctxt->sax->error(ctxt->userData,
2327 "Invalid char in CDATA 0x%X\n", cur);
2328 ctxt->wellFormed = 0;
2329 NEXT;
2330 }
2331
2332 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2333 if (ctxt->sax->cdataBlock!= NULL) {
2334 /*
2335 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2336 */
2337 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2338 }
2339 }
2340}
2341
2342
2343/**
2344 * htmlParseCharData:
2345 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002346 *
2347 * parse a CharData section.
2348 * if we are within a CDATA section ']]>' marks an end of section.
2349 *
2350 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2351 */
2352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353static void
2354htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002355 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2356 int nbchar = 0;
2357 int cur, l;
2358
2359 SHRINK;
2360 cur = CUR_CHAR(l);
2361 while (((cur != '<') || (ctxt->token == '<')) &&
2362 ((cur != '&') || (ctxt->token == '&')) &&
2363 (IS_CHAR(cur))) {
2364 COPY_BUF(l,buf,nbchar,cur);
2365 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2366 /*
2367 * Ok the segment is to be consumed as chars.
2368 */
2369 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2370 if (areBlanks(ctxt, buf, nbchar)) {
2371 if (ctxt->sax->ignorableWhitespace != NULL)
2372 ctxt->sax->ignorableWhitespace(ctxt->userData,
2373 buf, nbchar);
2374 } else {
2375 htmlCheckParagraph(ctxt);
2376 if (ctxt->sax->characters != NULL)
2377 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2378 }
2379 }
2380 nbchar = 0;
2381 }
2382 NEXTL(l);
2383 cur = CUR_CHAR(l);
2384 }
2385 if (nbchar != 0) {
2386 /*
2387 * Ok the segment is to be consumed as chars.
2388 */
2389 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2390 if (areBlanks(ctxt, buf, nbchar)) {
2391 if (ctxt->sax->ignorableWhitespace != NULL)
2392 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2393 } else {
2394 htmlCheckParagraph(ctxt);
2395 if (ctxt->sax->characters != NULL)
2396 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2397 }
2398 }
2399 }
2400}
2401
2402/**
2403 * htmlParseExternalID:
2404 * @ctxt: an HTML parser context
2405 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002406 *
2407 * Parse an External ID or a Public ID
2408 *
Owen Taylor3473f882001-02-23 17:55:21 +00002409 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2410 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2411 *
2412 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2413 *
2414 * Returns the function returns SystemLiteral and in the second
2415 * case publicID receives PubidLiteral, is strict is off
2416 * it is possible to return NULL and have publicID set.
2417 */
2418
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002419static xmlChar *
2420htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002421 xmlChar *URI = NULL;
2422
2423 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2424 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2425 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2426 SKIP(6);
2427 if (!IS_BLANK(CUR)) {
2428 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2429 ctxt->sax->error(ctxt->userData,
2430 "Space required after 'SYSTEM'\n");
2431 ctxt->wellFormed = 0;
2432 }
2433 SKIP_BLANKS;
2434 URI = htmlParseSystemLiteral(ctxt);
2435 if (URI == NULL) {
2436 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2437 ctxt->sax->error(ctxt->userData,
2438 "htmlParseExternalID: SYSTEM, no URI\n");
2439 ctxt->wellFormed = 0;
2440 }
2441 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2442 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2443 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2444 SKIP(6);
2445 if (!IS_BLANK(CUR)) {
2446 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2447 ctxt->sax->error(ctxt->userData,
2448 "Space required after 'PUBLIC'\n");
2449 ctxt->wellFormed = 0;
2450 }
2451 SKIP_BLANKS;
2452 *publicID = htmlParsePubidLiteral(ctxt);
2453 if (*publicID == NULL) {
2454 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2455 ctxt->sax->error(ctxt->userData,
2456 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2457 ctxt->wellFormed = 0;
2458 }
2459 SKIP_BLANKS;
2460 if ((CUR == '"') || (CUR == '\'')) {
2461 URI = htmlParseSystemLiteral(ctxt);
2462 }
2463 }
2464 return(URI);
2465}
2466
2467/**
2468 * htmlParseComment:
2469 * @ctxt: an HTML parser context
2470 *
2471 * Parse an XML (SGML) comment <!-- .... -->
2472 *
2473 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2474 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002475static void
Owen Taylor3473f882001-02-23 17:55:21 +00002476htmlParseComment(htmlParserCtxtPtr ctxt) {
2477 xmlChar *buf = NULL;
2478 int len;
2479 int size = HTML_PARSER_BUFFER_SIZE;
2480 int q, ql;
2481 int r, rl;
2482 int cur, l;
2483 xmlParserInputState state;
2484
2485 /*
2486 * Check that there is a comment right here.
2487 */
2488 if ((RAW != '<') || (NXT(1) != '!') ||
2489 (NXT(2) != '-') || (NXT(3) != '-')) return;
2490
2491 state = ctxt->instate;
2492 ctxt->instate = XML_PARSER_COMMENT;
2493 SHRINK;
2494 SKIP(4);
2495 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2496 if (buf == NULL) {
2497 xmlGenericError(xmlGenericErrorContext,
2498 "malloc of %d byte failed\n", size);
2499 ctxt->instate = state;
2500 return;
2501 }
2502 q = CUR_CHAR(ql);
2503 NEXTL(ql);
2504 r = CUR_CHAR(rl);
2505 NEXTL(rl);
2506 cur = CUR_CHAR(l);
2507 len = 0;
2508 while (IS_CHAR(cur) &&
2509 ((cur != '>') ||
2510 (r != '-') || (q != '-'))) {
2511 if (len + 5 >= size) {
2512 size *= 2;
2513 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2514 if (buf == NULL) {
2515 xmlGenericError(xmlGenericErrorContext,
2516 "realloc of %d byte failed\n", size);
2517 ctxt->instate = state;
2518 return;
2519 }
2520 }
2521 COPY_BUF(ql,buf,len,q);
2522 q = r;
2523 ql = rl;
2524 r = cur;
2525 rl = l;
2526 NEXTL(l);
2527 cur = CUR_CHAR(l);
2528 if (cur == 0) {
2529 SHRINK;
2530 GROW;
2531 cur = CUR_CHAR(l);
2532 }
2533 }
2534 buf[len] = 0;
2535 if (!IS_CHAR(cur)) {
2536 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2537 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2538 ctxt->sax->error(ctxt->userData,
2539 "Comment not terminated \n<!--%.50s\n", buf);
2540 ctxt->wellFormed = 0;
2541 xmlFree(buf);
2542 } else {
2543 NEXT;
2544 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2545 (!ctxt->disableSAX))
2546 ctxt->sax->comment(ctxt->userData, buf);
2547 xmlFree(buf);
2548 }
2549 ctxt->instate = state;
2550}
2551
2552/**
2553 * htmlParseCharRef:
2554 * @ctxt: an HTML parser context
2555 *
2556 * parse Reference declarations
2557 *
2558 * [66] CharRef ::= '&#' [0-9]+ ';' |
2559 * '&#x' [0-9a-fA-F]+ ';'
2560 *
2561 * Returns the value parsed (as an int)
2562 */
2563int
2564htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2565 int val = 0;
2566
2567 if ((CUR == '&') && (NXT(1) == '#') &&
2568 (NXT(2) == 'x')) {
2569 SKIP(3);
2570 while (CUR != ';') {
2571 if ((CUR >= '0') && (CUR <= '9'))
2572 val = val * 16 + (CUR - '0');
2573 else if ((CUR >= 'a') && (CUR <= 'f'))
2574 val = val * 16 + (CUR - 'a') + 10;
2575 else if ((CUR >= 'A') && (CUR <= 'F'))
2576 val = val * 16 + (CUR - 'A') + 10;
2577 else {
2578 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2579 ctxt->sax->error(ctxt->userData,
2580 "htmlParseCharRef: invalid hexadecimal value\n");
2581 ctxt->wellFormed = 0;
2582 return(0);
2583 }
2584 NEXT;
2585 }
2586 if (CUR == ';')
2587 NEXT;
2588 } else if ((CUR == '&') && (NXT(1) == '#')) {
2589 SKIP(2);
2590 while (CUR != ';') {
2591 if ((CUR >= '0') && (CUR <= '9'))
2592 val = val * 10 + (CUR - '0');
2593 else {
2594 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2595 ctxt->sax->error(ctxt->userData,
2596 "htmlParseCharRef: invalid decimal value\n");
2597 ctxt->wellFormed = 0;
2598 return(0);
2599 }
2600 NEXT;
2601 }
2602 if (CUR == ';')
2603 NEXT;
2604 } else {
2605 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2606 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2607 ctxt->wellFormed = 0;
2608 }
2609 /*
2610 * Check the value IS_CHAR ...
2611 */
2612 if (IS_CHAR(val)) {
2613 return(val);
2614 } else {
2615 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2616 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2617 val);
2618 ctxt->wellFormed = 0;
2619 }
2620 return(0);
2621}
2622
2623
2624/**
2625 * htmlParseDocTypeDecl :
2626 * @ctxt: an HTML parser context
2627 *
2628 * parse a DOCTYPE declaration
2629 *
2630 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2631 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2632 */
2633
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002634static void
Owen Taylor3473f882001-02-23 17:55:21 +00002635htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2636 xmlChar *name;
2637 xmlChar *ExternalID = NULL;
2638 xmlChar *URI = NULL;
2639
2640 /*
2641 * We know that '<!DOCTYPE' has been detected.
2642 */
2643 SKIP(9);
2644
2645 SKIP_BLANKS;
2646
2647 /*
2648 * Parse the DOCTYPE name.
2649 */
2650 name = htmlParseName(ctxt);
2651 if (name == NULL) {
2652 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2653 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2654 ctxt->wellFormed = 0;
2655 }
2656 /*
2657 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2658 */
2659
2660 SKIP_BLANKS;
2661
2662 /*
2663 * Check for SystemID and ExternalID
2664 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002665 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002666 SKIP_BLANKS;
2667
2668 /*
2669 * We should be at the end of the DOCTYPE declaration.
2670 */
2671 if (CUR != '>') {
2672 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002673 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002674 ctxt->wellFormed = 0;
2675 /* We shouldn't try to resynchronize ... */
2676 }
2677 NEXT;
2678
2679 /*
2680 * Create or update the document accordingly to the DOCTYPE
2681 */
2682 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2683 (!ctxt->disableSAX))
2684 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2685
2686 /*
2687 * Cleanup, since we don't use all those identifiers
2688 */
2689 if (URI != NULL) xmlFree(URI);
2690 if (ExternalID != NULL) xmlFree(ExternalID);
2691 if (name != NULL) xmlFree(name);
2692}
2693
2694/**
2695 * htmlParseAttribute:
2696 * @ctxt: an HTML parser context
2697 * @value: a xmlChar ** used to store the value of the attribute
2698 *
2699 * parse an attribute
2700 *
2701 * [41] Attribute ::= Name Eq AttValue
2702 *
2703 * [25] Eq ::= S? '=' S?
2704 *
2705 * With namespace:
2706 *
2707 * [NS 11] Attribute ::= QName Eq AttValue
2708 *
2709 * Also the case QName == xmlns:??? is handled independently as a namespace
2710 * definition.
2711 *
2712 * Returns the attribute name, and the value in *value.
2713 */
2714
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002715static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002716htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2717 xmlChar *name, *val = NULL;
2718
2719 *value = NULL;
2720 name = htmlParseHTMLName(ctxt);
2721 if (name == NULL) {
2722 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2723 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2724 ctxt->wellFormed = 0;
2725 return(NULL);
2726 }
2727
2728 /*
2729 * read the value
2730 */
2731 SKIP_BLANKS;
2732 if (CUR == '=') {
2733 NEXT;
2734 SKIP_BLANKS;
2735 val = htmlParseAttValue(ctxt);
2736 /******
2737 } else {
2738 * TODO : some attribute must have values, some may not
2739 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2740 ctxt->sax->warning(ctxt->userData,
2741 "No value for attribute %s\n", name); */
2742 }
2743
2744 *value = val;
2745 return(name);
2746}
2747
2748/**
2749 * htmlCheckEncoding:
2750 * @ctxt: an HTML parser context
2751 * @attvalue: the attribute value
2752 *
2753 * Checks an http-equiv attribute from a Meta tag to detect
2754 * the encoding
2755 * If a new encoding is detected the parser is switched to decode
2756 * it and pass UTF8
2757 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002758static void
Owen Taylor3473f882001-02-23 17:55:21 +00002759htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2760 const xmlChar *encoding;
2761
2762 if ((ctxt == NULL) || (attvalue == NULL))
2763 return;
2764
2765 /* do not change encoding */
2766 if (ctxt->input->encoding != NULL)
2767 return;
2768
2769 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2770 if (encoding != NULL) {
2771 encoding += 8;
2772 } else {
2773 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2774 if (encoding != NULL)
2775 encoding += 9;
2776 }
2777 if (encoding != NULL) {
2778 xmlCharEncoding enc;
2779 xmlCharEncodingHandlerPtr handler;
2780
2781 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2782
2783 if (ctxt->input->encoding != NULL)
2784 xmlFree((xmlChar *) ctxt->input->encoding);
2785 ctxt->input->encoding = xmlStrdup(encoding);
2786
2787 enc = xmlParseCharEncoding((const char *) encoding);
2788 /*
2789 * registered set of known encodings
2790 */
2791 if (enc != XML_CHAR_ENCODING_ERROR) {
2792 xmlSwitchEncoding(ctxt, enc);
2793 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2794 } else {
2795 /*
2796 * fallback for unknown encodings
2797 */
2798 handler = xmlFindCharEncodingHandler((const char *) encoding);
2799 if (handler != NULL) {
2800 xmlSwitchToEncoding(ctxt, handler);
2801 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2802 } else {
2803 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2804 }
2805 }
2806
2807 if ((ctxt->input->buf != NULL) &&
2808 (ctxt->input->buf->encoder != NULL) &&
2809 (ctxt->input->buf->raw != NULL) &&
2810 (ctxt->input->buf->buffer != NULL)) {
2811 int nbchars;
2812 int processed;
2813
2814 /*
2815 * convert as much as possible to the parser reading buffer.
2816 */
2817 processed = ctxt->input->cur - ctxt->input->base;
2818 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2819 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2820 ctxt->input->buf->buffer,
2821 ctxt->input->buf->raw);
2822 if (nbchars < 0) {
2823 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2824 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2825 ctxt->sax->error(ctxt->userData,
2826 "htmlCheckEncoding: encoder error\n");
2827 }
2828 ctxt->input->base =
2829 ctxt->input->cur = ctxt->input->buf->buffer->content;
2830 }
2831 }
2832}
2833
2834/**
2835 * htmlCheckMeta:
2836 * @ctxt: an HTML parser context
2837 * @atts: the attributes values
2838 *
2839 * Checks an attributes from a Meta tag
2840 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002841static void
Owen Taylor3473f882001-02-23 17:55:21 +00002842htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2843 int i;
2844 const xmlChar *att, *value;
2845 int http = 0;
2846 const xmlChar *content = NULL;
2847
2848 if ((ctxt == NULL) || (atts == NULL))
2849 return;
2850
2851 i = 0;
2852 att = atts[i++];
2853 while (att != NULL) {
2854 value = atts[i++];
2855 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2856 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2857 http = 1;
2858 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2859 content = value;
2860 att = atts[i++];
2861 }
2862 if ((http) && (content != NULL))
2863 htmlCheckEncoding(ctxt, content);
2864
2865}
2866
2867/**
2868 * htmlParseStartTag:
2869 * @ctxt: an HTML parser context
2870 *
2871 * parse a start of tag either for rule element or
2872 * EmptyElement. In both case we don't parse the tag closing chars.
2873 *
2874 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2875 *
2876 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2877 *
2878 * With namespace:
2879 *
2880 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2881 *
2882 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2883 *
2884 */
2885
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002886static void
Owen Taylor3473f882001-02-23 17:55:21 +00002887htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2888 xmlChar *name;
2889 xmlChar *attname;
2890 xmlChar *attvalue;
2891 const xmlChar **atts = NULL;
2892 int nbatts = 0;
2893 int maxatts = 0;
2894 int meta = 0;
2895 int i;
2896
2897 if (CUR != '<') return;
2898 NEXT;
2899
2900 GROW;
2901 name = htmlParseHTMLName(ctxt);
2902 if (name == NULL) {
2903 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2904 ctxt->sax->error(ctxt->userData,
2905 "htmlParseStartTag: invalid element name\n");
2906 ctxt->wellFormed = 0;
2907 /* Dump the bogus tag like browsers do */
2908 while ((IS_CHAR(CUR)) && (CUR != '>'))
2909 NEXT;
2910 return;
2911 }
2912 if (xmlStrEqual(name, BAD_CAST"meta"))
2913 meta = 1;
2914
2915 /*
2916 * Check for auto-closure of HTML elements.
2917 */
2918 htmlAutoClose(ctxt, name);
2919
2920 /*
2921 * Check for implied HTML elements.
2922 */
2923 htmlCheckImplied(ctxt, name);
2924
2925 /*
2926 * Avoid html at any level > 0, head at any level != 1
2927 * or any attempt to recurse body
2928 */
2929 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2930 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2931 ctxt->sax->error(ctxt->userData,
2932 "htmlParseStartTag: misplaced <html> tag\n");
2933 ctxt->wellFormed = 0;
2934 xmlFree(name);
2935 return;
2936 }
2937 if ((ctxt->nameNr != 1) &&
2938 (xmlStrEqual(name, BAD_CAST"head"))) {
2939 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2940 ctxt->sax->error(ctxt->userData,
2941 "htmlParseStartTag: misplaced <head> tag\n");
2942 ctxt->wellFormed = 0;
2943 xmlFree(name);
2944 return;
2945 }
2946 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002947 int indx;
2948 for (indx = 0;indx < ctxt->nameNr;indx++) {
2949 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002950 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2951 ctxt->sax->error(ctxt->userData,
2952 "htmlParseStartTag: misplaced <body> tag\n");
2953 ctxt->wellFormed = 0;
2954 xmlFree(name);
2955 return;
2956 }
2957 }
2958 }
2959
2960 /*
2961 * Now parse the attributes, it ends up with the ending
2962 *
2963 * (S Attribute)* S?
2964 */
2965 SKIP_BLANKS;
2966 while ((IS_CHAR(CUR)) &&
2967 (CUR != '>') &&
2968 ((CUR != '/') || (NXT(1) != '>'))) {
2969 long cons = ctxt->nbChars;
2970
2971 GROW;
2972 attname = htmlParseAttribute(ctxt, &attvalue);
2973 if (attname != NULL) {
2974
2975 /*
2976 * Well formedness requires at most one declaration of an attribute
2977 */
2978 for (i = 0; i < nbatts;i += 2) {
2979 if (xmlStrEqual(atts[i], attname)) {
2980 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2981 ctxt->sax->error(ctxt->userData,
2982 "Attribute %s redefined\n",
2983 attname);
2984 ctxt->wellFormed = 0;
2985 xmlFree(attname);
2986 if (attvalue != NULL)
2987 xmlFree(attvalue);
2988 goto failed;
2989 }
2990 }
2991
2992 /*
2993 * Add the pair to atts
2994 */
2995 if (atts == NULL) {
2996 maxatts = 10;
2997 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2998 if (atts == NULL) {
2999 xmlGenericError(xmlGenericErrorContext,
3000 "malloc of %ld byte failed\n",
3001 maxatts * (long)sizeof(xmlChar *));
3002 if (name != NULL) xmlFree(name);
3003 return;
3004 }
3005 } else if (nbatts + 4 > maxatts) {
3006 maxatts *= 2;
3007 atts = (const xmlChar **) xmlRealloc((void *) atts,
3008 maxatts * sizeof(xmlChar *));
3009 if (atts == NULL) {
3010 xmlGenericError(xmlGenericErrorContext,
3011 "realloc of %ld byte failed\n",
3012 maxatts * (long)sizeof(xmlChar *));
3013 if (name != NULL) xmlFree(name);
3014 return;
3015 }
3016 }
3017 atts[nbatts++] = attname;
3018 atts[nbatts++] = attvalue;
3019 atts[nbatts] = NULL;
3020 atts[nbatts + 1] = NULL;
3021 }
3022 else {
3023 /* Dump the bogus attribute string up to the next blank or
3024 * the end of the tag. */
3025 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3026 && ((CUR != '/') || (NXT(1) != '>')))
3027 NEXT;
3028 }
3029
3030failed:
3031 SKIP_BLANKS;
3032 if (cons == ctxt->nbChars) {
3033 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3034 ctxt->sax->error(ctxt->userData,
3035 "htmlParseStartTag: problem parsing attributes\n");
3036 ctxt->wellFormed = 0;
3037 break;
3038 }
3039 }
3040
3041 /*
3042 * Handle specific association to the META tag
3043 */
3044 if (meta)
3045 htmlCheckMeta(ctxt, atts);
3046
3047 /*
3048 * SAX: Start of Element !
3049 */
3050 htmlnamePush(ctxt, xmlStrdup(name));
3051#ifdef DEBUG
3052 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3053#endif
3054 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3055 ctxt->sax->startElement(ctxt->userData, name, atts);
3056
3057 if (atts != NULL) {
3058 for (i = 0;i < nbatts;i++) {
3059 if (atts[i] != NULL)
3060 xmlFree((xmlChar *) atts[i]);
3061 }
3062 xmlFree((void *) atts);
3063 }
3064 if (name != NULL) xmlFree(name);
3065}
3066
3067/**
3068 * htmlParseEndTag:
3069 * @ctxt: an HTML parser context
3070 *
3071 * parse an end of tag
3072 *
3073 * [42] ETag ::= '</' Name S? '>'
3074 *
3075 * With namespace
3076 *
3077 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003078 *
3079 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003080 */
3081
Daniel Veillardf420ac52001-07-04 16:04:09 +00003082static int
Owen Taylor3473f882001-02-23 17:55:21 +00003083htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3084 xmlChar *name;
3085 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003086 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003087
3088 if ((CUR != '<') || (NXT(1) != '/')) {
3089 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3090 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3091 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003092 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003093 }
3094 SKIP(2);
3095
3096 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003097 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003098
3099 /*
3100 * We should definitely be at the ending "S? '>'" part
3101 */
3102 SKIP_BLANKS;
3103 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3104 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3105 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3106 ctxt->wellFormed = 0;
3107 } else
3108 NEXT;
3109
3110 /*
3111 * If the name read is not one of the element in the parsing stack
3112 * then return, it's just an error.
3113 */
3114 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3115 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3116 }
3117 if (i < 0) {
3118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3119 ctxt->sax->error(ctxt->userData,
3120 "Unexpected end tag : %s\n", name);
3121 xmlFree(name);
3122 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003123 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003124 }
3125
3126
3127 /*
3128 * Check for auto-closure of HTML elements.
3129 */
3130
3131 htmlAutoCloseOnClose(ctxt, name);
3132
3133 /*
3134 * Well formedness constraints, opening and closing must match.
3135 * With the exception that the autoclose may have popped stuff out
3136 * of the stack.
3137 */
3138 if (!xmlStrEqual(name, ctxt->name)) {
3139#ifdef DEBUG
3140 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3141#endif
3142 if ((ctxt->name != NULL) &&
3143 (!xmlStrEqual(ctxt->name, name))) {
3144 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3145 ctxt->sax->error(ctxt->userData,
3146 "Opening and ending tag mismatch: %s and %s\n",
3147 name, ctxt->name);
3148 ctxt->wellFormed = 0;
3149 }
3150 }
3151
3152 /*
3153 * SAX: End of Tag
3154 */
3155 oldname = ctxt->name;
3156 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3157 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3158 ctxt->sax->endElement(ctxt->userData, name);
3159 oldname = htmlnamePop(ctxt);
3160 if (oldname != NULL) {
3161#ifdef DEBUG
3162 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3163#endif
3164 xmlFree(oldname);
3165#ifdef DEBUG
3166 } else {
3167 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3168#endif
3169 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003170 ret = 1;
3171 } else {
3172 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003173 }
3174
3175 if (name != NULL)
3176 xmlFree(name);
3177
Daniel Veillardf420ac52001-07-04 16:04:09 +00003178 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003179}
3180
3181
3182/**
3183 * htmlParseReference:
3184 * @ctxt: an HTML parser context
3185 *
3186 * parse and handle entity references in content,
3187 * this will end-up in a call to character() since this is either a
3188 * CharRef, or a predefined entity.
3189 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003190static void
Owen Taylor3473f882001-02-23 17:55:21 +00003191htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003192 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003193 xmlChar out[6];
3194 xmlChar *name;
3195 if (CUR != '&') return;
3196
3197 if (NXT(1) == '#') {
3198 unsigned int c;
3199 int bits, i = 0;
3200
3201 c = htmlParseCharRef(ctxt);
3202 if (c == 0)
3203 return;
3204
3205 if (c < 0x80) { out[i++]= c; bits= -6; }
3206 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3207 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3208 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3209
3210 for ( ; bits >= 0; bits-= 6) {
3211 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3212 }
3213 out[i] = 0;
3214
3215 htmlCheckParagraph(ctxt);
3216 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3217 ctxt->sax->characters(ctxt->userData, out, i);
3218 } else {
3219 ent = htmlParseEntityRef(ctxt, &name);
3220 if (name == NULL) {
3221 htmlCheckParagraph(ctxt);
3222 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3223 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3224 return;
3225 }
3226 if ((ent == NULL) || (ent->value <= 0)) {
3227 htmlCheckParagraph(ctxt);
3228 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3229 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3230 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3231 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3232 }
3233 } else {
3234 unsigned int c;
3235 int bits, i = 0;
3236
3237 c = ent->value;
3238 if (c < 0x80)
3239 { out[i++]= c; bits= -6; }
3240 else if (c < 0x800)
3241 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3242 else if (c < 0x10000)
3243 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3244 else
3245 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3246
3247 for ( ; bits >= 0; bits-= 6) {
3248 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3249 }
3250 out[i] = 0;
3251
3252 htmlCheckParagraph(ctxt);
3253 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3254 ctxt->sax->characters(ctxt->userData, out, i);
3255 }
3256 xmlFree(name);
3257 }
3258}
3259
3260/**
3261 * htmlParseContent:
3262 * @ctxt: an HTML parser context
3263 * @name: the node name
3264 *
3265 * Parse a content: comment, sub-element, reference or text.
3266 *
3267 */
3268
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003269static void
Owen Taylor3473f882001-02-23 17:55:21 +00003270htmlParseContent(htmlParserCtxtPtr ctxt) {
3271 xmlChar *currentNode;
3272 int depth;
3273
3274 currentNode = xmlStrdup(ctxt->name);
3275 depth = ctxt->nameNr;
3276 while (1) {
3277 long cons = ctxt->nbChars;
3278
3279 GROW;
3280 /*
3281 * Our tag or one of it's parent or children is ending.
3282 */
3283 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003284 if (htmlParseEndTag(ctxt) &&
3285 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3286 if (currentNode != NULL)
3287 xmlFree(currentNode);
3288 return;
3289 }
3290 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003291 }
3292
3293 /*
3294 * Has this node been popped out during parsing of
3295 * the next element
3296 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003297 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3298 (!xmlStrEqual(currentNode, ctxt->name)))
3299 {
Owen Taylor3473f882001-02-23 17:55:21 +00003300 if (currentNode != NULL) xmlFree(currentNode);
3301 return;
3302 }
3303
Daniel Veillardf9533d12001-03-03 10:04:57 +00003304 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3305 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003306 /*
3307 * Handle SCRIPT/STYLE separately
3308 */
3309 htmlParseScript(ctxt);
3310 } else {
3311 /*
3312 * Sometimes DOCTYPE arrives in the middle of the document
3313 */
3314 if ((CUR == '<') && (NXT(1) == '!') &&
3315 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3316 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3317 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3318 (UPP(8) == 'E')) {
3319 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3320 ctxt->sax->error(ctxt->userData,
3321 "Misplaced DOCTYPE declaration\n");
3322 ctxt->wellFormed = 0;
3323 htmlParseDocTypeDecl(ctxt);
3324 }
3325
3326 /*
3327 * First case : a comment
3328 */
3329 if ((CUR == '<') && (NXT(1) == '!') &&
3330 (NXT(2) == '-') && (NXT(3) == '-')) {
3331 htmlParseComment(ctxt);
3332 }
3333
3334 /*
3335 * Second case : a sub-element.
3336 */
3337 else if (CUR == '<') {
3338 htmlParseElement(ctxt);
3339 }
3340
3341 /*
3342 * Third case : a reference. If if has not been resolved,
3343 * parsing returns it's Name, create the node
3344 */
3345 else if (CUR == '&') {
3346 htmlParseReference(ctxt);
3347 }
3348
3349 /*
3350 * Fourth : end of the resource
3351 */
3352 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003353 htmlAutoCloseOnEnd(ctxt);
3354 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003355 }
3356
3357 /*
3358 * Last case, text. Note that References are handled directly.
3359 */
3360 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003361 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003362 }
3363
3364 if (cons == ctxt->nbChars) {
3365 if (ctxt->node != NULL) {
3366 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3367 ctxt->sax->error(ctxt->userData,
3368 "detected an error in element content\n");
3369 ctxt->wellFormed = 0;
3370 }
3371 break;
3372 }
3373 }
3374 GROW;
3375 }
3376 if (currentNode != NULL) xmlFree(currentNode);
3377}
3378
3379/**
3380 * htmlParseElement:
3381 * @ctxt: an HTML parser context
3382 *
3383 * parse an HTML element, this is highly recursive
3384 *
3385 * [39] element ::= EmptyElemTag | STag content ETag
3386 *
3387 * [41] Attribute ::= Name Eq AttValue
3388 */
3389
3390void
3391htmlParseElement(htmlParserCtxtPtr ctxt) {
3392 xmlChar *name;
3393 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003394 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003395 htmlParserNodeInfo node_info;
3396 xmlChar *oldname;
3397 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003398 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003399
3400 /* Capture start position */
3401 if (ctxt->record_info) {
3402 node_info.begin_pos = ctxt->input->consumed +
3403 (CUR_PTR - ctxt->input->base);
3404 node_info.begin_line = ctxt->input->line;
3405 }
3406
3407 oldname = xmlStrdup(ctxt->name);
3408 htmlParseStartTag(ctxt);
3409 name = ctxt->name;
3410#ifdef DEBUG
3411 if (oldname == NULL)
3412 xmlGenericError(xmlGenericErrorContext,
3413 "Start of element %s\n", name);
3414 else if (name == NULL)
3415 xmlGenericError(xmlGenericErrorContext,
3416 "Start of element failed, was %s\n", oldname);
3417 else
3418 xmlGenericError(xmlGenericErrorContext,
3419 "Start of element %s, was %s\n", name, oldname);
3420#endif
3421 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3422 (name == NULL)) {
3423 if (CUR == '>')
3424 NEXT;
3425 if (oldname != NULL)
3426 xmlFree(oldname);
3427 return;
3428 }
3429 if (oldname != NULL)
3430 xmlFree(oldname);
3431
3432 /*
3433 * Lookup the info for that element.
3434 */
3435 info = htmlTagLookup(name);
3436 if (info == NULL) {
3437 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3438 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3439 name);
3440 ctxt->wellFormed = 0;
3441 } else if (info->depr) {
3442/***************************
3443 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3444 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3445 name);
3446 ***************************/
3447 }
3448
3449 /*
3450 * Check for an Empty Element labelled the XML/SGML way
3451 */
3452 if ((CUR == '/') && (NXT(1) == '>')) {
3453 SKIP(2);
3454 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3455 ctxt->sax->endElement(ctxt->userData, name);
3456 oldname = htmlnamePop(ctxt);
3457#ifdef DEBUG
3458 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3459#endif
3460 if (oldname != NULL)
3461 xmlFree(oldname);
3462 return;
3463 }
3464
3465 if (CUR == '>') {
3466 NEXT;
3467 } else {
3468 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3469 ctxt->sax->error(ctxt->userData,
3470 "Couldn't find end of Start Tag %s\n",
3471 name);
3472 ctxt->wellFormed = 0;
3473
3474 /*
3475 * end of parsing of this node.
3476 */
3477 if (xmlStrEqual(name, ctxt->name)) {
3478 nodePop(ctxt);
3479 oldname = htmlnamePop(ctxt);
3480#ifdef DEBUG
3481 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3482#endif
3483 if (oldname != NULL)
3484 xmlFree(oldname);
3485 }
3486
3487 /*
3488 * Capture end position and add node
3489 */
3490 if ( currentNode != NULL && ctxt->record_info ) {
3491 node_info.end_pos = ctxt->input->consumed +
3492 (CUR_PTR - ctxt->input->base);
3493 node_info.end_line = ctxt->input->line;
3494 node_info.node = ctxt->node;
3495 xmlParserAddNodeInfo(ctxt, &node_info);
3496 }
3497 return;
3498 }
3499
3500 /*
3501 * Check for an Empty Element from DTD definition
3502 */
3503 if ((info != NULL) && (info->empty)) {
3504 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3505 ctxt->sax->endElement(ctxt->userData, name);
3506 oldname = htmlnamePop(ctxt);
3507#ifdef DEBUG
3508 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3509#endif
3510 if (oldname != NULL)
3511 xmlFree(oldname);
3512 return;
3513 }
3514
3515 /*
3516 * Parse the content of the element:
3517 */
3518 currentNode = xmlStrdup(ctxt->name);
3519 depth = ctxt->nameNr;
3520 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003521 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003522 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003523 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003524 if (ctxt->nameNr < depth) break;
3525 }
3526
Owen Taylor3473f882001-02-23 17:55:21 +00003527 /*
3528 * Capture end position and add node
3529 */
3530 if ( currentNode != NULL && ctxt->record_info ) {
3531 node_info.end_pos = ctxt->input->consumed +
3532 (CUR_PTR - ctxt->input->base);
3533 node_info.end_line = ctxt->input->line;
3534 node_info.node = ctxt->node;
3535 xmlParserAddNodeInfo(ctxt, &node_info);
3536 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003537 if (!IS_CHAR(CUR)) {
3538 htmlAutoCloseOnEnd(ctxt);
3539 }
3540
Owen Taylor3473f882001-02-23 17:55:21 +00003541 if (currentNode != NULL)
3542 xmlFree(currentNode);
3543}
3544
3545/**
3546 * htmlParseDocument :
3547 * @ctxt: an HTML parser context
3548 *
3549 * parse an HTML document (and build a tree if using the standard SAX
3550 * interface).
3551 *
3552 * Returns 0, -1 in case of error. the parser context is augmented
3553 * as a result of the parsing.
3554 */
3555
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003556static int
Owen Taylor3473f882001-02-23 17:55:21 +00003557htmlParseDocument(htmlParserCtxtPtr ctxt) {
3558 xmlDtdPtr dtd;
3559
Daniel Veillardd0463562001-10-13 09:15:48 +00003560 xmlInitParser();
3561
Owen Taylor3473f882001-02-23 17:55:21 +00003562 htmlDefaultSAXHandlerInit();
3563 ctxt->html = 1;
3564
3565 GROW;
3566 /*
3567 * SAX: beginning of the document processing.
3568 */
3569 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3570 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3571
3572 /*
3573 * Wipe out everything which is before the first '<'
3574 */
3575 SKIP_BLANKS;
3576 if (CUR == 0) {
3577 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3578 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3579 ctxt->wellFormed = 0;
3580 }
3581
3582 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3583 ctxt->sax->startDocument(ctxt->userData);
3584
3585
3586 /*
3587 * Parse possible comments before any content
3588 */
3589 while ((CUR == '<') && (NXT(1) == '!') &&
3590 (NXT(2) == '-') && (NXT(3) == '-')) {
3591 htmlParseComment(ctxt);
3592 SKIP_BLANKS;
3593 }
3594
3595
3596 /*
3597 * Then possibly doc type declaration(s) and more Misc
3598 * (doctypedecl Misc*)?
3599 */
3600 if ((CUR == '<') && (NXT(1) == '!') &&
3601 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3602 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3603 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3604 (UPP(8) == 'E')) {
3605 htmlParseDocTypeDecl(ctxt);
3606 }
3607 SKIP_BLANKS;
3608
3609 /*
3610 * Parse possible comments before any content
3611 */
3612 while ((CUR == '<') && (NXT(1) == '!') &&
3613 (NXT(2) == '-') && (NXT(3) == '-')) {
3614 htmlParseComment(ctxt);
3615 SKIP_BLANKS;
3616 }
3617
3618 /*
3619 * Time to start parsing the tree itself
3620 */
3621 htmlParseContent(ctxt);
3622
3623 /*
3624 * autoclose
3625 */
3626 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003627 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003628
3629
3630 /*
3631 * SAX: end of the document processing.
3632 */
3633 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3634 ctxt->sax->endDocument(ctxt->userData);
3635
3636 if (ctxt->myDoc != NULL) {
3637 dtd = xmlGetIntSubset(ctxt->myDoc);
3638 if (dtd == NULL)
3639 ctxt->myDoc->intSubset =
3640 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3641 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3642 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3643 }
3644 if (! ctxt->wellFormed) return(-1);
3645 return(0);
3646}
3647
3648
3649/************************************************************************
3650 * *
3651 * Parser contexts handling *
3652 * *
3653 ************************************************************************/
3654
3655/**
3656 * xmlInitParserCtxt:
3657 * @ctxt: an HTML parser context
3658 *
3659 * Initialize a parser context
3660 */
3661
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003662static void
Owen Taylor3473f882001-02-23 17:55:21 +00003663htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3664{
3665 htmlSAXHandler *sax;
3666
3667 if (ctxt == NULL) return;
3668 memset(ctxt, 0, sizeof(htmlParserCtxt));
3669
3670 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3671 if (sax == NULL) {
3672 xmlGenericError(xmlGenericErrorContext,
3673 "htmlInitParserCtxt: out of memory\n");
3674 }
3675 else
3676 memset(sax, 0, sizeof(htmlSAXHandler));
3677
3678 /* Allocate the Input stack */
3679 ctxt->inputTab = (htmlParserInputPtr *)
3680 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3681 if (ctxt->inputTab == NULL) {
3682 xmlGenericError(xmlGenericErrorContext,
3683 "htmlInitParserCtxt: out of memory\n");
3684 ctxt->inputNr = 0;
3685 ctxt->inputMax = 0;
3686 ctxt->input = NULL;
3687 return;
3688 }
3689 ctxt->inputNr = 0;
3690 ctxt->inputMax = 5;
3691 ctxt->input = NULL;
3692 ctxt->version = NULL;
3693 ctxt->encoding = NULL;
3694 ctxt->standalone = -1;
3695 ctxt->instate = XML_PARSER_START;
3696
3697 /* Allocate the Node stack */
3698 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3699 if (ctxt->nodeTab == NULL) {
3700 xmlGenericError(xmlGenericErrorContext,
3701 "htmlInitParserCtxt: out of memory\n");
3702 ctxt->nodeNr = 0;
3703 ctxt->nodeMax = 0;
3704 ctxt->node = NULL;
3705 ctxt->inputNr = 0;
3706 ctxt->inputMax = 0;
3707 ctxt->input = NULL;
3708 return;
3709 }
3710 ctxt->nodeNr = 0;
3711 ctxt->nodeMax = 10;
3712 ctxt->node = NULL;
3713
3714 /* Allocate the Name stack */
3715 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3716 if (ctxt->nameTab == NULL) {
3717 xmlGenericError(xmlGenericErrorContext,
3718 "htmlInitParserCtxt: out of memory\n");
3719 ctxt->nameNr = 0;
3720 ctxt->nameMax = 10;
3721 ctxt->name = NULL;
3722 ctxt->nodeNr = 0;
3723 ctxt->nodeMax = 0;
3724 ctxt->node = NULL;
3725 ctxt->inputNr = 0;
3726 ctxt->inputMax = 0;
3727 ctxt->input = NULL;
3728 return;
3729 }
3730 ctxt->nameNr = 0;
3731 ctxt->nameMax = 10;
3732 ctxt->name = NULL;
3733
3734 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3735 else {
3736 ctxt->sax = sax;
3737 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3738 }
3739 ctxt->userData = ctxt;
3740 ctxt->myDoc = NULL;
3741 ctxt->wellFormed = 1;
3742 ctxt->replaceEntities = 0;
3743 ctxt->html = 1;
3744 ctxt->record_info = 0;
3745 ctxt->validate = 0;
3746 ctxt->nbChars = 0;
3747 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003748 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003749 xmlInitNodeInfoSeq(&ctxt->node_seq);
3750}
3751
3752/**
3753 * htmlFreeParserCtxt:
3754 * @ctxt: an HTML parser context
3755 *
3756 * Free all the memory used by a parser context. However the parsed
3757 * document in ctxt->myDoc is not freed.
3758 */
3759
3760void
3761htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3762{
3763 xmlFreeParserCtxt(ctxt);
3764}
3765
3766/**
3767 * htmlCreateDocParserCtxt :
3768 * @cur: a pointer to an array of xmlChar
3769 * @encoding: a free form C string describing the HTML document encoding, or NULL
3770 *
3771 * Create a parser context for an HTML document.
3772 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003773 * TODO: check the need to add encoding handling there
3774 *
Owen Taylor3473f882001-02-23 17:55:21 +00003775 * Returns the new parser context or NULL
3776 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003777static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003778htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003779 htmlParserCtxtPtr ctxt;
3780 htmlParserInputPtr input;
3781 /* htmlCharEncoding enc; */
3782
3783 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3784 if (ctxt == NULL) {
3785 perror("malloc");
3786 return(NULL);
3787 }
3788 htmlInitParserCtxt(ctxt);
3789 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3790 if (input == NULL) {
3791 perror("malloc");
3792 xmlFree(ctxt);
3793 return(NULL);
3794 }
3795 memset(input, 0, sizeof(htmlParserInput));
3796
3797 input->line = 1;
3798 input->col = 1;
3799 input->base = cur;
3800 input->cur = cur;
3801
3802 inputPush(ctxt, input);
3803 return(ctxt);
3804}
3805
3806/************************************************************************
3807 * *
3808 * Progressive parsing interfaces *
3809 * *
3810 ************************************************************************/
3811
3812/**
3813 * htmlParseLookupSequence:
3814 * @ctxt: an HTML parser context
3815 * @first: the first char to lookup
3816 * @next: the next char to lookup or zero
3817 * @third: the next char to lookup or zero
3818 *
3819 * Try to find if a sequence (first, next, third) or just (first next) or
3820 * (first) is available in the input stream.
3821 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3822 * to avoid rescanning sequences of bytes, it DOES change the state of the
3823 * parser, do not use liberally.
3824 * This is basically similar to xmlParseLookupSequence()
3825 *
3826 * Returns the index to the current parsing point if the full sequence
3827 * is available, -1 otherwise.
3828 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003829static int
Owen Taylor3473f882001-02-23 17:55:21 +00003830htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3831 xmlChar next, xmlChar third) {
3832 int base, len;
3833 htmlParserInputPtr in;
3834 const xmlChar *buf;
3835
3836 in = ctxt->input;
3837 if (in == NULL) return(-1);
3838 base = in->cur - in->base;
3839 if (base < 0) return(-1);
3840 if (ctxt->checkIndex > base)
3841 base = ctxt->checkIndex;
3842 if (in->buf == NULL) {
3843 buf = in->base;
3844 len = in->length;
3845 } else {
3846 buf = in->buf->buffer->content;
3847 len = in->buf->buffer->use;
3848 }
3849 /* take into account the sequence length */
3850 if (third) len -= 2;
3851 else if (next) len --;
3852 for (;base < len;base++) {
3853 if (buf[base] == first) {
3854 if (third != 0) {
3855 if ((buf[base + 1] != next) ||
3856 (buf[base + 2] != third)) continue;
3857 } else if (next != 0) {
3858 if (buf[base + 1] != next) continue;
3859 }
3860 ctxt->checkIndex = 0;
3861#ifdef DEBUG_PUSH
3862 if (next == 0)
3863 xmlGenericError(xmlGenericErrorContext,
3864 "HPP: lookup '%c' found at %d\n",
3865 first, base);
3866 else if (third == 0)
3867 xmlGenericError(xmlGenericErrorContext,
3868 "HPP: lookup '%c%c' found at %d\n",
3869 first, next, base);
3870 else
3871 xmlGenericError(xmlGenericErrorContext,
3872 "HPP: lookup '%c%c%c' found at %d\n",
3873 first, next, third, base);
3874#endif
3875 return(base - (in->cur - in->base));
3876 }
3877 }
3878 ctxt->checkIndex = base;
3879#ifdef DEBUG_PUSH
3880 if (next == 0)
3881 xmlGenericError(xmlGenericErrorContext,
3882 "HPP: lookup '%c' failed\n", first);
3883 else if (third == 0)
3884 xmlGenericError(xmlGenericErrorContext,
3885 "HPP: lookup '%c%c' failed\n", first, next);
3886 else
3887 xmlGenericError(xmlGenericErrorContext,
3888 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3889#endif
3890 return(-1);
3891}
3892
3893/**
3894 * htmlParseTryOrFinish:
3895 * @ctxt: an HTML parser context
3896 * @terminate: last chunk indicator
3897 *
3898 * Try to progress on parsing
3899 *
3900 * Returns zero if no parsing was possible
3901 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003902static int
Owen Taylor3473f882001-02-23 17:55:21 +00003903htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3904 int ret = 0;
3905 htmlParserInputPtr in;
3906 int avail = 0;
3907 xmlChar cur, next;
3908
3909#ifdef DEBUG_PUSH
3910 switch (ctxt->instate) {
3911 case XML_PARSER_EOF:
3912 xmlGenericError(xmlGenericErrorContext,
3913 "HPP: try EOF\n"); break;
3914 case XML_PARSER_START:
3915 xmlGenericError(xmlGenericErrorContext,
3916 "HPP: try START\n"); break;
3917 case XML_PARSER_MISC:
3918 xmlGenericError(xmlGenericErrorContext,
3919 "HPP: try MISC\n");break;
3920 case XML_PARSER_COMMENT:
3921 xmlGenericError(xmlGenericErrorContext,
3922 "HPP: try COMMENT\n");break;
3923 case XML_PARSER_PROLOG:
3924 xmlGenericError(xmlGenericErrorContext,
3925 "HPP: try PROLOG\n");break;
3926 case XML_PARSER_START_TAG:
3927 xmlGenericError(xmlGenericErrorContext,
3928 "HPP: try START_TAG\n");break;
3929 case XML_PARSER_CONTENT:
3930 xmlGenericError(xmlGenericErrorContext,
3931 "HPP: try CONTENT\n");break;
3932 case XML_PARSER_CDATA_SECTION:
3933 xmlGenericError(xmlGenericErrorContext,
3934 "HPP: try CDATA_SECTION\n");break;
3935 case XML_PARSER_END_TAG:
3936 xmlGenericError(xmlGenericErrorContext,
3937 "HPP: try END_TAG\n");break;
3938 case XML_PARSER_ENTITY_DECL:
3939 xmlGenericError(xmlGenericErrorContext,
3940 "HPP: try ENTITY_DECL\n");break;
3941 case XML_PARSER_ENTITY_VALUE:
3942 xmlGenericError(xmlGenericErrorContext,
3943 "HPP: try ENTITY_VALUE\n");break;
3944 case XML_PARSER_ATTRIBUTE_VALUE:
3945 xmlGenericError(xmlGenericErrorContext,
3946 "HPP: try ATTRIBUTE_VALUE\n");break;
3947 case XML_PARSER_DTD:
3948 xmlGenericError(xmlGenericErrorContext,
3949 "HPP: try DTD\n");break;
3950 case XML_PARSER_EPILOG:
3951 xmlGenericError(xmlGenericErrorContext,
3952 "HPP: try EPILOG\n");break;
3953 case XML_PARSER_PI:
3954 xmlGenericError(xmlGenericErrorContext,
3955 "HPP: try PI\n");break;
3956 case XML_PARSER_SYSTEM_LITERAL:
3957 xmlGenericError(xmlGenericErrorContext,
3958 "HPP: try SYSTEM_LITERAL\n");break;
3959 }
3960#endif
3961
3962 while (1) {
3963
3964 in = ctxt->input;
3965 if (in == NULL) break;
3966 if (in->buf == NULL)
3967 avail = in->length - (in->cur - in->base);
3968 else
3969 avail = in->buf->buffer->use - (in->cur - in->base);
3970 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003971 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003972 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3973 /*
3974 * SAX: end of the document processing.
3975 */
3976 ctxt->instate = XML_PARSER_EOF;
3977 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3978 ctxt->sax->endDocument(ctxt->userData);
3979 }
3980 }
3981 if (avail < 1)
3982 goto done;
3983 switch (ctxt->instate) {
3984 case XML_PARSER_EOF:
3985 /*
3986 * Document parsing is done !
3987 */
3988 goto done;
3989 case XML_PARSER_START:
3990 /*
3991 * Very first chars read from the document flow.
3992 */
3993 cur = in->cur[0];
3994 if (IS_BLANK(cur)) {
3995 SKIP_BLANKS;
3996 if (in->buf == NULL)
3997 avail = in->length - (in->cur - in->base);
3998 else
3999 avail = in->buf->buffer->use - (in->cur - in->base);
4000 }
4001 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4002 ctxt->sax->setDocumentLocator(ctxt->userData,
4003 &xmlDefaultSAXLocator);
4004 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4005 (!ctxt->disableSAX))
4006 ctxt->sax->startDocument(ctxt->userData);
4007
4008 cur = in->cur[0];
4009 next = in->cur[1];
4010 if ((cur == '<') && (next == '!') &&
4011 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4012 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4013 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4014 (UPP(8) == 'E')) {
4015 if ((!terminate) &&
4016 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4017 goto done;
4018#ifdef DEBUG_PUSH
4019 xmlGenericError(xmlGenericErrorContext,
4020 "HPP: Parsing internal subset\n");
4021#endif
4022 htmlParseDocTypeDecl(ctxt);
4023 ctxt->instate = XML_PARSER_PROLOG;
4024#ifdef DEBUG_PUSH
4025 xmlGenericError(xmlGenericErrorContext,
4026 "HPP: entering PROLOG\n");
4027#endif
4028 } else {
4029 ctxt->instate = XML_PARSER_MISC;
4030 }
4031#ifdef DEBUG_PUSH
4032 xmlGenericError(xmlGenericErrorContext,
4033 "HPP: entering MISC\n");
4034#endif
4035 break;
4036 case XML_PARSER_MISC:
4037 SKIP_BLANKS;
4038 if (in->buf == NULL)
4039 avail = in->length - (in->cur - in->base);
4040 else
4041 avail = in->buf->buffer->use - (in->cur - in->base);
4042 if (avail < 2)
4043 goto done;
4044 cur = in->cur[0];
4045 next = in->cur[1];
4046 if ((cur == '<') && (next == '!') &&
4047 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4048 if ((!terminate) &&
4049 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4050 goto done;
4051#ifdef DEBUG_PUSH
4052 xmlGenericError(xmlGenericErrorContext,
4053 "HPP: Parsing Comment\n");
4054#endif
4055 htmlParseComment(ctxt);
4056 ctxt->instate = XML_PARSER_MISC;
4057 } else if ((cur == '<') && (next == '!') &&
4058 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4059 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4060 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4061 (UPP(8) == 'E')) {
4062 if ((!terminate) &&
4063 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4064 goto done;
4065#ifdef DEBUG_PUSH
4066 xmlGenericError(xmlGenericErrorContext,
4067 "HPP: Parsing internal subset\n");
4068#endif
4069 htmlParseDocTypeDecl(ctxt);
4070 ctxt->instate = XML_PARSER_PROLOG;
4071#ifdef DEBUG_PUSH
4072 xmlGenericError(xmlGenericErrorContext,
4073 "HPP: entering PROLOG\n");
4074#endif
4075 } else if ((cur == '<') && (next == '!') &&
4076 (avail < 9)) {
4077 goto done;
4078 } else {
4079 ctxt->instate = XML_PARSER_START_TAG;
4080#ifdef DEBUG_PUSH
4081 xmlGenericError(xmlGenericErrorContext,
4082 "HPP: entering START_TAG\n");
4083#endif
4084 }
4085 break;
4086 case XML_PARSER_PROLOG:
4087 SKIP_BLANKS;
4088 if (in->buf == NULL)
4089 avail = in->length - (in->cur - in->base);
4090 else
4091 avail = in->buf->buffer->use - (in->cur - in->base);
4092 if (avail < 2)
4093 goto done;
4094 cur = in->cur[0];
4095 next = in->cur[1];
4096 if ((cur == '<') && (next == '!') &&
4097 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4098 if ((!terminate) &&
4099 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4100 goto done;
4101#ifdef DEBUG_PUSH
4102 xmlGenericError(xmlGenericErrorContext,
4103 "HPP: Parsing Comment\n");
4104#endif
4105 htmlParseComment(ctxt);
4106 ctxt->instate = XML_PARSER_PROLOG;
4107 } else if ((cur == '<') && (next == '!') &&
4108 (avail < 4)) {
4109 goto done;
4110 } else {
4111 ctxt->instate = XML_PARSER_START_TAG;
4112#ifdef DEBUG_PUSH
4113 xmlGenericError(xmlGenericErrorContext,
4114 "HPP: entering START_TAG\n");
4115#endif
4116 }
4117 break;
4118 case XML_PARSER_EPILOG:
4119 if (in->buf == NULL)
4120 avail = in->length - (in->cur - in->base);
4121 else
4122 avail = in->buf->buffer->use - (in->cur - in->base);
4123 if (avail < 1)
4124 goto done;
4125 cur = in->cur[0];
4126 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004127 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004128 goto done;
4129 }
4130 if (avail < 2)
4131 goto done;
4132 next = in->cur[1];
4133 if ((cur == '<') && (next == '!') &&
4134 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4135 if ((!terminate) &&
4136 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4137 goto done;
4138#ifdef DEBUG_PUSH
4139 xmlGenericError(xmlGenericErrorContext,
4140 "HPP: Parsing Comment\n");
4141#endif
4142 htmlParseComment(ctxt);
4143 ctxt->instate = XML_PARSER_EPILOG;
4144 } else if ((cur == '<') && (next == '!') &&
4145 (avail < 4)) {
4146 goto done;
4147 } else {
4148 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004149 ctxt->wellFormed = 0;
4150 ctxt->instate = XML_PARSER_EOF;
4151#ifdef DEBUG_PUSH
4152 xmlGenericError(xmlGenericErrorContext,
4153 "HPP: entering EOF\n");
4154#endif
4155 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4156 ctxt->sax->endDocument(ctxt->userData);
4157 goto done;
4158 }
4159 break;
4160 case XML_PARSER_START_TAG: {
4161 xmlChar *name, *oldname;
4162 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004163 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004164
4165 if (avail < 2)
4166 goto done;
4167 cur = in->cur[0];
4168 if (cur != '<') {
4169 ctxt->instate = XML_PARSER_CONTENT;
4170#ifdef DEBUG_PUSH
4171 xmlGenericError(xmlGenericErrorContext,
4172 "HPP: entering CONTENT\n");
4173#endif
4174 break;
4175 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004176 if (in->cur[1] == '/') {
4177 ctxt->instate = XML_PARSER_END_TAG;
4178 ctxt->checkIndex = 0;
4179#ifdef DEBUG_PUSH
4180 xmlGenericError(xmlGenericErrorContext,
4181 "HPP: entering END_TAG\n");
4182#endif
4183 break;
4184 }
Owen Taylor3473f882001-02-23 17:55:21 +00004185 if ((!terminate) &&
4186 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4187 goto done;
4188
4189 oldname = xmlStrdup(ctxt->name);
4190 htmlParseStartTag(ctxt);
4191 name = ctxt->name;
4192#ifdef DEBUG
4193 if (oldname == NULL)
4194 xmlGenericError(xmlGenericErrorContext,
4195 "Start of element %s\n", name);
4196 else if (name == NULL)
4197 xmlGenericError(xmlGenericErrorContext,
4198 "Start of element failed, was %s\n",
4199 oldname);
4200 else
4201 xmlGenericError(xmlGenericErrorContext,
4202 "Start of element %s, was %s\n",
4203 name, oldname);
4204#endif
4205 if (((depth == ctxt->nameNr) &&
4206 (xmlStrEqual(oldname, ctxt->name))) ||
4207 (name == NULL)) {
4208 if (CUR == '>')
4209 NEXT;
4210 if (oldname != NULL)
4211 xmlFree(oldname);
4212 break;
4213 }
4214 if (oldname != NULL)
4215 xmlFree(oldname);
4216
4217 /*
4218 * Lookup the info for that element.
4219 */
4220 info = htmlTagLookup(name);
4221 if (info == NULL) {
4222 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4223 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4224 name);
4225 ctxt->wellFormed = 0;
4226 } else if (info->depr) {
4227 /***************************
4228 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4229 ctxt->sax->warning(ctxt->userData,
4230 "Tag %s is deprecated\n",
4231 name);
4232 ***************************/
4233 }
4234
4235 /*
4236 * Check for an Empty Element labelled the XML/SGML way
4237 */
4238 if ((CUR == '/') && (NXT(1) == '>')) {
4239 SKIP(2);
4240 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4241 ctxt->sax->endElement(ctxt->userData, name);
4242 oldname = htmlnamePop(ctxt);
4243#ifdef DEBUG
4244 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4245 oldname);
4246#endif
4247 if (oldname != NULL)
4248 xmlFree(oldname);
4249 ctxt->instate = XML_PARSER_CONTENT;
4250#ifdef DEBUG_PUSH
4251 xmlGenericError(xmlGenericErrorContext,
4252 "HPP: entering CONTENT\n");
4253#endif
4254 break;
4255 }
4256
4257 if (CUR == '>') {
4258 NEXT;
4259 } else {
4260 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4261 ctxt->sax->error(ctxt->userData,
4262 "Couldn't find end of Start Tag %s\n",
4263 name);
4264 ctxt->wellFormed = 0;
4265
4266 /*
4267 * end of parsing of this node.
4268 */
4269 if (xmlStrEqual(name, ctxt->name)) {
4270 nodePop(ctxt);
4271 oldname = htmlnamePop(ctxt);
4272#ifdef DEBUG
4273 xmlGenericError(xmlGenericErrorContext,
4274 "End of start tag problem: popping out %s\n", oldname);
4275#endif
4276 if (oldname != NULL)
4277 xmlFree(oldname);
4278 }
4279
4280 ctxt->instate = XML_PARSER_CONTENT;
4281#ifdef DEBUG_PUSH
4282 xmlGenericError(xmlGenericErrorContext,
4283 "HPP: entering CONTENT\n");
4284#endif
4285 break;
4286 }
4287
4288 /*
4289 * Check for an Empty Element from DTD definition
4290 */
4291 if ((info != NULL) && (info->empty)) {
4292 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4293 ctxt->sax->endElement(ctxt->userData, name);
4294 oldname = htmlnamePop(ctxt);
4295#ifdef DEBUG
4296 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4297#endif
4298 if (oldname != NULL)
4299 xmlFree(oldname);
4300 }
4301 ctxt->instate = XML_PARSER_CONTENT;
4302#ifdef DEBUG_PUSH
4303 xmlGenericError(xmlGenericErrorContext,
4304 "HPP: entering CONTENT\n");
4305#endif
4306 break;
4307 }
4308 case XML_PARSER_CONTENT: {
4309 long cons;
4310 /*
4311 * Handle preparsed entities and charRef
4312 */
4313 if (ctxt->token != 0) {
4314 xmlChar chr[2] = { 0 , 0 } ;
4315
4316 chr[0] = (xmlChar) ctxt->token;
4317 htmlCheckParagraph(ctxt);
4318 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4319 ctxt->sax->characters(ctxt->userData, chr, 1);
4320 ctxt->token = 0;
4321 ctxt->checkIndex = 0;
4322 }
4323 if ((avail == 1) && (terminate)) {
4324 cur = in->cur[0];
4325 if ((cur != '<') && (cur != '&')) {
4326 if (ctxt->sax != NULL) {
4327 if (IS_BLANK(cur)) {
4328 if (ctxt->sax->ignorableWhitespace != NULL)
4329 ctxt->sax->ignorableWhitespace(
4330 ctxt->userData, &cur, 1);
4331 } else {
4332 htmlCheckParagraph(ctxt);
4333 if (ctxt->sax->characters != NULL)
4334 ctxt->sax->characters(
4335 ctxt->userData, &cur, 1);
4336 }
4337 }
4338 ctxt->token = 0;
4339 ctxt->checkIndex = 0;
4340 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004341 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004342 }
Owen Taylor3473f882001-02-23 17:55:21 +00004343 }
4344 if (avail < 2)
4345 goto done;
4346 cur = in->cur[0];
4347 next = in->cur[1];
4348 cons = ctxt->nbChars;
4349 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4350 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4351 /*
4352 * Handle SCRIPT/STYLE separately
4353 */
4354 if ((!terminate) &&
4355 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4356 goto done;
4357 htmlParseScript(ctxt);
4358 if ((cur == '<') && (next == '/')) {
4359 ctxt->instate = XML_PARSER_END_TAG;
4360 ctxt->checkIndex = 0;
4361#ifdef DEBUG_PUSH
4362 xmlGenericError(xmlGenericErrorContext,
4363 "HPP: entering END_TAG\n");
4364#endif
4365 break;
4366 }
4367 } else {
4368 /*
4369 * Sometimes DOCTYPE arrives in the middle of the document
4370 */
4371 if ((cur == '<') && (next == '!') &&
4372 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4373 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4374 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4375 (UPP(8) == 'E')) {
4376 if ((!terminate) &&
4377 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4378 goto done;
4379 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4380 ctxt->sax->error(ctxt->userData,
4381 "Misplaced DOCTYPE declaration\n");
4382 ctxt->wellFormed = 0;
4383 htmlParseDocTypeDecl(ctxt);
4384 } else if ((cur == '<') && (next == '!') &&
4385 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4386 if ((!terminate) &&
4387 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4388 goto done;
4389#ifdef DEBUG_PUSH
4390 xmlGenericError(xmlGenericErrorContext,
4391 "HPP: Parsing Comment\n");
4392#endif
4393 htmlParseComment(ctxt);
4394 ctxt->instate = XML_PARSER_CONTENT;
4395 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4396 goto done;
4397 } else if ((cur == '<') && (next == '/')) {
4398 ctxt->instate = XML_PARSER_END_TAG;
4399 ctxt->checkIndex = 0;
4400#ifdef DEBUG_PUSH
4401 xmlGenericError(xmlGenericErrorContext,
4402 "HPP: entering END_TAG\n");
4403#endif
4404 break;
4405 } else if (cur == '<') {
4406 ctxt->instate = XML_PARSER_START_TAG;
4407 ctxt->checkIndex = 0;
4408#ifdef DEBUG_PUSH
4409 xmlGenericError(xmlGenericErrorContext,
4410 "HPP: entering START_TAG\n");
4411#endif
4412 break;
4413 } else if (cur == '&') {
4414 if ((!terminate) &&
4415 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4416 goto done;
4417#ifdef DEBUG_PUSH
4418 xmlGenericError(xmlGenericErrorContext,
4419 "HPP: Parsing Reference\n");
4420#endif
4421 /* TODO: check generation of subtrees if noent !!! */
4422 htmlParseReference(ctxt);
4423 } else {
4424 /* TODO Avoid the extra copy, handle directly !!!!!! */
4425 /*
4426 * Goal of the following test is :
4427 * - minimize calls to the SAX 'character' callback
4428 * when they are mergeable
4429 */
4430 if ((ctxt->inputNr == 1) &&
4431 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4432 if ((!terminate) &&
4433 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4434 goto done;
4435 }
4436 ctxt->checkIndex = 0;
4437#ifdef DEBUG_PUSH
4438 xmlGenericError(xmlGenericErrorContext,
4439 "HPP: Parsing char data\n");
4440#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004441 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004442 }
4443 }
4444 if (cons == ctxt->nbChars) {
4445 if (ctxt->node != NULL) {
4446 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4447 ctxt->sax->error(ctxt->userData,
4448 "detected an error in element content\n");
4449 ctxt->wellFormed = 0;
4450 }
4451 NEXT;
4452 break;
4453 }
4454
4455 break;
4456 }
4457 case XML_PARSER_END_TAG:
4458 if (avail < 2)
4459 goto done;
4460 if ((!terminate) &&
4461 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4462 goto done;
4463 htmlParseEndTag(ctxt);
4464 if (ctxt->nameNr == 0) {
4465 ctxt->instate = XML_PARSER_EPILOG;
4466 } else {
4467 ctxt->instate = XML_PARSER_CONTENT;
4468 }
4469 ctxt->checkIndex = 0;
4470#ifdef DEBUG_PUSH
4471 xmlGenericError(xmlGenericErrorContext,
4472 "HPP: entering CONTENT\n");
4473#endif
4474 break;
4475 case XML_PARSER_CDATA_SECTION:
4476 xmlGenericError(xmlGenericErrorContext,
4477 "HPP: internal error, state == CDATA\n");
4478 ctxt->instate = XML_PARSER_CONTENT;
4479 ctxt->checkIndex = 0;
4480#ifdef DEBUG_PUSH
4481 xmlGenericError(xmlGenericErrorContext,
4482 "HPP: entering CONTENT\n");
4483#endif
4484 break;
4485 case XML_PARSER_DTD:
4486 xmlGenericError(xmlGenericErrorContext,
4487 "HPP: internal error, state == DTD\n");
4488 ctxt->instate = XML_PARSER_CONTENT;
4489 ctxt->checkIndex = 0;
4490#ifdef DEBUG_PUSH
4491 xmlGenericError(xmlGenericErrorContext,
4492 "HPP: entering CONTENT\n");
4493#endif
4494 break;
4495 case XML_PARSER_COMMENT:
4496 xmlGenericError(xmlGenericErrorContext,
4497 "HPP: internal error, state == COMMENT\n");
4498 ctxt->instate = XML_PARSER_CONTENT;
4499 ctxt->checkIndex = 0;
4500#ifdef DEBUG_PUSH
4501 xmlGenericError(xmlGenericErrorContext,
4502 "HPP: entering CONTENT\n");
4503#endif
4504 break;
4505 case XML_PARSER_PI:
4506 xmlGenericError(xmlGenericErrorContext,
4507 "HPP: internal error, state == PI\n");
4508 ctxt->instate = XML_PARSER_CONTENT;
4509 ctxt->checkIndex = 0;
4510#ifdef DEBUG_PUSH
4511 xmlGenericError(xmlGenericErrorContext,
4512 "HPP: entering CONTENT\n");
4513#endif
4514 break;
4515 case XML_PARSER_ENTITY_DECL:
4516 xmlGenericError(xmlGenericErrorContext,
4517 "HPP: internal error, state == ENTITY_DECL\n");
4518 ctxt->instate = XML_PARSER_CONTENT;
4519 ctxt->checkIndex = 0;
4520#ifdef DEBUG_PUSH
4521 xmlGenericError(xmlGenericErrorContext,
4522 "HPP: entering CONTENT\n");
4523#endif
4524 break;
4525 case XML_PARSER_ENTITY_VALUE:
4526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: internal error, state == ENTITY_VALUE\n");
4528 ctxt->instate = XML_PARSER_CONTENT;
4529 ctxt->checkIndex = 0;
4530#ifdef DEBUG_PUSH
4531 xmlGenericError(xmlGenericErrorContext,
4532 "HPP: entering DTD\n");
4533#endif
4534 break;
4535 case XML_PARSER_ATTRIBUTE_VALUE:
4536 xmlGenericError(xmlGenericErrorContext,
4537 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4538 ctxt->instate = XML_PARSER_START_TAG;
4539 ctxt->checkIndex = 0;
4540#ifdef DEBUG_PUSH
4541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: entering START_TAG\n");
4543#endif
4544 break;
4545 case XML_PARSER_SYSTEM_LITERAL:
4546 xmlGenericError(xmlGenericErrorContext,
4547 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4548 ctxt->instate = XML_PARSER_CONTENT;
4549 ctxt->checkIndex = 0;
4550#ifdef DEBUG_PUSH
4551 xmlGenericError(xmlGenericErrorContext,
4552 "HPP: entering CONTENT\n");
4553#endif
4554 break;
4555 case XML_PARSER_IGNORE:
4556 xmlGenericError(xmlGenericErrorContext,
4557 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4558 ctxt->instate = XML_PARSER_CONTENT;
4559 ctxt->checkIndex = 0;
4560#ifdef DEBUG_PUSH
4561 xmlGenericError(xmlGenericErrorContext,
4562 "HPP: entering CONTENT\n");
4563#endif
4564 break;
4565 }
4566 }
4567done:
4568 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004569 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004570 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4571 /*
4572 * SAX: end of the document processing.
4573 */
4574 ctxt->instate = XML_PARSER_EOF;
4575 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4576 ctxt->sax->endDocument(ctxt->userData);
4577 }
4578 }
4579 if ((ctxt->myDoc != NULL) &&
4580 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4581 (ctxt->instate == XML_PARSER_EPILOG))) {
4582 xmlDtdPtr dtd;
4583 dtd = xmlGetIntSubset(ctxt->myDoc);
4584 if (dtd == NULL)
4585 ctxt->myDoc->intSubset =
4586 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4587 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4588 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4589 }
4590#ifdef DEBUG_PUSH
4591 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4592#endif
4593 return(ret);
4594}
4595
4596/**
Owen Taylor3473f882001-02-23 17:55:21 +00004597 * htmlParseChunk:
4598 * @ctxt: an XML parser context
4599 * @chunk: an char array
4600 * @size: the size in byte of the chunk
4601 * @terminate: last chunk indicator
4602 *
4603 * Parse a Chunk of memory
4604 *
4605 * Returns zero if no error, the xmlParserErrors otherwise.
4606 */
4607int
4608htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4609 int terminate) {
4610 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4611 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4612 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4613 int cur = ctxt->input->cur - ctxt->input->base;
4614
4615 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4616 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4617 ctxt->input->cur = ctxt->input->base + cur;
4618#ifdef DEBUG_PUSH
4619 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4620#endif
4621
4622 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4623 htmlParseTryOrFinish(ctxt, terminate);
4624 } else if (ctxt->instate != XML_PARSER_EOF) {
4625 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4626 htmlParseTryOrFinish(ctxt, terminate);
4627 }
4628 if (terminate) {
4629 if ((ctxt->instate != XML_PARSER_EOF) &&
4630 (ctxt->instate != XML_PARSER_EPILOG) &&
4631 (ctxt->instate != XML_PARSER_MISC)) {
4632 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004633 ctxt->wellFormed = 0;
4634 }
4635 if (ctxt->instate != XML_PARSER_EOF) {
4636 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4637 ctxt->sax->endDocument(ctxt->userData);
4638 }
4639 ctxt->instate = XML_PARSER_EOF;
4640 }
4641 return((xmlParserErrors) ctxt->errNo);
4642}
4643
4644/************************************************************************
4645 * *
4646 * User entry points *
4647 * *
4648 ************************************************************************/
4649
4650/**
4651 * htmlCreatePushParserCtxt :
4652 * @sax: a SAX handler
4653 * @user_data: The user data returned on SAX callbacks
4654 * @chunk: a pointer to an array of chars
4655 * @size: number of chars in the array
4656 * @filename: an optional file name or URI
4657 * @enc: an optional encoding
4658 *
4659 * Create a parser context for using the HTML parser in push mode
4660 * To allow content encoding detection, @size should be >= 4
4661 * The value of @filename is used for fetching external entities
4662 * and error/warning reports.
4663 *
4664 * Returns the new parser context or NULL
4665 */
4666htmlParserCtxtPtr
4667htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4668 const char *chunk, int size, const char *filename,
4669 xmlCharEncoding enc) {
4670 htmlParserCtxtPtr ctxt;
4671 htmlParserInputPtr inputStream;
4672 xmlParserInputBufferPtr buf;
4673
Daniel Veillardd0463562001-10-13 09:15:48 +00004674 xmlInitParser();
4675
Owen Taylor3473f882001-02-23 17:55:21 +00004676 buf = xmlAllocParserInputBuffer(enc);
4677 if (buf == NULL) return(NULL);
4678
4679 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4680 if (ctxt == NULL) {
4681 xmlFree(buf);
4682 return(NULL);
4683 }
4684 memset(ctxt, 0, sizeof(htmlParserCtxt));
4685 htmlInitParserCtxt(ctxt);
4686 if (sax != NULL) {
4687 if (ctxt->sax != &htmlDefaultSAXHandler)
4688 xmlFree(ctxt->sax);
4689 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4690 if (ctxt->sax == NULL) {
4691 xmlFree(buf);
4692 xmlFree(ctxt);
4693 return(NULL);
4694 }
4695 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4696 if (user_data != NULL)
4697 ctxt->userData = user_data;
4698 }
4699 if (filename == NULL) {
4700 ctxt->directory = NULL;
4701 } else {
4702 ctxt->directory = xmlParserGetDirectory(filename);
4703 }
4704
4705 inputStream = htmlNewInputStream(ctxt);
4706 if (inputStream == NULL) {
4707 xmlFreeParserCtxt(ctxt);
4708 return(NULL);
4709 }
4710
4711 if (filename == NULL)
4712 inputStream->filename = NULL;
4713 else
4714 inputStream->filename = xmlMemStrdup(filename);
4715 inputStream->buf = buf;
4716 inputStream->base = inputStream->buf->buffer->content;
4717 inputStream->cur = inputStream->buf->buffer->content;
4718
4719 inputPush(ctxt, inputStream);
4720
4721 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4722 (ctxt->input->buf != NULL)) {
4723 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4724#ifdef DEBUG_PUSH
4725 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4726#endif
4727 }
4728
4729 return(ctxt);
4730}
4731
4732/**
4733 * htmlSAXParseDoc :
4734 * @cur: a pointer to an array of xmlChar
4735 * @encoding: a free form C string describing the HTML document encoding, or NULL
4736 * @sax: the SAX handler block
4737 * @userData: if using SAX, this pointer will be provided on callbacks.
4738 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004739 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4740 * to handle parse events. If sax is NULL, fallback to the default DOM
4741 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004742 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004743 * Returns the resulting document tree unless SAX is NULL or the document is
4744 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004745 */
4746
4747htmlDocPtr
4748htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4749 htmlDocPtr ret;
4750 htmlParserCtxtPtr ctxt;
4751
Daniel Veillardd0463562001-10-13 09:15:48 +00004752 xmlInitParser();
4753
Owen Taylor3473f882001-02-23 17:55:21 +00004754 if (cur == NULL) return(NULL);
4755
4756
4757 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4758 if (ctxt == NULL) return(NULL);
4759 if (sax != NULL) {
4760 ctxt->sax = sax;
4761 ctxt->userData = userData;
4762 }
4763
4764 htmlParseDocument(ctxt);
4765 ret = ctxt->myDoc;
4766 if (sax != NULL) {
4767 ctxt->sax = NULL;
4768 ctxt->userData = NULL;
4769 }
4770 htmlFreeParserCtxt(ctxt);
4771
4772 return(ret);
4773}
4774
4775/**
4776 * htmlParseDoc :
4777 * @cur: a pointer to an array of xmlChar
4778 * @encoding: a free form C string describing the HTML document encoding, or NULL
4779 *
4780 * parse an HTML in-memory document and build a tree.
4781 *
4782 * Returns the resulting document tree
4783 */
4784
4785htmlDocPtr
4786htmlParseDoc(xmlChar *cur, const char *encoding) {
4787 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4788}
4789
4790
4791/**
4792 * htmlCreateFileParserCtxt :
4793 * @filename: the filename
4794 * @encoding: a free form C string describing the HTML document encoding, or NULL
4795 *
4796 * Create a parser context for a file content.
4797 * Automatic support for ZLIB/Compress compressed document is provided
4798 * by default if found at compile-time.
4799 *
4800 * Returns the new parser context or NULL
4801 */
4802htmlParserCtxtPtr
4803htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4804{
4805 htmlParserCtxtPtr ctxt;
4806 htmlParserInputPtr inputStream;
4807 xmlParserInputBufferPtr buf;
4808 /* htmlCharEncoding enc; */
4809 xmlChar *content, *content_line = (xmlChar *) "charset=";
4810
4811 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4812 if (buf == NULL) return(NULL);
4813
4814 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4815 if (ctxt == NULL) {
4816 perror("malloc");
4817 return(NULL);
4818 }
4819 memset(ctxt, 0, sizeof(htmlParserCtxt));
4820 htmlInitParserCtxt(ctxt);
4821 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4822 if (inputStream == NULL) {
4823 perror("malloc");
4824 xmlFree(ctxt);
4825 return(NULL);
4826 }
4827 memset(inputStream, 0, sizeof(htmlParserInput));
4828
4829 inputStream->filename = xmlMemStrdup(filename);
4830 inputStream->line = 1;
4831 inputStream->col = 1;
4832 inputStream->buf = buf;
4833 inputStream->directory = NULL;
4834
4835 inputStream->base = inputStream->buf->buffer->content;
4836 inputStream->cur = inputStream->buf->buffer->content;
4837 inputStream->free = NULL;
4838
4839 inputPush(ctxt, inputStream);
4840
4841 /* set encoding */
4842 if (encoding) {
4843 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4844 if (content) {
4845 strcpy ((char *)content, (char *)content_line);
4846 strcat ((char *)content, (char *)encoding);
4847 htmlCheckEncoding (ctxt, content);
4848 xmlFree (content);
4849 }
4850 }
4851
4852 return(ctxt);
4853}
4854
4855/**
4856 * htmlSAXParseFile :
4857 * @filename: the filename
4858 * @encoding: a free form C string describing the HTML document encoding, or NULL
4859 * @sax: the SAX handler block
4860 * @userData: if using SAX, this pointer will be provided on callbacks.
4861 *
4862 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4863 * compressed document is provided by default if found at compile-time.
4864 * It use the given SAX function block to handle the parsing callback.
4865 * If sax is NULL, fallback to the default DOM tree building routines.
4866 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004867 * Returns the resulting document tree unless SAX is NULL or the document is
4868 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004869 */
4870
4871htmlDocPtr
4872htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4873 void *userData) {
4874 htmlDocPtr ret;
4875 htmlParserCtxtPtr ctxt;
4876 htmlSAXHandlerPtr oldsax = NULL;
4877
Daniel Veillardd0463562001-10-13 09:15:48 +00004878 xmlInitParser();
4879
Owen Taylor3473f882001-02-23 17:55:21 +00004880 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4881 if (ctxt == NULL) return(NULL);
4882 if (sax != NULL) {
4883 oldsax = ctxt->sax;
4884 ctxt->sax = sax;
4885 ctxt->userData = userData;
4886 }
4887
4888 htmlParseDocument(ctxt);
4889
4890 ret = ctxt->myDoc;
4891 if (sax != NULL) {
4892 ctxt->sax = oldsax;
4893 ctxt->userData = NULL;
4894 }
4895 htmlFreeParserCtxt(ctxt);
4896
4897 return(ret);
4898}
4899
4900/**
4901 * htmlParseFile :
4902 * @filename: the filename
4903 * @encoding: a free form C string describing the HTML document encoding, or NULL
4904 *
4905 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4906 * compressed document is provided by default if found at compile-time.
4907 *
4908 * Returns the resulting document tree
4909 */
4910
4911htmlDocPtr
4912htmlParseFile(const char *filename, const char *encoding) {
4913 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4914}
4915
4916/**
4917 * htmlHandleOmittedElem:
4918 * @val: int 0 or 1
4919 *
4920 * Set and return the previous value for handling HTML omitted tags.
4921 *
4922 * Returns the last value for 0 for no handling, 1 for auto insertion.
4923 */
4924
4925int
4926htmlHandleOmittedElem(int val) {
4927 int old = htmlOmittedDefaultValue;
4928
4929 htmlOmittedDefaultValue = val;
4930 return(old);
4931}
4932
4933#endif /* LIBXML_HTML_ENABLED */