blob: af941a0ec3d90fb90a786f9d0ab7404da8efae2f [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000043#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000044
45#define HTML_MAX_NAMELEN 1000
46#define HTML_PARSER_BIG_BUFFER_SIZE 1000
47#define HTML_PARSER_BUFFER_SIZE 100
48
49/* #define DEBUG */
50/* #define DEBUG_PUSH */
51
Daniel Veillard22090732001-07-16 00:06:07 +000052static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000053
Daniel Veillard56a4cb82001-03-24 17:00:36 +000054xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
55 xmlChar end, xmlChar end2, xmlChar end3);
56
57/************************************************************************
58 * *
Owen Taylor3473f882001-02-23 17:55:21 +000059 * Parser stacks related functions and macros *
60 * *
61 ************************************************************************/
62
/*
 * Generic function for accessing stacks in the Parser Context
 *
 * PUSH_AND_POP(scope, type, name) expands to two functions working on the
 * ctxt->name##Tab / ctxt->name##Nr / ctxt->name##Max triple:
 *
 *   html<name>Push(ctxt, value)  stores value on top of the stack and makes
 *                                it the current ctxt->name. The table is
 *                                doubled when full. Returns the index used
 *                                for the value, or 0 if growing the table
 *                                failed (the value is then not pushed).
 *   html<name>Pop(ctxt)          removes and returns the top value, making
 *                                the entry below (or NULL) the current
 *                                ctxt->name. Returns 0 on an empty stack.
 *
 * The reallocation goes through a temporary so that, on failure, the
 * existing table and its recorded capacity are left untouched instead of
 * being lost.
 */

#define PUSH_AND_POP(scope, type, name)					\
scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) {	\
    if (ctxt->name##Nr >= ctxt->name##Max) {				\
        type *html##name##Tmp;						\
        int html##name##NewMax = ctxt->name##Max * 2;			\
        html##name##Tmp = (type *) xmlRealloc(ctxt->name##Tab,		\
                html##name##NewMax * sizeof(ctxt->name##Tab[0]));	\
        if (html##name##Tmp == NULL) {					\
            xmlGenericError(xmlGenericErrorContext,			\
                    "realloc failed !\n");				\
            return(0);							\
        }								\
        ctxt->name##Tab = html##name##Tmp;				\
        ctxt->name##Max = html##name##NewMax;				\
    }									\
    ctxt->name##Tab[ctxt->name##Nr] = value;				\
    ctxt->name = value;							\
    return(ctxt->name##Nr++);						\
}									\
scope type html##name##Pop(htmlParserCtxtPtr ctxt) {			\
    type ret;								\
    if (ctxt->name##Nr < 0) return(0);					\
    ctxt->name##Nr--;							\
    if (ctxt->name##Nr < 0) return(0);					\
    if (ctxt->name##Nr > 0)						\
        ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1];		\
    else								\
        ctxt->name = NULL;						\
    ret = ctxt->name##Tab[ctxt->name##Nr];				\
    ctxt->name##Tab[ctxt->name##Nr] = 0;				\
    return(ret);							\
}
96
Daniel Veillard56a4cb82001-03-24 17:00:36 +000097/* PUSH_AND_POP(static, xmlNodePtr, node) */
98PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000099
100/*
101 * Macros for accessing the content. Those should be used only by the parser,
102 * and not exported.
103 *
104 * Dirty macros, i.e. one need to make assumption on the context to use them
105 *
106 * CUR_PTR return the current pointer to the xmlChar to be parsed.
107 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
108 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
109 * in UNICODE mode. This should be used internally by the parser
110 * only to compare to ASCII values otherwise it would break when
111 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
114 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
115 * it should be used only to compare on ASCII based substring.
116 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
117 * strings within the parser.
118 *
119 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
120 *
121 * CURRENT Returns the current char value, with the full decoding of
122 * UTF-8 if we are using this mode. It returns an int.
123 * NEXT Skip to the next character, this does the proper decoding
124 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
125 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
126 */
127
/*
 * All the macros below expect a variable named `ctxt` (the parser context)
 * to be in scope at the point of use.
 *
 * Multi-operation macros are fully parenthesized (or wrapped in
 * do { } while (0)) and macro arguments are parenthesized, so that they
 * behave as a single expression/statement wherever they are expanded.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * NEXTL(l): advance the input by l bytes (one character), keeping the
 * line/column counters up to date and dropping any pending token.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * COPY_BUF(l,b,i,v): append character v, whose encoded length is l, to
 * buffer b at index i and advance i. Single byte characters are stored
 * directly, the others go through xmlCopyChar().
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
176
177/**
178 * htmlCurrentChar:
179 * @ctxt: the HTML parser context
180 * @len: pointer to the length of the char read
181 *
182 * The current char value, if using UTF-8 this may actaully span multiple
183 * bytes in the input buffer. Implement the end of line normalization:
184 * 2.11 End-of-Line Handling
185 * If the encoding is unspecified, in the case we find an ISO-Latin-1
186 * char, then the encoding converter is plugged in automatically.
187 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000188 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000189 */
190
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000191static int
Owen Taylor3473f882001-02-23 17:55:21 +0000192htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
193 if (ctxt->instate == XML_PARSER_EOF)
194 return(0);
195
196 if (ctxt->token != 0) {
197 *len = 0;
198 return(ctxt->token);
199 }
200 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
201 /*
202 * We are supposed to handle UTF8, check it's valid
203 * From rfc2044: encoding of the Unicode values on UTF-8:
204 *
205 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
206 * 0000 0000-0000 007F 0xxxxxxx
207 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
208 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
209 *
210 * Check for the 0x110000 limit too
211 */
212 const unsigned char *cur = ctxt->input->cur;
213 unsigned char c;
214 unsigned int val;
215
216 c = *cur;
217 if (c & 0x80) {
218 if (cur[1] == 0)
219 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
220 if ((cur[1] & 0xc0) != 0x80)
221 goto encoding_error;
222 if ((c & 0xe0) == 0xe0) {
223
224 if (cur[2] == 0)
225 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
226 if ((cur[2] & 0xc0) != 0x80)
227 goto encoding_error;
228 if ((c & 0xf0) == 0xf0) {
229 if (cur[3] == 0)
230 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
231 if (((c & 0xf8) != 0xf0) ||
232 ((cur[3] & 0xc0) != 0x80))
233 goto encoding_error;
234 /* 4-byte code */
235 *len = 4;
236 val = (cur[0] & 0x7) << 18;
237 val |= (cur[1] & 0x3f) << 12;
238 val |= (cur[2] & 0x3f) << 6;
239 val |= cur[3] & 0x3f;
240 } else {
241 /* 3-byte code */
242 *len = 3;
243 val = (cur[0] & 0xf) << 12;
244 val |= (cur[1] & 0x3f) << 6;
245 val |= cur[2] & 0x3f;
246 }
247 } else {
248 /* 2-byte code */
249 *len = 2;
250 val = (cur[0] & 0x1f) << 6;
251 val |= cur[1] & 0x3f;
252 }
253 if (!IS_CHAR(val)) {
254 ctxt->errNo = XML_ERR_INVALID_ENCODING;
255 if ((ctxt->sax != NULL) &&
256 (ctxt->sax->error != NULL))
257 ctxt->sax->error(ctxt->userData,
258 "Char 0x%X out of allowed range\n", val);
259 ctxt->wellFormed = 0;
260 ctxt->disableSAX = 1;
261 }
262 return(val);
263 } else {
264 /* 1-byte code */
265 *len = 1;
266 return((int) *ctxt->input->cur);
267 }
268 }
269 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000270 * Assume it's a fixed length encoding (1) with
Owen Taylor3473f882001-02-23 17:55:21 +0000271 * a compatibke encoding for the ASCII set, since
272 * XML constructs only use < 128 chars
273 */
274 *len = 1;
275 if ((int) *ctxt->input->cur < 0x80)
276 return((int) *ctxt->input->cur);
277
278 /*
279 * Humm this is bad, do an automatic flow conversion
280 */
281 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
282 ctxt->charset = XML_CHAR_ENCODING_UTF8;
283 return(xmlCurrentChar(ctxt, len));
284
285encoding_error:
286 /*
287 * If we detect an UTF8 error that probably mean that the
288 * input encoding didn't get properly advertized in the
289 * declaration header. Report the error and switch the encoding
290 * to ISO-Latin-1 (if you don't like this policy, just declare the
291 * encoding !)
292 */
293 ctxt->errNo = XML_ERR_INVALID_ENCODING;
294 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
295 ctxt->sax->error(ctxt->userData,
296 "Input is not proper UTF-8, indicate encoding !\n");
297 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
298 ctxt->input->cur[0], ctxt->input->cur[1],
299 ctxt->input->cur[2], ctxt->input->cur[3]);
300 }
301
302 ctxt->charset = XML_CHAR_ENCODING_8859_1;
303 *len = 1;
304 return((int) *ctxt->input->cur);
305}
306
307/**
Owen Taylor3473f882001-02-23 17:55:21 +0000308 * htmlSkipBlankChars:
309 * @ctxt: the HTML parser context
310 *
311 * skip all blanks character found at that point in the input streams.
312 *
313 * Returns the number of space chars skipped
314 */
315
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000316static int
Owen Taylor3473f882001-02-23 17:55:21 +0000317htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
318 int res = 0;
319
320 while (IS_BLANK(*(ctxt->input->cur))) {
321 if ((*ctxt->input->cur == 0) &&
322 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
323 xmlPopInput(ctxt);
324 } else {
325 if (*(ctxt->input->cur) == '\n') {
326 ctxt->input->line++; ctxt->input->col = 1;
327 } else ctxt->input->col++;
328 ctxt->input->cur++;
329 ctxt->nbChars++;
330 if (*ctxt->input->cur == 0)
331 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
332 }
333 res++;
334 }
335 return(res);
336}
337
338
339
340/************************************************************************
341 * *
342 * The list of HTML elements and their properties *
343 * *
344 ************************************************************************/
345
346/*
347 * Start Tag: 1 means the start tag can be ommited
348 * End Tag: 1 means the end tag can be ommited
349 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000350 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000351 * Depr: this element is deprecated
352 * DTD: 1 means that this element is valid only in the Loose DTD
353 * 2 means that this element is valid only in the Frameset DTD
354 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000355 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000356 */
Daniel Veillard22090732001-07-16 00:06:07 +0000357static const htmlElemDesc
358html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000359{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
360{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
361{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
362{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
363{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
364{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
365{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
366{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
367{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
368{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
369{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
370{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
371{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
372{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
373{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
374{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
375{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
376{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
377{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
378{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
379{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
380{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
381{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
382{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
383{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
384{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
385{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
386{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
387{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
388{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
389{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
390{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
391{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
392{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
393{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
394{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
400{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
401{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
402{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
403{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
404{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
405{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
406{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
407{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
408{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
409{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
410{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
411{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
412{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
413{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
414{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
415{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
416{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
417{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
418{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
419{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
420{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
421{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
422{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
423{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
424{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
425{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
426{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
427{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
428{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
429{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
430{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
431{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
432{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
433{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
434{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
435{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
436{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
437{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
438{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
439{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
440{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
441{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
442{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
443{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
444{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
445{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
446{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
447{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
448{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
449{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000450};
451
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL terminated groups. The first string of
 * a group is the name of a start tag, the following strings are the names
 * of the open elements this start tag closes. An extra NULL marks the end
 * of the whole list.
 */
static const char *htmlStartClose[] = {
"form",
    "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
    "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
    NULL,
"head",
    "p", NULL,
"title",
    "p", NULL,
"body",
    "head", "style", "link", "title", "p", NULL,
"li",
    "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
    "listing", "xmp", "head", "li", NULL,
"hr",
    "p", "head", NULL,
"h1",
    "p", "head", NULL,
"h2",
    "p", "head", NULL,
"h3",
    "p", "head", NULL,
"h4",
    "p", "head", NULL,
"h5",
    "p", "head", NULL,
"h6",
    "p", "head", NULL,
"dir",
    "p", "head", NULL,
"address",
    "p", "head", "ul", NULL,
"pre",
    "p", "head", "ul", NULL,
"listing",
    "p", "head", NULL,
"xmp",
    "p", "head", NULL,
"blockquote",
    "p", "head", NULL,
"dl",
    "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
    "head", NULL,
"dt",
    "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
    "dd", NULL,
"dd",
    "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
    "dt", NULL,
"ul",
    "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
    "xmp", NULL,
"ol",
    "p", "head", "ul", NULL,
"menu",
    "p", "head", "ul", NULL,
"p",
    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",
    "p", "head", NULL,
"noscript",
    "p", "head", NULL,
"center",
    "font", "b", "i", "p", "head", NULL,
"a",
    "a", NULL,
"caption",
    "p", NULL,
"colgroup",
    "caption", "colgroup", "col", "p", NULL,
"col",
    "caption", "col", "p", NULL,
"table",
    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
    "xmp", "a", NULL,
"th",
    "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",
    "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",
    "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",
    "caption", "col", "colgroup", NULL,
"tfoot",
    "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
    "p", NULL,
"tbody",
    "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
    "tbody", "p", NULL,
"optgroup",
    "option", NULL,
"option",
    "option", NULL,
"fieldset",
    "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
    "listing", "xmp", "a", NULL,
NULL
};
511
/*
 * NULL terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly, and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
525
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event attributes of HTML 4).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", which never matched */
    "onchange",
    "onselect"
};
551
/*
 * Priorities used by the HTML parser when it meets an unexpected end tag
 * in a broken page: an end tag may only close open elements whose
 * priority is lower than or equal to its own.
 *
 * The table ends with an entry whose name is NULL; its priority is the
 * one applied to every element not listed explicitly.
 */

typedef struct {
    const char *name;
    int priority;
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000579
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000580static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000581static int htmlStartCloseIndexinitialized = 0;
582
583/************************************************************************
584 * *
585 * functions to handle HTML specific data *
586 * *
587 ************************************************************************/
588
589/**
590 * htmlInitAutoClose:
591 *
592 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
593 * This is not reentrant. Call xmlInitParser() once before processing in
594 * case of use in multithreaded programs.
595 */
596void
597htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000598 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000599
600 if (htmlStartCloseIndexinitialized) return;
601
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000602 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
603 indx = 0;
604 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
605 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000606 while (htmlStartClose[i] != NULL) i++;
607 i++;
608 }
609 htmlStartCloseIndexinitialized = 1;
610}
611
612/**
613 * htmlTagLookup:
614 * @tag: The tag name in lowercase
615 *
616 * Lookup the HTML tag in the ElementTable
617 *
618 * Returns the related htmlElemDescPtr or NULL if not found.
619 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000620const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000621htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000622 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000623
624 for (i = 0; i < (sizeof(html40ElementTable) /
625 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000626 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000627 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000628 }
629 return(NULL);
630}
631
632/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000633 * htmlGetEndPriority:
634 * @name: The name of the element to look up the priority for.
635 *
636 * Return value: The "endtag" priority.
637 **/
638static int
639htmlGetEndPriority (const xmlChar *name) {
640 int i = 0;
641
642 while ((htmlEndPriority[i].name != NULL) &&
643 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
644 i++;
645
646 return(htmlEndPriority[i].priority);
647}
648
649/**
Owen Taylor3473f882001-02-23 17:55:21 +0000650 * htmlCheckAutoClose:
651 * @newtag: The new tag name
652 * @oldtag: The old tag name
653 *
654 * Checks wether the new tag is one of the registered valid tags for closing old.
655 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
656 *
657 * Returns 0 if no, 1 if yes.
658 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000659static int
Owen Taylor3473f882001-02-23 17:55:21 +0000660htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000661 int i, indx;
662 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000663
664 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
665
666 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000667 for (indx = 0; indx < 100;indx++) {
668 closed = htmlStartCloseIndex[indx];
669 if (closed == NULL) return(0);
670 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000671 }
672
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000673 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000674 i++;
675 while (htmlStartClose[i] != NULL) {
676 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
677 return(1);
678 }
679 i++;
680 }
681 return(0);
682}
683
684/**
685 * htmlAutoCloseOnClose:
686 * @ctxt: an HTML parser context
687 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000688 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000689 *
690 * The HTmL DtD allows an ending tag to implicitely close other tags.
691 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000692static void
Owen Taylor3473f882001-02-23 17:55:21 +0000693htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000694 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000695 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000696 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000697
698#ifdef DEBUG
699 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
700 for (i = 0;i < ctxt->nameNr;i++)
701 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
702#endif
703
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000704 priority = htmlGetEndPriority (newtag);
705
Owen Taylor3473f882001-02-23 17:55:21 +0000706 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000707
Owen Taylor3473f882001-02-23 17:55:21 +0000708 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000709 /*
710 * A missplaced endtagad can only close elements with lower
711 * or equal priority, so if we find an element with higher
712 * priority before we find an element with
713 * matching name, we just ignore this endtag
714 */
715 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000716 }
717 if (i < 0) return;
718
719 while (!xmlStrEqual(newtag, ctxt->name)) {
720 info = htmlTagLookup(ctxt->name);
721 if ((info == NULL) || (info->endTag == 1)) {
722#ifdef DEBUG
723 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
724#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000725 } else if (info->endTag == 3) {
726#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000727 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000728
Daniel Veillard56098d42001-04-24 12:51:09 +0000729#endif
730 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
731 ctxt->sax->error(ctxt->userData,
732 "Opening and ending tag mismatch: %s and %s\n",
733 newtag, ctxt->name);
734 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000735 }
736 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
737 ctxt->sax->endElement(ctxt->userData, ctxt->name);
738 oldname = htmlnamePop(ctxt);
739 if (oldname != NULL) {
740#ifdef DEBUG
741 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
742#endif
743 xmlFree(oldname);
744 }
745 }
746}
747
748/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000749 * htmlAutoCloseOnEnd:
750 * @ctxt: an HTML parser context
751 *
752 * Close all remaining tags at the end of the stream
753 */
754static void
755htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
756 xmlChar *oldname;
757 int i;
758
759 if (ctxt->nameNr == 0)
760 return;
761#ifdef DEBUG
762 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
763#endif
764
765 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
766#ifdef DEBUG
767 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
768#endif
769 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
770 ctxt->sax->endElement(ctxt->userData, ctxt->name);
771 oldname = htmlnamePop(ctxt);
772 if (oldname != NULL) {
773#ifdef DEBUG
774 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
775#endif
776 xmlFree(oldname);
777 }
778 }
779}
780
781/**
Owen Taylor3473f882001-02-23 17:55:21 +0000782 * htmlAutoClose:
783 * @ctxt: an HTML parser context
784 * @newtag: The new tag name or NULL
785 *
786 * The HTmL DtD allows a tag to implicitely close other tags.
787 * The list is kept in htmlStartClose array. This function is
788 * called when a new tag has been detected and generates the
789 * appropriates closes if possible/needed.
790 * If newtag is NULL this mean we are at the end of the resource
791 * and we should check
792 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000793static void
Owen Taylor3473f882001-02-23 17:55:21 +0000794htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
795 xmlChar *oldname;
796 while ((newtag != NULL) && (ctxt->name != NULL) &&
797 (htmlCheckAutoClose(newtag, ctxt->name))) {
798#ifdef DEBUG
799 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
800#endif
801 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
802 ctxt->sax->endElement(ctxt->userData, ctxt->name);
803 oldname = htmlnamePop(ctxt);
804 if (oldname != NULL) {
805#ifdef DEBUG
806 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
807#endif
808 xmlFree(oldname);
809 }
810 }
811 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000812 htmlAutoCloseOnEnd(ctxt);
813 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000814 }
815 while ((newtag == NULL) && (ctxt->name != NULL) &&
816 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
817 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
818 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
819#ifdef DEBUG
820 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
821#endif
822 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
823 ctxt->sax->endElement(ctxt->userData, ctxt->name);
824 oldname = htmlnamePop(ctxt);
825 if (oldname != NULL) {
826#ifdef DEBUG
827 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
828#endif
829 xmlFree(oldname);
830 }
831 }
832
833}
834
835/**
836 * htmlAutoCloseTag:
837 * @doc: the HTML document
838 * @name: The tag name
839 * @elem: the HTML element
840 *
841 * The HTmL DtD allows a tag to implicitely close other tags.
842 * The list is kept in htmlStartClose array. This function checks
843 * if the element or one of it's children would autoclose the
844 * given tag.
845 *
846 * Returns 1 if autoclose, 0 otherwise
847 */
848int
849htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
850 htmlNodePtr child;
851
852 if (elem == NULL) return(1);
853 if (xmlStrEqual(name, elem->name)) return(0);
854 if (htmlCheckAutoClose(elem->name, name)) return(1);
855 child = elem->children;
856 while (child != NULL) {
857 if (htmlAutoCloseTag(doc, name, child)) return(1);
858 child = child->next;
859 }
860 return(0);
861}
862
863/**
864 * htmlIsAutoClosed:
865 * @doc: the HTML document
866 * @elem: the HTML element
867 *
868 * The HTmL DtD allows a tag to implicitely close other tags.
869 * The list is kept in htmlStartClose array. This function checks
870 * if a tag is autoclosed by one of it's child
871 *
872 * Returns 1 if autoclosed, 0 otherwise
873 */
874int
875htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
876 htmlNodePtr child;
877
878 if (elem == NULL) return(1);
879 child = elem->children;
880 while (child != NULL) {
881 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
882 child = child->next;
883 }
884 return(0);
885}
886
887/**
888 * htmlCheckImplied:
889 * @ctxt: an HTML parser context
890 * @newtag: The new tag name
891 *
892 * The HTML DtD allows a tag to exists only implicitely
893 * called when a new tag has been detected and generates the
894 * appropriates implicit tags if missing
895 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000896static void
Owen Taylor3473f882001-02-23 17:55:21 +0000897htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
898 if (!htmlOmittedDefaultValue)
899 return;
900 if (xmlStrEqual(newtag, BAD_CAST"html"))
901 return;
902 if (ctxt->nameNr <= 0) {
903#ifdef DEBUG
904 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
905#endif
906 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
907 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
908 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
909 }
910 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
911 return;
912 if ((ctxt->nameNr <= 1) &&
913 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
914 (xmlStrEqual(newtag, BAD_CAST"style")) ||
915 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
916 (xmlStrEqual(newtag, BAD_CAST"link")) ||
917 (xmlStrEqual(newtag, BAD_CAST"title")) ||
918 (xmlStrEqual(newtag, BAD_CAST"base")))) {
919 /*
920 * dropped OBJECT ... i you put it first BODY will be
921 * assumed !
922 */
923#ifdef DEBUG
924 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
925#endif
926 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
927 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
928 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
929 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
930 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
931 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
932 int i;
933 for (i = 0;i < ctxt->nameNr;i++) {
934 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
935 return;
936 }
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
938 return;
939 }
940 }
941
942#ifdef DEBUG
943 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
944#endif
945 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
946 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
947 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
948 }
949}
950
951/**
952 * htmlCheckParagraph
953 * @ctxt: an HTML parser context
954 *
955 * Check whether a p element need to be implied before inserting
956 * characters in the current element.
957 *
958 * Returns 1 if a paragraph has been inserted, 0 if not and -1
959 * in case of error.
960 */
961
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000962static int
Owen Taylor3473f882001-02-23 17:55:21 +0000963htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
964 const xmlChar *tag;
965 int i;
966
967 if (ctxt == NULL)
968 return(-1);
969 tag = ctxt->name;
970 if (tag == NULL) {
971 htmlAutoClose(ctxt, BAD_CAST"p");
972 htmlCheckImplied(ctxt, BAD_CAST"p");
973 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
974 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
975 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
976 return(1);
977 }
978 if (!htmlOmittedDefaultValue)
979 return(0);
980 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
981 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
982#ifdef DEBUG
983 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
984#endif
985 htmlAutoClose(ctxt, BAD_CAST"p");
986 htmlCheckImplied(ctxt, BAD_CAST"p");
987 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
988 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
989 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
990 return(1);
991 }
992 }
993 return(0);
994}
995
996/**
997 * htmlIsScriptAttribute:
998 * @name: an attribute name
999 *
1000 * Check if an attribute is of content type Script
1001 *
1002 * Returns 1 is the attribute is a script 0 otherwise
1003 */
1004int
1005htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001006 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001007
1008 if (name == NULL)
1009 return(0);
1010 /*
1011 * all script attributes start with 'on'
1012 */
1013 if ((name[0] != 'o') || (name[1] != 'n'))
1014 return(0);
1015 for (i = 0;
1016 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1017 i++) {
1018 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1019 return(1);
1020 }
1021 return(0);
1022}
1023
1024/************************************************************************
1025 * *
1026 * The list of HTML predefined entities *
1027 * *
1028 ************************************************************************/
1029
1030
Daniel Veillard22090732001-07-16 00:06:07 +00001031static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001032/*
1033 * the 4 absolute ones, plus apostrophe.
1034 */
1035{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1036{ 38, "amp", "ampersand, U+0026 ISOnum" },
1037{ 39, "apos", "single quote" },
1038{ 60, "lt", "less-than sign, U+003C ISOnum" },
1039{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1040
1041/*
1042 * A bunch still in the 128-255 range
1043 * Replacing them depend really on the charset used.
1044 */
1045{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1046{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1047{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1048{ 163, "pound","pound sign, U+00A3 ISOnum" },
1049{ 164, "curren","currency sign, U+00A4 ISOnum" },
1050{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1051{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1052{ 167, "sect", "section sign, U+00A7 ISOnum" },
1053{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1054{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1055{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1056{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1057{ 172, "not", "not sign, U+00AC ISOnum" },
1058{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1059{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1060{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1061{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1062{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1063{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1064{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1065{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1066{ 181, "micro","micro sign, U+00B5 ISOnum" },
1067{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1068{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1069{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1070{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1071{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1072{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1073{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1074{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1075{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1076{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1077{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1078{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1079{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1080{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1081{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1082{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1083{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1084{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1085{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1086{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1087{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1088{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1089{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1090{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1091{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1092{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1093{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1094{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1095{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1096{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1097{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1098{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1099{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1100{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1101{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1102{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1103{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1104{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1105{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1106{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1107{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1108{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1109{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1110{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1111{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1112{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1113{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1114{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1115{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1116{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1117{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1118{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1119{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1120{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1121{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1122{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1123{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1124{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1125{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1126{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1127{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1128{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1129{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1130{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1131{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1132{ 247, "divide","division sign, U+00F7 ISOnum" },
1133{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1134{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1135{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1136{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1137{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1138{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1139{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1140{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1141
1142{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1143{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1144{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1145{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1146{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1147
1148/*
1149 * Anything below should really be kept as entities references
1150 */
1151{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1152
1153{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1154{ 732, "tilde","small tilde, U+02DC ISOdia" },
1155
1156{ 913, "Alpha","greek capital letter alpha, U+0391" },
1157{ 914, "Beta", "greek capital letter beta, U+0392" },
1158{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1159{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1160{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1161{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1162{ 919, "Eta", "greek capital letter eta, U+0397" },
1163{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1164{ 921, "Iota", "greek capital letter iota, U+0399" },
1165{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001166{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001167{ 924, "Mu", "greek capital letter mu, U+039C" },
1168{ 925, "Nu", "greek capital letter nu, U+039D" },
1169{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1170{ 927, "Omicron","greek capital letter omicron, U+039F" },
1171{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1172{ 929, "Rho", "greek capital letter rho, U+03A1" },
1173{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1174{ 932, "Tau", "greek capital letter tau, U+03A4" },
1175{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1176{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1177{ 935, "Chi", "greek capital letter chi, U+03A7" },
1178{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1179{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1180
1181{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1182{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1183{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1184{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1185{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1186{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1187{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1188{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1189{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1190{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1191{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1192{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1193{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1194{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1195{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1196{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1197{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1198{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1199{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1200{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1201{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1202{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1203{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1204{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1205{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1206{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1207{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1208{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1209
1210{ 8194, "ensp", "en space, U+2002 ISOpub" },
1211{ 8195, "emsp", "em space, U+2003 ISOpub" },
1212{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1213{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1214{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1215{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1216{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1217{ 8211, "ndash","en dash, U+2013 ISOpub" },
1218{ 8212, "mdash","em dash, U+2014 ISOpub" },
1219{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1220{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1221{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1222{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1223{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1224{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1225{ 8224, "dagger","dagger, U+2020 ISOpub" },
1226{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1227
1228{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1229{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1230
1231{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1232
1233{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1234{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1235
1236{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1237{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1238
1239{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1240{ 8260, "frasl","fraction slash, U+2044 NEW" },
1241
1242{ 8364, "euro", "euro sign, U+20AC NEW" },
1243
1244{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1245{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1246{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1247{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1248{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1249{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1250{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1251{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1252{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1253{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1254{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1255{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1256{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1257{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1258{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1259{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1260
1261{ 8704, "forall","for all, U+2200 ISOtech" },
1262{ 8706, "part", "partial differential, U+2202 ISOtech" },
1263{ 8707, "exist","there exists, U+2203 ISOtech" },
1264{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1265{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1266{ 8712, "isin", "element of, U+2208 ISOtech" },
1267{ 8713, "notin","not an element of, U+2209 ISOtech" },
1268{ 8715, "ni", "contains as member, U+220B ISOtech" },
1269{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1270{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1271{ 8722, "minus","minus sign, U+2212 ISOtech" },
1272{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1273{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1274{ 8733, "prop", "proportional to, U+221D ISOtech" },
1275{ 8734, "infin","infinity, U+221E ISOtech" },
1276{ 8736, "ang", "angle, U+2220 ISOamso" },
1277{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1278{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1279{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1280{ 8746, "cup", "union = cup, U+222A ISOtech" },
1281{ 8747, "int", "integral, U+222B ISOtech" },
1282{ 8756, "there4","therefore, U+2234 ISOtech" },
1283{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1284{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1285{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1286{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1287{ 8801, "equiv","identical to, U+2261 ISOtech" },
1288{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1289{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1290{ 8834, "sub", "subset of, U+2282 ISOtech" },
1291{ 8835, "sup", "superset of, U+2283 ISOtech" },
1292{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1293{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1294{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1295{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1296{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1297{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1298{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1299{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1300{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1301{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1302{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1303{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1304{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1305{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1306
1307{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1308{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1309{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1310{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1311
1312};
1313
1314/************************************************************************
1315 * *
1316 * Commodity functions to handle entities *
1317 * *
1318 ************************************************************************/
1319
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size.  The
 * reallocation goes through a temporary so that, when xmlRealloc()
 * fails, the still-valid old block can be released before the macro
 * makes the enclosing function return NULL (the old code overwrote the
 * only pointer to it and leaked the block).
 *
 * The expansion contains a return(NULL), so it can only be used inside
 * functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                                buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1331
1332/**
1333 * htmlEntityLookup:
1334 * @name: the entity name
1335 *
1336 * Lookup the given entity in EntitiesTable
1337 *
1338 * TODO: the linear scan is really ugly, an hash table is really needed.
1339 *
1340 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1341 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001342const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001343htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001344 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001345
1346 for (i = 0;i < (sizeof(html40EntitiesTable)/
1347 sizeof(html40EntitiesTable[0]));i++) {
1348 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1349#ifdef DEBUG
1350 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1351#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001352 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001353 }
1354 }
1355 return(NULL);
1356}
1357
1358/**
1359 * htmlEntityValueLookup:
1360 * @value: the entity's unicode value
1361 *
1362 * Lookup the given entity in EntitiesTable
1363 *
1364 * TODO: the linear scan is really ugly, an hash table is really needed.
1365 *
1366 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1367 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001368const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001369htmlEntityValueLookup(unsigned int value) {
1370 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001371#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001372 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001373#endif
1374
1375 for (i = 0;i < (sizeof(html40EntitiesTable)/
1376 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001377 if (html40EntitiesTable[i].value >= value) {
1378 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001379 break;
1380#ifdef DEBUG
1381 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1382#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001383 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001384 }
1385#ifdef DEBUG
1386 if (lv > html40EntitiesTable[i].value) {
1387 xmlGenericError(xmlGenericErrorContext,
1388 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1389 lv, html40EntitiesTable[i].value);
1390 }
1391 lv = html40EntitiesTable[i].value;
1392#endif
1393 }
1394 return(NULL);
1395}
1396
1397/**
1398 * UTF8ToHtml:
1399 * @out: a pointer to an array of bytes to store the result
1400 * @outlen: the length of @out
1401 * @in: a pointer to an array of UTF-8 chars
1402 * @inlen: the length of @in
1403 *
1404 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1405 * plus HTML entities block of chars out.
1406 *
1407 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1408 * The value of @inlen after return is the number of octets consumed
1409 * as the return value is positive, else unpredictiable.
1410 * The value of @outlen after return is the number of octets consumed.
1411 */
1412int
1413UTF8ToHtml(unsigned char* out, int *outlen,
1414 const unsigned char* in, int *inlen) {
1415 const unsigned char* processed = in;
1416 const unsigned char* outend;
1417 const unsigned char* outstart = out;
1418 const unsigned char* instart = in;
1419 const unsigned char* inend;
1420 unsigned int c, d;
1421 int trailing;
1422
1423 if (in == NULL) {
1424 /*
1425 * initialization nothing to do
1426 */
1427 *outlen = 0;
1428 *inlen = 0;
1429 return(0);
1430 }
1431 inend = in + (*inlen);
1432 outend = out + (*outlen);
1433 while (in < inend) {
1434 d = *in++;
1435 if (d < 0x80) { c= d; trailing= 0; }
1436 else if (d < 0xC0) {
1437 /* trailing byte in leading position */
1438 *outlen = out - outstart;
1439 *inlen = processed - instart;
1440 return(-2);
1441 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1442 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1443 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1444 else {
1445 /* no chance for this in Ascii */
1446 *outlen = out - outstart;
1447 *inlen = processed - instart;
1448 return(-2);
1449 }
1450
1451 if (inend - in < trailing) {
1452 break;
1453 }
1454
1455 for ( ; trailing; trailing--) {
1456 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1457 break;
1458 c <<= 6;
1459 c |= d & 0x3F;
1460 }
1461
1462 /* assertion: c is a single UTF-4 value */
1463 if (c < 0x80) {
1464 if (out + 1 >= outend)
1465 break;
1466 *out++ = c;
1467 } else {
1468 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001469 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001470
1471 /*
1472 * Try to lookup a predefined HTML entity for it
1473 */
1474
1475 ent = htmlEntityValueLookup(c);
1476 if (ent == NULL) {
1477 /* no chance for this in Ascii */
1478 *outlen = out - outstart;
1479 *inlen = processed - instart;
1480 return(-2);
1481 }
1482 len = strlen(ent->name);
1483 if (out + 2 + len >= outend)
1484 break;
1485 *out++ = '&';
1486 memcpy(out, ent->name, len);
1487 out += len;
1488 *out++ = ';';
1489 }
1490 processed = in;
1491 }
1492 *outlen = out - outstart;
1493 *inlen = processed - instart;
1494 return(0);
1495}
1496
1497/**
1498 * htmlEncodeEntities:
1499 * @out: a pointer to an array of bytes to store the result
1500 * @outlen: the length of @out
1501 * @in: a pointer to an array of UTF-8 chars
1502 * @inlen: the length of @in
1503 * @quoteChar: the quote character to escape (' or ") or zero.
1504 *
1505 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1506 * plus HTML entities block of chars out.
1507 *
1508 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1509 * The value of @inlen after return is the number of octets consumed
1510 * as the return value is positive, else unpredictiable.
1511 * The value of @outlen after return is the number of octets consumed.
1512 */
1513int
1514htmlEncodeEntities(unsigned char* out, int *outlen,
1515 const unsigned char* in, int *inlen, int quoteChar) {
1516 const unsigned char* processed = in;
1517 const unsigned char* outend = out + (*outlen);
1518 const unsigned char* outstart = out;
1519 const unsigned char* instart = in;
1520 const unsigned char* inend = in + (*inlen);
1521 unsigned int c, d;
1522 int trailing;
1523
1524 while (in < inend) {
1525 d = *in++;
1526 if (d < 0x80) { c= d; trailing= 0; }
1527 else if (d < 0xC0) {
1528 /* trailing byte in leading position */
1529 *outlen = out - outstart;
1530 *inlen = processed - instart;
1531 return(-2);
1532 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1533 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1534 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1535 else {
1536 /* no chance for this in Ascii */
1537 *outlen = out - outstart;
1538 *inlen = processed - instart;
1539 return(-2);
1540 }
1541
1542 if (inend - in < trailing)
1543 break;
1544
1545 while (trailing--) {
1546 if (((d= *in++) & 0xC0) != 0x80) {
1547 *outlen = out - outstart;
1548 *inlen = processed - instart;
1549 return(-2);
1550 }
1551 c <<= 6;
1552 c |= d & 0x3F;
1553 }
1554
1555 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001556 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1557 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001558 if (out >= outend)
1559 break;
1560 *out++ = c;
1561 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001562 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001563 const char *cp;
1564 char nbuf[16];
1565 int len;
1566
1567 /*
1568 * Try to lookup a predefined HTML entity for it
1569 */
1570 ent = htmlEntityValueLookup(c);
1571 if (ent == NULL) {
1572 sprintf(nbuf, "#%u", c);
1573 cp = nbuf;
1574 }
1575 else
1576 cp = ent->name;
1577 len = strlen(cp);
1578 if (out + 2 + len > outend)
1579 break;
1580 *out++ = '&';
1581 memcpy(out, cp, len);
1582 out += len;
1583 *out++ = ';';
1584 }
1585 processed = in;
1586 }
1587 *outlen = out - outstart;
1588 *inlen = processed - instart;
1589 return(0);
1590}
1591
1592/**
1593 * htmlDecodeEntities:
1594 * @ctxt: the parser context
1595 * @len: the len to decode (in bytes !), -1 for no size limit
1596 * @end: an end marker xmlChar, 0 if none
1597 * @end2: an end marker xmlChar, 0 if none
1598 * @end3: an end marker xmlChar, 0 if none
1599 *
1600 * Subtitute the HTML entities by their value
1601 *
1602 * DEPRECATED !!!!
1603 *
1604 * Returns A newly allocated string with the substitution done. The caller
1605 * must deallocate it !
1606 */
1607xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001608htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1609 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001610 static int deprecated = 0;
1611 if (!deprecated) {
1612 xmlGenericError(xmlGenericErrorContext,
1613 "htmlDecodeEntities() deprecated function reached\n");
1614 deprecated = 1;
1615 }
1616 return(NULL);
1617#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001618 xmlChar *name = NULL;
1619 xmlChar *buffer = NULL;
1620 unsigned int buffer_size = 0;
1621 unsigned int nbchars = 0;
1622 htmlEntityDescPtr ent;
1623 unsigned int max = (unsigned int) len;
1624 int c,l;
1625
1626 if (ctxt->depth > 40) {
1627 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1628 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1629 ctxt->sax->error(ctxt->userData,
1630 "Detected entity reference loop\n");
1631 ctxt->wellFormed = 0;
1632 ctxt->disableSAX = 1;
1633 return(NULL);
1634 }
1635
1636 /*
1637 * allocate a translation buffer.
1638 */
1639 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1640 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1641 if (buffer == NULL) {
1642 perror("xmlDecodeEntities: malloc failed");
1643 return(NULL);
1644 }
1645
1646 /*
1647 * Ok loop until we reach one of the ending char or a size limit.
1648 */
1649 c = CUR_CHAR(l);
1650 while ((nbchars < max) && (c != end) &&
1651 (c != end2) && (c != end3)) {
1652
1653 if (c == 0) break;
1654 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1655 int val = htmlParseCharRef(ctxt);
1656 COPY_BUF(0,buffer,nbchars,val);
1657 NEXTL(l);
1658 } else if ((c == '&') && (ctxt->token != '&')) {
1659 ent = htmlParseEntityRef(ctxt, &name);
1660 if (name != NULL) {
1661 if (ent != NULL) {
1662 int val = ent->value;
1663 COPY_BUF(0,buffer,nbchars,val);
1664 NEXTL(l);
1665 } else {
1666 const xmlChar *cur = name;
1667
1668 buffer[nbchars++] = '&';
1669 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1670 growBuffer(buffer);
1671 }
1672 while (*cur != 0) {
1673 buffer[nbchars++] = *cur++;
1674 }
1675 buffer[nbchars++] = ';';
1676 }
1677 }
1678 } else {
1679 COPY_BUF(l,buffer,nbchars,c);
1680 NEXTL(l);
1681 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1682 growBuffer(buffer);
1683 }
1684 }
1685 c = CUR_CHAR(l);
1686 }
1687 buffer[nbchars++] = 0;
1688 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001689#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001690}
1691
1692/************************************************************************
1693 * *
1694 * Commodity functions to handle streams *
1695 * *
1696 ************************************************************************/
1697
1698/**
Owen Taylor3473f882001-02-23 17:55:21 +00001699 * htmlNewInputStream:
1700 * @ctxt: an HTML parser context
1701 *
1702 * Create a new input stream structure
1703 * Returns the new input stream or NULL
1704 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001705static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001706htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1707 htmlParserInputPtr input;
1708
1709 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1710 if (input == NULL) {
1711 ctxt->errNo = XML_ERR_NO_MEMORY;
1712 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1713 ctxt->sax->error(ctxt->userData,
1714 "malloc: couldn't allocate a new input stream\n");
1715 return(NULL);
1716 }
1717 memset(input, 0, sizeof(htmlParserInput));
1718 input->filename = NULL;
1719 input->directory = NULL;
1720 input->base = NULL;
1721 input->cur = NULL;
1722 input->buf = NULL;
1723 input->line = 1;
1724 input->col = 1;
1725 input->buf = NULL;
1726 input->free = NULL;
1727 input->version = NULL;
1728 input->consumed = 0;
1729 input->length = 0;
1730 return(input);
1731}
1732
1733
1734/************************************************************************
1735 * *
1736 * Commodity functions, cleanup needed ? *
1737 * *
1738 ************************************************************************/
1739
1740/**
1741 * areBlanks:
1742 * @ctxt: an HTML parser context
1743 * @str: a xmlChar *
1744 * @len: the size of @str
1745 *
1746 * Is this a sequence of blank chars that one can ignore ?
1747 *
1748 * Returns 1 if ignorable 0 otherwise.
1749 */
1750
1751static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1752 int i;
1753 xmlNodePtr lastChild;
1754
1755 for (i = 0;i < len;i++)
1756 if (!(IS_BLANK(str[i]))) return(0);
1757
1758 if (CUR == 0) return(1);
1759 if (CUR != '<') return(0);
1760 if (ctxt->name == NULL)
1761 return(1);
1762 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1763 return(1);
1764 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1765 return(1);
1766 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1767 return(1);
1768 if (ctxt->node == NULL) return(0);
1769 lastChild = xmlGetLastChild(ctxt->node);
1770 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001771 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1772 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001773 } else if (xmlNodeIsText(lastChild)) {
1774 return(0);
1775 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1776 return(0);
1777 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1778 return(0);
1779 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1780 return(0);
1781 }
1782 return(1);
1783}
1784
1785/**
Owen Taylor3473f882001-02-23 17:55:21 +00001786 * htmlNewDocNoDtD:
1787 * @URI: URI for the dtd, or NULL
1788 * @ExternalID: the external ID of the DTD, or NULL
1789 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001790 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1791 * are NULL
1792 *
Owen Taylor3473f882001-02-23 17:55:21 +00001793 * Returns a new document, do not intialize the DTD if not provided
1794 */
1795htmlDocPtr
1796htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1797 xmlDocPtr cur;
1798
1799 /*
1800 * Allocate a new document and fill the fields.
1801 */
1802 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1803 if (cur == NULL) {
1804 xmlGenericError(xmlGenericErrorContext,
1805 "xmlNewDoc : malloc failed\n");
1806 return(NULL);
1807 }
1808 memset(cur, 0, sizeof(xmlDoc));
1809
1810 cur->type = XML_HTML_DOCUMENT_NODE;
1811 cur->version = NULL;
1812 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001813 cur->doc = cur;
1814 cur->name = NULL;
1815 cur->children = NULL;
1816 cur->extSubset = NULL;
1817 cur->oldNs = NULL;
1818 cur->encoding = NULL;
1819 cur->standalone = 1;
1820 cur->compression = 0;
1821 cur->ids = NULL;
1822 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001823 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001824 if ((ExternalID != NULL) ||
1825 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001826 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001827 return(cur);
1828}
1829
1830/**
1831 * htmlNewDoc:
1832 * @URI: URI for the dtd, or NULL
1833 * @ExternalID: the external ID of the DTD, or NULL
1834 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001835 * Creates a new HTML document
1836 *
Owen Taylor3473f882001-02-23 17:55:21 +00001837 * Returns a new document
1838 */
1839htmlDocPtr
1840htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1841 if ((URI == NULL) && (ExternalID == NULL))
1842 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001843 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1844 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001845
1846 return(htmlNewDocNoDtD(URI, ExternalID));
1847}
1848
1849
1850/************************************************************************
1851 * *
1852 * The parser itself *
1853 * Relates to http://www.w3.org/TR/html40 *
1854 * *
1855 ************************************************************************/
1856
1857/************************************************************************
1858 * *
1859 * The parser itself *
1860 * *
1861 ************************************************************************/
1862
1863/**
1864 * htmlParseHTMLName:
1865 * @ctxt: an HTML parser context
1866 *
1867 * parse an HTML tag or attribute name, note that we convert it to lowercase
1868 * since HTML names are not case-sensitive.
1869 *
1870 * Returns the Tag Name parsed or NULL
1871 */
1872
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001873static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001874htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1875 xmlChar *ret = NULL;
1876 int i = 0;
1877 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1878
1879 if (!IS_LETTER(CUR) && (CUR != '_') &&
1880 (CUR != ':')) return(NULL);
1881
1882 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1883 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1884 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1885 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1886 else loc[i] = CUR;
1887 i++;
1888
1889 NEXT;
1890 }
1891
1892 ret = xmlStrndup(loc, i);
1893
1894 return(ret);
1895}
1896
1897/**
1898 * htmlParseName:
1899 * @ctxt: an HTML parser context
1900 *
1901 * parse an HTML name, this routine is case sensistive.
1902 *
1903 * Returns the Name parsed or NULL
1904 */
1905
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001906static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001907htmlParseName(htmlParserCtxtPtr ctxt) {
1908 xmlChar buf[HTML_MAX_NAMELEN];
1909 int len = 0;
1910
1911 GROW;
1912 if (!IS_LETTER(CUR) && (CUR != '_')) {
1913 return(NULL);
1914 }
1915
1916 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1917 (CUR == '.') || (CUR == '-') ||
1918 (CUR == '_') || (CUR == ':') ||
1919 (IS_COMBINING(CUR)) ||
1920 (IS_EXTENDER(CUR))) {
1921 buf[len++] = CUR;
1922 NEXT;
1923 if (len >= HTML_MAX_NAMELEN) {
1924 xmlGenericError(xmlGenericErrorContext,
1925 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1926 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1927 (CUR == '.') || (CUR == '-') ||
1928 (CUR == '_') || (CUR == ':') ||
1929 (IS_COMBINING(CUR)) ||
1930 (IS_EXTENDER(CUR)))
1931 NEXT;
1932 break;
1933 }
1934 }
1935 return(xmlStrndup(buf, len));
1936}
1937
1938/**
1939 * htmlParseHTMLAttribute:
1940 * @ctxt: an HTML parser context
1941 * @stop: a char stop value
1942 *
1943 * parse an HTML attribute value till the stop (quote), if
1944 * stop is 0 then it stops at the first space
1945 *
1946 * Returns the attribute parsed or NULL
1947 */
1948
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001949static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001950htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1951 xmlChar *buffer = NULL;
1952 int buffer_size = 0;
1953 xmlChar *out = NULL;
1954 xmlChar *name = NULL;
1955
1956 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001957 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001958
1959 /*
1960 * allocate a translation buffer.
1961 */
1962 buffer_size = HTML_PARSER_BUFFER_SIZE;
1963 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1964 if (buffer == NULL) {
1965 perror("htmlParseHTMLAttribute: malloc failed");
1966 return(NULL);
1967 }
1968 out = buffer;
1969
1970 /*
1971 * Ok loop until we reach one of the ending chars
1972 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00001973 while ((CUR != 0) && (CUR != stop)) {
1974 if ((stop == 0) && (CUR == '>')) break;
Owen Taylor3473f882001-02-23 17:55:21 +00001975 if ((stop == 0) && (IS_BLANK(CUR))) break;
1976 if (CUR == '&') {
1977 if (NXT(1) == '#') {
1978 unsigned int c;
1979 int bits;
1980
1981 c = htmlParseCharRef(ctxt);
1982 if (c < 0x80)
1983 { *out++ = c; bits= -6; }
1984 else if (c < 0x800)
1985 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1986 else if (c < 0x10000)
1987 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1988 else
1989 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1990
1991 for ( ; bits >= 0; bits-= 6) {
1992 *out++ = ((c >> bits) & 0x3F) | 0x80;
1993 }
1994 } else {
1995 ent = htmlParseEntityRef(ctxt, &name);
1996 if (name == NULL) {
1997 *out++ = '&';
1998 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001999 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002000
2001 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002002 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002003 }
2004 } else if (ent == NULL) {
2005 *out++ = '&';
2006 cur = name;
2007 while (*cur != 0) {
2008 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002009 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002010
2011 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002012 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002013 }
2014 *out++ = *cur++;
2015 }
2016 xmlFree(name);
2017 } else {
2018 unsigned int c;
2019 int bits;
2020
2021 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002022 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002023
2024 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002025 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002026 }
2027 c = (xmlChar)ent->value;
2028 if (c < 0x80)
2029 { *out++ = c; bits= -6; }
2030 else if (c < 0x800)
2031 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2032 else if (c < 0x10000)
2033 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2034 else
2035 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2036
2037 for ( ; bits >= 0; bits-= 6) {
2038 *out++ = ((c >> bits) & 0x3F) | 0x80;
2039 }
2040 xmlFree(name);
2041 }
2042 }
2043 } else {
2044 unsigned int c;
2045 int bits, l;
2046
2047 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002048 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002049
2050 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002051 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002052 }
2053 c = CUR_CHAR(l);
2054 if (c < 0x80)
2055 { *out++ = c; bits= -6; }
2056 else if (c < 0x800)
2057 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2058 else if (c < 0x10000)
2059 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2060 else
2061 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2062
2063 for ( ; bits >= 0; bits-= 6) {
2064 *out++ = ((c >> bits) & 0x3F) | 0x80;
2065 }
2066 NEXT;
2067 }
2068 }
2069 *out++ = 0;
2070 return(buffer);
2071}
2072
2073/**
Owen Taylor3473f882001-02-23 17:55:21 +00002074 * htmlParseEntityRef:
2075 * @ctxt: an HTML parser context
2076 * @str: location to store the entity name
2077 *
2078 * parse an HTML ENTITY references
2079 *
2080 * [68] EntityRef ::= '&' Name ';'
2081 *
2082 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2083 * if non-NULL *str will have to be freed by the caller.
2084 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002085const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002086htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2087 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002088 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002089 *str = NULL;
2090
2091 if (CUR == '&') {
2092 NEXT;
2093 name = htmlParseName(ctxt);
2094 if (name == NULL) {
2095 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2096 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2097 ctxt->wellFormed = 0;
2098 } else {
2099 GROW;
2100 if (CUR == ';') {
2101 *str = name;
2102
2103 /*
2104 * Lookup the entity in the table.
2105 */
2106 ent = htmlEntityLookup(name);
2107 if (ent != NULL) /* OK that's ugly !!! */
2108 NEXT;
2109 } else {
2110 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2111 ctxt->sax->error(ctxt->userData,
2112 "htmlParseEntityRef: expecting ';'\n");
2113 *str = name;
2114 }
2115 }
2116 }
2117 return(ent);
2118}
2119
2120/**
2121 * htmlParseAttValue:
2122 * @ctxt: an HTML parser context
2123 *
2124 * parse a value for an attribute
2125 * Note: the parser won't do substitution of entities here, this
2126 * will be handled later in xmlStringGetNodeList, unless it was
2127 * asked for ctxt->replaceEntities != 0
2128 *
2129 * Returns the AttValue parsed or NULL.
2130 */
2131
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002132static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002133htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2134 xmlChar *ret = NULL;
2135
2136 if (CUR == '"') {
2137 NEXT;
2138 ret = htmlParseHTMLAttribute(ctxt, '"');
2139 if (CUR != '"') {
2140 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2141 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2142 ctxt->wellFormed = 0;
2143 } else
2144 NEXT;
2145 } else if (CUR == '\'') {
2146 NEXT;
2147 ret = htmlParseHTMLAttribute(ctxt, '\'');
2148 if (CUR != '\'') {
2149 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2150 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2151 ctxt->wellFormed = 0;
2152 } else
2153 NEXT;
2154 } else {
2155 /*
2156 * That's an HTMLism, the attribute value may not be quoted
2157 */
2158 ret = htmlParseHTMLAttribute(ctxt, 0);
2159 if (ret == NULL) {
2160 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2161 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2162 ctxt->wellFormed = 0;
2163 }
2164 }
2165 return(ret);
2166}
2167
2168/**
2169 * htmlParseSystemLiteral:
2170 * @ctxt: an HTML parser context
2171 *
2172 * parse an HTML Literal
2173 *
2174 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2175 *
2176 * Returns the SystemLiteral parsed or NULL
2177 */
2178
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002179static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002180htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2181 const xmlChar *q;
2182 xmlChar *ret = NULL;
2183
2184 if (CUR == '"') {
2185 NEXT;
2186 q = CUR_PTR;
2187 while ((IS_CHAR(CUR)) && (CUR != '"'))
2188 NEXT;
2189 if (!IS_CHAR(CUR)) {
2190 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2191 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2192 ctxt->wellFormed = 0;
2193 } else {
2194 ret = xmlStrndup(q, CUR_PTR - q);
2195 NEXT;
2196 }
2197 } else if (CUR == '\'') {
2198 NEXT;
2199 q = CUR_PTR;
2200 while ((IS_CHAR(CUR)) && (CUR != '\''))
2201 NEXT;
2202 if (!IS_CHAR(CUR)) {
2203 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2204 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2205 ctxt->wellFormed = 0;
2206 } else {
2207 ret = xmlStrndup(q, CUR_PTR - q);
2208 NEXT;
2209 }
2210 } else {
2211 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2212 ctxt->sax->error(ctxt->userData,
2213 "SystemLiteral \" or ' expected\n");
2214 ctxt->wellFormed = 0;
2215 }
2216
2217 return(ret);
2218}
2219
2220/**
2221 * htmlParsePubidLiteral:
2222 * @ctxt: an HTML parser context
2223 *
2224 * parse an HTML public literal
2225 *
2226 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2227 *
2228 * Returns the PubidLiteral parsed or NULL.
2229 */
2230
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002231static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002232htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2233 const xmlChar *q;
2234 xmlChar *ret = NULL;
2235 /*
2236 * Name ::= (Letter | '_') (NameChar)*
2237 */
2238 if (CUR == '"') {
2239 NEXT;
2240 q = CUR_PTR;
2241 while (IS_PUBIDCHAR(CUR)) NEXT;
2242 if (CUR != '"') {
2243 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2244 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2245 ctxt->wellFormed = 0;
2246 } else {
2247 ret = xmlStrndup(q, CUR_PTR - q);
2248 NEXT;
2249 }
2250 } else if (CUR == '\'') {
2251 NEXT;
2252 q = CUR_PTR;
2253 while ((IS_LETTER(CUR)) && (CUR != '\''))
2254 NEXT;
2255 if (!IS_LETTER(CUR)) {
2256 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2257 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2258 ctxt->wellFormed = 0;
2259 } else {
2260 ret = xmlStrndup(q, CUR_PTR - q);
2261 NEXT;
2262 }
2263 } else {
2264 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2265 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2266 ctxt->wellFormed = 0;
2267 }
2268
2269 return(ret);
2270}
2271
2272/**
2273 * htmlParseScript:
2274 * @ctxt: an HTML parser context
2275 *
2276 * parse the content of an HTML SCRIPT or STYLE element
2277 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2278 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2279 * http://www.w3.org/TR/html4/types.html#type-script
2280 * http://www.w3.org/TR/html4/types.html#h-6.15
2281 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2282 *
2283 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2284 * element and the value of intrinsic event attributes. User agents must
2285 * not evaluate script data as HTML markup but instead must pass it on as
2286 * data to a script engine.
2287 * NOTES:
2288 * - The content is passed like CDATA
2289 * - the attributes for style and scripting "onXXX" are also described
2290 * as CDATA but SGML allows entities references in attributes so their
2291 * processing is identical as other attributes
2292 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002293static void
Owen Taylor3473f882001-02-23 17:55:21 +00002294htmlParseScript(htmlParserCtxtPtr ctxt) {
2295 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2296 int nbchar = 0;
2297 xmlChar cur;
2298
2299 SHRINK;
2300 cur = CUR;
2301 while (IS_CHAR(cur)) {
2302 if ((cur == '<') && (NXT(1) == '/')) {
2303 /*
2304 * One should break here, the specification is clear:
2305 * Authors should therefore escape "</" within the content.
2306 * Escape mechanisms are specific to each scripting or
2307 * style sheet language.
2308 */
2309 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2310 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2311 break; /* while */
2312 }
2313 buf[nbchar++] = cur;
2314 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2315 if (ctxt->sax->cdataBlock!= NULL) {
2316 /*
2317 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2318 */
2319 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2320 }
2321 nbchar = 0;
2322 }
2323 NEXT;
2324 cur = CUR;
2325 }
2326 if (!(IS_CHAR(cur))) {
2327 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2328 ctxt->sax->error(ctxt->userData,
2329 "Invalid char in CDATA 0x%X\n", cur);
2330 ctxt->wellFormed = 0;
2331 NEXT;
2332 }
2333
2334 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2335 if (ctxt->sax->cdataBlock!= NULL) {
2336 /*
2337 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2338 */
2339 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2340 }
2341 }
2342}
2343
2344
2345/**
2346 * htmlParseCharData:
2347 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002348 *
2349 * parse a CharData section.
2350 * if we are within a CDATA section ']]>' marks an end of section.
2351 *
2352 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2353 */
2354
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002355static void
2356htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002357 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2358 int nbchar = 0;
2359 int cur, l;
2360
2361 SHRINK;
2362 cur = CUR_CHAR(l);
2363 while (((cur != '<') || (ctxt->token == '<')) &&
2364 ((cur != '&') || (ctxt->token == '&')) &&
2365 (IS_CHAR(cur))) {
2366 COPY_BUF(l,buf,nbchar,cur);
2367 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2368 /*
2369 * Ok the segment is to be consumed as chars.
2370 */
2371 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2372 if (areBlanks(ctxt, buf, nbchar)) {
2373 if (ctxt->sax->ignorableWhitespace != NULL)
2374 ctxt->sax->ignorableWhitespace(ctxt->userData,
2375 buf, nbchar);
2376 } else {
2377 htmlCheckParagraph(ctxt);
2378 if (ctxt->sax->characters != NULL)
2379 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2380 }
2381 }
2382 nbchar = 0;
2383 }
2384 NEXTL(l);
2385 cur = CUR_CHAR(l);
2386 }
2387 if (nbchar != 0) {
2388 /*
2389 * Ok the segment is to be consumed as chars.
2390 */
2391 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2392 if (areBlanks(ctxt, buf, nbchar)) {
2393 if (ctxt->sax->ignorableWhitespace != NULL)
2394 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2395 } else {
2396 htmlCheckParagraph(ctxt);
2397 if (ctxt->sax->characters != NULL)
2398 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2399 }
2400 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002401 } else {
2402 /*
2403 * Loop detection
2404 */
2405 if (cur == 0)
2406 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002407 }
2408}
2409
2410/**
2411 * htmlParseExternalID:
2412 * @ctxt: an HTML parser context
2413 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002414 *
2415 * Parse an External ID or a Public ID
2416 *
Owen Taylor3473f882001-02-23 17:55:21 +00002417 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2418 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2419 *
2420 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2421 *
2422 * Returns the function returns SystemLiteral and in the second
2423 * case publicID receives PubidLiteral, is strict is off
2424 * it is possible to return NULL and have publicID set.
2425 */
2426
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002427static xmlChar *
2428htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002429 xmlChar *URI = NULL;
2430
2431 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2432 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2433 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2434 SKIP(6);
2435 if (!IS_BLANK(CUR)) {
2436 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2437 ctxt->sax->error(ctxt->userData,
2438 "Space required after 'SYSTEM'\n");
2439 ctxt->wellFormed = 0;
2440 }
2441 SKIP_BLANKS;
2442 URI = htmlParseSystemLiteral(ctxt);
2443 if (URI == NULL) {
2444 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2445 ctxt->sax->error(ctxt->userData,
2446 "htmlParseExternalID: SYSTEM, no URI\n");
2447 ctxt->wellFormed = 0;
2448 }
2449 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2450 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2451 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2452 SKIP(6);
2453 if (!IS_BLANK(CUR)) {
2454 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2455 ctxt->sax->error(ctxt->userData,
2456 "Space required after 'PUBLIC'\n");
2457 ctxt->wellFormed = 0;
2458 }
2459 SKIP_BLANKS;
2460 *publicID = htmlParsePubidLiteral(ctxt);
2461 if (*publicID == NULL) {
2462 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2463 ctxt->sax->error(ctxt->userData,
2464 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2465 ctxt->wellFormed = 0;
2466 }
2467 SKIP_BLANKS;
2468 if ((CUR == '"') || (CUR == '\'')) {
2469 URI = htmlParseSystemLiteral(ctxt);
2470 }
2471 }
2472 return(URI);
2473}
2474
2475/**
2476 * htmlParseComment:
2477 * @ctxt: an HTML parser context
2478 *
2479 * Parse an XML (SGML) comment <!-- .... -->
2480 *
2481 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2482 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002483static void
Owen Taylor3473f882001-02-23 17:55:21 +00002484htmlParseComment(htmlParserCtxtPtr ctxt) {
2485 xmlChar *buf = NULL;
2486 int len;
2487 int size = HTML_PARSER_BUFFER_SIZE;
2488 int q, ql;
2489 int r, rl;
2490 int cur, l;
2491 xmlParserInputState state;
2492
2493 /*
2494 * Check that there is a comment right here.
2495 */
2496 if ((RAW != '<') || (NXT(1) != '!') ||
2497 (NXT(2) != '-') || (NXT(3) != '-')) return;
2498
2499 state = ctxt->instate;
2500 ctxt->instate = XML_PARSER_COMMENT;
2501 SHRINK;
2502 SKIP(4);
2503 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2504 if (buf == NULL) {
2505 xmlGenericError(xmlGenericErrorContext,
2506 "malloc of %d byte failed\n", size);
2507 ctxt->instate = state;
2508 return;
2509 }
2510 q = CUR_CHAR(ql);
2511 NEXTL(ql);
2512 r = CUR_CHAR(rl);
2513 NEXTL(rl);
2514 cur = CUR_CHAR(l);
2515 len = 0;
2516 while (IS_CHAR(cur) &&
2517 ((cur != '>') ||
2518 (r != '-') || (q != '-'))) {
2519 if (len + 5 >= size) {
2520 size *= 2;
2521 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2522 if (buf == NULL) {
2523 xmlGenericError(xmlGenericErrorContext,
2524 "realloc of %d byte failed\n", size);
2525 ctxt->instate = state;
2526 return;
2527 }
2528 }
2529 COPY_BUF(ql,buf,len,q);
2530 q = r;
2531 ql = rl;
2532 r = cur;
2533 rl = l;
2534 NEXTL(l);
2535 cur = CUR_CHAR(l);
2536 if (cur == 0) {
2537 SHRINK;
2538 GROW;
2539 cur = CUR_CHAR(l);
2540 }
2541 }
2542 buf[len] = 0;
2543 if (!IS_CHAR(cur)) {
2544 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2545 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2546 ctxt->sax->error(ctxt->userData,
2547 "Comment not terminated \n<!--%.50s\n", buf);
2548 ctxt->wellFormed = 0;
2549 xmlFree(buf);
2550 } else {
2551 NEXT;
2552 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2553 (!ctxt->disableSAX))
2554 ctxt->sax->comment(ctxt->userData, buf);
2555 xmlFree(buf);
2556 }
2557 ctxt->instate = state;
2558}
2559
2560/**
2561 * htmlParseCharRef:
2562 * @ctxt: an HTML parser context
2563 *
2564 * parse Reference declarations
2565 *
2566 * [66] CharRef ::= '&#' [0-9]+ ';' |
2567 * '&#x' [0-9a-fA-F]+ ';'
2568 *
2569 * Returns the value parsed (as an int)
2570 */
2571int
2572htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2573 int val = 0;
2574
2575 if ((CUR == '&') && (NXT(1) == '#') &&
2576 (NXT(2) == 'x')) {
2577 SKIP(3);
2578 while (CUR != ';') {
2579 if ((CUR >= '0') && (CUR <= '9'))
2580 val = val * 16 + (CUR - '0');
2581 else if ((CUR >= 'a') && (CUR <= 'f'))
2582 val = val * 16 + (CUR - 'a') + 10;
2583 else if ((CUR >= 'A') && (CUR <= 'F'))
2584 val = val * 16 + (CUR - 'A') + 10;
2585 else {
2586 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2587 ctxt->sax->error(ctxt->userData,
2588 "htmlParseCharRef: invalid hexadecimal value\n");
2589 ctxt->wellFormed = 0;
2590 return(0);
2591 }
2592 NEXT;
2593 }
2594 if (CUR == ';')
2595 NEXT;
2596 } else if ((CUR == '&') && (NXT(1) == '#')) {
2597 SKIP(2);
2598 while (CUR != ';') {
2599 if ((CUR >= '0') && (CUR <= '9'))
2600 val = val * 10 + (CUR - '0');
2601 else {
2602 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2603 ctxt->sax->error(ctxt->userData,
2604 "htmlParseCharRef: invalid decimal value\n");
2605 ctxt->wellFormed = 0;
2606 return(0);
2607 }
2608 NEXT;
2609 }
2610 if (CUR == ';')
2611 NEXT;
2612 } else {
2613 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2614 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2615 ctxt->wellFormed = 0;
2616 }
2617 /*
2618 * Check the value IS_CHAR ...
2619 */
2620 if (IS_CHAR(val)) {
2621 return(val);
2622 } else {
2623 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2624 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2625 val);
2626 ctxt->wellFormed = 0;
2627 }
2628 return(0);
2629}
2630
2631
2632/**
2633 * htmlParseDocTypeDecl :
2634 * @ctxt: an HTML parser context
2635 *
2636 * parse a DOCTYPE declaration
2637 *
2638 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2639 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2640 */
2641
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002642static void
Owen Taylor3473f882001-02-23 17:55:21 +00002643htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2644 xmlChar *name;
2645 xmlChar *ExternalID = NULL;
2646 xmlChar *URI = NULL;
2647
2648 /*
2649 * We know that '<!DOCTYPE' has been detected.
2650 */
2651 SKIP(9);
2652
2653 SKIP_BLANKS;
2654
2655 /*
2656 * Parse the DOCTYPE name.
2657 */
2658 name = htmlParseName(ctxt);
2659 if (name == NULL) {
2660 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2661 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2662 ctxt->wellFormed = 0;
2663 }
2664 /*
2665 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2666 */
2667
2668 SKIP_BLANKS;
2669
2670 /*
2671 * Check for SystemID and ExternalID
2672 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002673 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002674 SKIP_BLANKS;
2675
2676 /*
2677 * We should be at the end of the DOCTYPE declaration.
2678 */
2679 if (CUR != '>') {
2680 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002681 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002682 ctxt->wellFormed = 0;
2683 /* We shouldn't try to resynchronize ... */
2684 }
2685 NEXT;
2686
2687 /*
2688 * Create or update the document accordingly to the DOCTYPE
2689 */
2690 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2691 (!ctxt->disableSAX))
2692 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2693
2694 /*
2695 * Cleanup, since we don't use all those identifiers
2696 */
2697 if (URI != NULL) xmlFree(URI);
2698 if (ExternalID != NULL) xmlFree(ExternalID);
2699 if (name != NULL) xmlFree(name);
2700}
2701
2702/**
2703 * htmlParseAttribute:
2704 * @ctxt: an HTML parser context
2705 * @value: a xmlChar ** used to store the value of the attribute
2706 *
2707 * parse an attribute
2708 *
2709 * [41] Attribute ::= Name Eq AttValue
2710 *
2711 * [25] Eq ::= S? '=' S?
2712 *
2713 * With namespace:
2714 *
2715 * [NS 11] Attribute ::= QName Eq AttValue
2716 *
2717 * Also the case QName == xmlns:??? is handled independently as a namespace
2718 * definition.
2719 *
2720 * Returns the attribute name, and the value in *value.
2721 */
2722
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002723static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002724htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2725 xmlChar *name, *val = NULL;
2726
2727 *value = NULL;
2728 name = htmlParseHTMLName(ctxt);
2729 if (name == NULL) {
2730 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2731 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2732 ctxt->wellFormed = 0;
2733 return(NULL);
2734 }
2735
2736 /*
2737 * read the value
2738 */
2739 SKIP_BLANKS;
2740 if (CUR == '=') {
2741 NEXT;
2742 SKIP_BLANKS;
2743 val = htmlParseAttValue(ctxt);
2744 /******
2745 } else {
2746 * TODO : some attribute must have values, some may not
2747 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2748 ctxt->sax->warning(ctxt->userData,
2749 "No value for attribute %s\n", name); */
2750 }
2751
2752 *value = val;
2753 return(name);
2754}
2755
2756/**
2757 * htmlCheckEncoding:
2758 * @ctxt: an HTML parser context
2759 * @attvalue: the attribute value
2760 *
2761 * Checks an http-equiv attribute from a Meta tag to detect
2762 * the encoding
2763 * If a new encoding is detected the parser is switched to decode
2764 * it and pass UTF8
2765 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002766static void
Owen Taylor3473f882001-02-23 17:55:21 +00002767htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2768 const xmlChar *encoding;
2769
2770 if ((ctxt == NULL) || (attvalue == NULL))
2771 return;
2772
2773 /* do not change encoding */
2774 if (ctxt->input->encoding != NULL)
2775 return;
2776
2777 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2778 if (encoding != NULL) {
2779 encoding += 8;
2780 } else {
2781 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2782 if (encoding != NULL)
2783 encoding += 9;
2784 }
2785 if (encoding != NULL) {
2786 xmlCharEncoding enc;
2787 xmlCharEncodingHandlerPtr handler;
2788
2789 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2790
2791 if (ctxt->input->encoding != NULL)
2792 xmlFree((xmlChar *) ctxt->input->encoding);
2793 ctxt->input->encoding = xmlStrdup(encoding);
2794
2795 enc = xmlParseCharEncoding((const char *) encoding);
2796 /*
2797 * registered set of known encodings
2798 */
2799 if (enc != XML_CHAR_ENCODING_ERROR) {
2800 xmlSwitchEncoding(ctxt, enc);
2801 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2802 } else {
2803 /*
2804 * fallback for unknown encodings
2805 */
2806 handler = xmlFindCharEncodingHandler((const char *) encoding);
2807 if (handler != NULL) {
2808 xmlSwitchToEncoding(ctxt, handler);
2809 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2810 } else {
2811 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2812 }
2813 }
2814
2815 if ((ctxt->input->buf != NULL) &&
2816 (ctxt->input->buf->encoder != NULL) &&
2817 (ctxt->input->buf->raw != NULL) &&
2818 (ctxt->input->buf->buffer != NULL)) {
2819 int nbchars;
2820 int processed;
2821
2822 /*
2823 * convert as much as possible to the parser reading buffer.
2824 */
2825 processed = ctxt->input->cur - ctxt->input->base;
2826 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2827 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2828 ctxt->input->buf->buffer,
2829 ctxt->input->buf->raw);
2830 if (nbchars < 0) {
2831 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2832 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2833 ctxt->sax->error(ctxt->userData,
2834 "htmlCheckEncoding: encoder error\n");
2835 }
2836 ctxt->input->base =
2837 ctxt->input->cur = ctxt->input->buf->buffer->content;
2838 }
2839 }
2840}
2841
2842/**
2843 * htmlCheckMeta:
2844 * @ctxt: an HTML parser context
2845 * @atts: the attributes values
2846 *
2847 * Checks an attributes from a Meta tag
2848 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002849static void
Owen Taylor3473f882001-02-23 17:55:21 +00002850htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2851 int i;
2852 const xmlChar *att, *value;
2853 int http = 0;
2854 const xmlChar *content = NULL;
2855
2856 if ((ctxt == NULL) || (atts == NULL))
2857 return;
2858
2859 i = 0;
2860 att = atts[i++];
2861 while (att != NULL) {
2862 value = atts[i++];
2863 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2864 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2865 http = 1;
2866 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2867 content = value;
2868 att = atts[i++];
2869 }
2870 if ((http) && (content != NULL))
2871 htmlCheckEncoding(ctxt, content);
2872
2873}
2874
2875/**
2876 * htmlParseStartTag:
2877 * @ctxt: an HTML parser context
2878 *
2879 * parse a start of tag either for rule element or
2880 * EmptyElement. In both case we don't parse the tag closing chars.
2881 *
2882 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2883 *
2884 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2885 *
2886 * With namespace:
2887 *
2888 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2889 *
2890 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2891 *
2892 */
2893
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002894static void
Owen Taylor3473f882001-02-23 17:55:21 +00002895htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2896 xmlChar *name;
2897 xmlChar *attname;
2898 xmlChar *attvalue;
2899 const xmlChar **atts = NULL;
2900 int nbatts = 0;
2901 int maxatts = 0;
2902 int meta = 0;
2903 int i;
2904
2905 if (CUR != '<') return;
2906 NEXT;
2907
2908 GROW;
2909 name = htmlParseHTMLName(ctxt);
2910 if (name == NULL) {
2911 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2912 ctxt->sax->error(ctxt->userData,
2913 "htmlParseStartTag: invalid element name\n");
2914 ctxt->wellFormed = 0;
2915 /* Dump the bogus tag like browsers do */
2916 while ((IS_CHAR(CUR)) && (CUR != '>'))
2917 NEXT;
2918 return;
2919 }
2920 if (xmlStrEqual(name, BAD_CAST"meta"))
2921 meta = 1;
2922
2923 /*
2924 * Check for auto-closure of HTML elements.
2925 */
2926 htmlAutoClose(ctxt, name);
2927
2928 /*
2929 * Check for implied HTML elements.
2930 */
2931 htmlCheckImplied(ctxt, name);
2932
2933 /*
2934 * Avoid html at any level > 0, head at any level != 1
2935 * or any attempt to recurse body
2936 */
2937 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2938 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2939 ctxt->sax->error(ctxt->userData,
2940 "htmlParseStartTag: misplaced <html> tag\n");
2941 ctxt->wellFormed = 0;
2942 xmlFree(name);
2943 return;
2944 }
2945 if ((ctxt->nameNr != 1) &&
2946 (xmlStrEqual(name, BAD_CAST"head"))) {
2947 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2948 ctxt->sax->error(ctxt->userData,
2949 "htmlParseStartTag: misplaced <head> tag\n");
2950 ctxt->wellFormed = 0;
2951 xmlFree(name);
2952 return;
2953 }
2954 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002955 int indx;
2956 for (indx = 0;indx < ctxt->nameNr;indx++) {
2957 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002958 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2959 ctxt->sax->error(ctxt->userData,
2960 "htmlParseStartTag: misplaced <body> tag\n");
2961 ctxt->wellFormed = 0;
2962 xmlFree(name);
2963 return;
2964 }
2965 }
2966 }
2967
2968 /*
2969 * Now parse the attributes, it ends up with the ending
2970 *
2971 * (S Attribute)* S?
2972 */
2973 SKIP_BLANKS;
2974 while ((IS_CHAR(CUR)) &&
2975 (CUR != '>') &&
2976 ((CUR != '/') || (NXT(1) != '>'))) {
2977 long cons = ctxt->nbChars;
2978
2979 GROW;
2980 attname = htmlParseAttribute(ctxt, &attvalue);
2981 if (attname != NULL) {
2982
2983 /*
2984 * Well formedness requires at most one declaration of an attribute
2985 */
2986 for (i = 0; i < nbatts;i += 2) {
2987 if (xmlStrEqual(atts[i], attname)) {
2988 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2989 ctxt->sax->error(ctxt->userData,
2990 "Attribute %s redefined\n",
2991 attname);
2992 ctxt->wellFormed = 0;
2993 xmlFree(attname);
2994 if (attvalue != NULL)
2995 xmlFree(attvalue);
2996 goto failed;
2997 }
2998 }
2999
3000 /*
3001 * Add the pair to atts
3002 */
3003 if (atts == NULL) {
3004 maxatts = 10;
3005 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3006 if (atts == NULL) {
3007 xmlGenericError(xmlGenericErrorContext,
3008 "malloc of %ld byte failed\n",
3009 maxatts * (long)sizeof(xmlChar *));
3010 if (name != NULL) xmlFree(name);
3011 return;
3012 }
3013 } else if (nbatts + 4 > maxatts) {
3014 maxatts *= 2;
3015 atts = (const xmlChar **) xmlRealloc((void *) atts,
3016 maxatts * sizeof(xmlChar *));
3017 if (atts == NULL) {
3018 xmlGenericError(xmlGenericErrorContext,
3019 "realloc of %ld byte failed\n",
3020 maxatts * (long)sizeof(xmlChar *));
3021 if (name != NULL) xmlFree(name);
3022 return;
3023 }
3024 }
3025 atts[nbatts++] = attname;
3026 atts[nbatts++] = attvalue;
3027 atts[nbatts] = NULL;
3028 atts[nbatts + 1] = NULL;
3029 }
3030 else {
3031 /* Dump the bogus attribute string up to the next blank or
3032 * the end of the tag. */
3033 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3034 && ((CUR != '/') || (NXT(1) != '>')))
3035 NEXT;
3036 }
3037
3038failed:
3039 SKIP_BLANKS;
3040 if (cons == ctxt->nbChars) {
3041 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3042 ctxt->sax->error(ctxt->userData,
3043 "htmlParseStartTag: problem parsing attributes\n");
3044 ctxt->wellFormed = 0;
3045 break;
3046 }
3047 }
3048
3049 /*
3050 * Handle specific association to the META tag
3051 */
3052 if (meta)
3053 htmlCheckMeta(ctxt, atts);
3054
3055 /*
3056 * SAX: Start of Element !
3057 */
3058 htmlnamePush(ctxt, xmlStrdup(name));
3059#ifdef DEBUG
3060 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3061#endif
3062 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3063 ctxt->sax->startElement(ctxt->userData, name, atts);
3064
3065 if (atts != NULL) {
3066 for (i = 0;i < nbatts;i++) {
3067 if (atts[i] != NULL)
3068 xmlFree((xmlChar *) atts[i]);
3069 }
3070 xmlFree((void *) atts);
3071 }
3072 if (name != NULL) xmlFree(name);
3073}
3074
3075/**
3076 * htmlParseEndTag:
3077 * @ctxt: an HTML parser context
3078 *
3079 * parse an end of tag
3080 *
3081 * [42] ETag ::= '</' Name S? '>'
3082 *
3083 * With namespace
3084 *
3085 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003086 *
3087 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003088 */
3089
Daniel Veillardf420ac52001-07-04 16:04:09 +00003090static int
Owen Taylor3473f882001-02-23 17:55:21 +00003091htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3092 xmlChar *name;
3093 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003094 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003095
3096 if ((CUR != '<') || (NXT(1) != '/')) {
3097 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3098 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3099 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003100 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003101 }
3102 SKIP(2);
3103
3104 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003105 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003106
3107 /*
3108 * We should definitely be at the ending "S? '>'" part
3109 */
3110 SKIP_BLANKS;
3111 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3112 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3113 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3114 ctxt->wellFormed = 0;
3115 } else
3116 NEXT;
3117
3118 /*
3119 * If the name read is not one of the element in the parsing stack
3120 * then return, it's just an error.
3121 */
3122 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3123 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3124 }
3125 if (i < 0) {
3126 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3127 ctxt->sax->error(ctxt->userData,
3128 "Unexpected end tag : %s\n", name);
3129 xmlFree(name);
3130 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003131 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003132 }
3133
3134
3135 /*
3136 * Check for auto-closure of HTML elements.
3137 */
3138
3139 htmlAutoCloseOnClose(ctxt, name);
3140
3141 /*
3142 * Well formedness constraints, opening and closing must match.
3143 * With the exception that the autoclose may have popped stuff out
3144 * of the stack.
3145 */
3146 if (!xmlStrEqual(name, ctxt->name)) {
3147#ifdef DEBUG
3148 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3149#endif
3150 if ((ctxt->name != NULL) &&
3151 (!xmlStrEqual(ctxt->name, name))) {
3152 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3153 ctxt->sax->error(ctxt->userData,
3154 "Opening and ending tag mismatch: %s and %s\n",
3155 name, ctxt->name);
3156 ctxt->wellFormed = 0;
3157 }
3158 }
3159
3160 /*
3161 * SAX: End of Tag
3162 */
3163 oldname = ctxt->name;
3164 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3165 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3166 ctxt->sax->endElement(ctxt->userData, name);
3167 oldname = htmlnamePop(ctxt);
3168 if (oldname != NULL) {
3169#ifdef DEBUG
3170 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3171#endif
3172 xmlFree(oldname);
3173#ifdef DEBUG
3174 } else {
3175 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3176#endif
3177 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003178 ret = 1;
3179 } else {
3180 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003181 }
3182
3183 if (name != NULL)
3184 xmlFree(name);
3185
Daniel Veillardf420ac52001-07-04 16:04:09 +00003186 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003187}
3188
3189
3190/**
3191 * htmlParseReference:
3192 * @ctxt: an HTML parser context
3193 *
3194 * parse and handle entity references in content,
3195 * this will end-up in a call to character() since this is either a
3196 * CharRef, or a predefined entity.
3197 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003198static void
Owen Taylor3473f882001-02-23 17:55:21 +00003199htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003200 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003201 xmlChar out[6];
3202 xmlChar *name;
3203 if (CUR != '&') return;
3204
3205 if (NXT(1) == '#') {
3206 unsigned int c;
3207 int bits, i = 0;
3208
3209 c = htmlParseCharRef(ctxt);
3210 if (c == 0)
3211 return;
3212
3213 if (c < 0x80) { out[i++]= c; bits= -6; }
3214 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3215 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3216 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3217
3218 for ( ; bits >= 0; bits-= 6) {
3219 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3220 }
3221 out[i] = 0;
3222
3223 htmlCheckParagraph(ctxt);
3224 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3225 ctxt->sax->characters(ctxt->userData, out, i);
3226 } else {
3227 ent = htmlParseEntityRef(ctxt, &name);
3228 if (name == NULL) {
3229 htmlCheckParagraph(ctxt);
3230 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3231 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3232 return;
3233 }
3234 if ((ent == NULL) || (ent->value <= 0)) {
3235 htmlCheckParagraph(ctxt);
3236 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3237 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3238 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3239 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3240 }
3241 } else {
3242 unsigned int c;
3243 int bits, i = 0;
3244
3245 c = ent->value;
3246 if (c < 0x80)
3247 { out[i++]= c; bits= -6; }
3248 else if (c < 0x800)
3249 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3250 else if (c < 0x10000)
3251 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3252 else
3253 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3254
3255 for ( ; bits >= 0; bits-= 6) {
3256 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3257 }
3258 out[i] = 0;
3259
3260 htmlCheckParagraph(ctxt);
3261 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3262 ctxt->sax->characters(ctxt->userData, out, i);
3263 }
3264 xmlFree(name);
3265 }
3266}
3267
3268/**
3269 * htmlParseContent:
3270 * @ctxt: an HTML parser context
3271 * @name: the node name
3272 *
3273 * Parse a content: comment, sub-element, reference or text.
3274 *
3275 */
3276
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003277static void
Owen Taylor3473f882001-02-23 17:55:21 +00003278htmlParseContent(htmlParserCtxtPtr ctxt) {
3279 xmlChar *currentNode;
3280 int depth;
3281
3282 currentNode = xmlStrdup(ctxt->name);
3283 depth = ctxt->nameNr;
3284 while (1) {
3285 long cons = ctxt->nbChars;
3286
3287 GROW;
3288 /*
3289 * Our tag or one of it's parent or children is ending.
3290 */
3291 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003292 if (htmlParseEndTag(ctxt) &&
3293 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3294 if (currentNode != NULL)
3295 xmlFree(currentNode);
3296 return;
3297 }
3298 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003299 }
3300
3301 /*
3302 * Has this node been popped out during parsing of
3303 * the next element
3304 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003305 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3306 (!xmlStrEqual(currentNode, ctxt->name)))
3307 {
Owen Taylor3473f882001-02-23 17:55:21 +00003308 if (currentNode != NULL) xmlFree(currentNode);
3309 return;
3310 }
3311
Daniel Veillardf9533d12001-03-03 10:04:57 +00003312 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3313 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003314 /*
3315 * Handle SCRIPT/STYLE separately
3316 */
3317 htmlParseScript(ctxt);
3318 } else {
3319 /*
3320 * Sometimes DOCTYPE arrives in the middle of the document
3321 */
3322 if ((CUR == '<') && (NXT(1) == '!') &&
3323 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3324 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3325 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3326 (UPP(8) == 'E')) {
3327 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3328 ctxt->sax->error(ctxt->userData,
3329 "Misplaced DOCTYPE declaration\n");
3330 ctxt->wellFormed = 0;
3331 htmlParseDocTypeDecl(ctxt);
3332 }
3333
3334 /*
3335 * First case : a comment
3336 */
3337 if ((CUR == '<') && (NXT(1) == '!') &&
3338 (NXT(2) == '-') && (NXT(3) == '-')) {
3339 htmlParseComment(ctxt);
3340 }
3341
3342 /*
3343 * Second case : a sub-element.
3344 */
3345 else if (CUR == '<') {
3346 htmlParseElement(ctxt);
3347 }
3348
3349 /*
3350 * Third case : a reference. If if has not been resolved,
3351 * parsing returns it's Name, create the node
3352 */
3353 else if (CUR == '&') {
3354 htmlParseReference(ctxt);
3355 }
3356
3357 /*
3358 * Fourth : end of the resource
3359 */
3360 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003361 htmlAutoCloseOnEnd(ctxt);
3362 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003363 }
3364
3365 /*
3366 * Last case, text. Note that References are handled directly.
3367 */
3368 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003369 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003370 }
3371
3372 if (cons == ctxt->nbChars) {
3373 if (ctxt->node != NULL) {
3374 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3375 ctxt->sax->error(ctxt->userData,
3376 "detected an error in element content\n");
3377 ctxt->wellFormed = 0;
3378 }
3379 break;
3380 }
3381 }
3382 GROW;
3383 }
3384 if (currentNode != NULL) xmlFree(currentNode);
3385}
3386
3387/**
3388 * htmlParseElement:
3389 * @ctxt: an HTML parser context
3390 *
3391 * parse an HTML element, this is highly recursive
3392 *
3393 * [39] element ::= EmptyElemTag | STag content ETag
3394 *
3395 * [41] Attribute ::= Name Eq AttValue
3396 */
3397
3398void
3399htmlParseElement(htmlParserCtxtPtr ctxt) {
3400 xmlChar *name;
3401 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003402 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003403 htmlParserNodeInfo node_info;
3404 xmlChar *oldname;
3405 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003406 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003407
3408 /* Capture start position */
3409 if (ctxt->record_info) {
3410 node_info.begin_pos = ctxt->input->consumed +
3411 (CUR_PTR - ctxt->input->base);
3412 node_info.begin_line = ctxt->input->line;
3413 }
3414
3415 oldname = xmlStrdup(ctxt->name);
3416 htmlParseStartTag(ctxt);
3417 name = ctxt->name;
3418#ifdef DEBUG
3419 if (oldname == NULL)
3420 xmlGenericError(xmlGenericErrorContext,
3421 "Start of element %s\n", name);
3422 else if (name == NULL)
3423 xmlGenericError(xmlGenericErrorContext,
3424 "Start of element failed, was %s\n", oldname);
3425 else
3426 xmlGenericError(xmlGenericErrorContext,
3427 "Start of element %s, was %s\n", name, oldname);
3428#endif
3429 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3430 (name == NULL)) {
3431 if (CUR == '>')
3432 NEXT;
3433 if (oldname != NULL)
3434 xmlFree(oldname);
3435 return;
3436 }
3437 if (oldname != NULL)
3438 xmlFree(oldname);
3439
3440 /*
3441 * Lookup the info for that element.
3442 */
3443 info = htmlTagLookup(name);
3444 if (info == NULL) {
3445 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3446 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3447 name);
3448 ctxt->wellFormed = 0;
3449 } else if (info->depr) {
3450/***************************
3451 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3452 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3453 name);
3454 ***************************/
3455 }
3456
3457 /*
3458 * Check for an Empty Element labelled the XML/SGML way
3459 */
3460 if ((CUR == '/') && (NXT(1) == '>')) {
3461 SKIP(2);
3462 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3463 ctxt->sax->endElement(ctxt->userData, name);
3464 oldname = htmlnamePop(ctxt);
3465#ifdef DEBUG
3466 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3467#endif
3468 if (oldname != NULL)
3469 xmlFree(oldname);
3470 return;
3471 }
3472
3473 if (CUR == '>') {
3474 NEXT;
3475 } else {
3476 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3477 ctxt->sax->error(ctxt->userData,
3478 "Couldn't find end of Start Tag %s\n",
3479 name);
3480 ctxt->wellFormed = 0;
3481
3482 /*
3483 * end of parsing of this node.
3484 */
3485 if (xmlStrEqual(name, ctxt->name)) {
3486 nodePop(ctxt);
3487 oldname = htmlnamePop(ctxt);
3488#ifdef DEBUG
3489 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3490#endif
3491 if (oldname != NULL)
3492 xmlFree(oldname);
3493 }
3494
3495 /*
3496 * Capture end position and add node
3497 */
3498 if ( currentNode != NULL && ctxt->record_info ) {
3499 node_info.end_pos = ctxt->input->consumed +
3500 (CUR_PTR - ctxt->input->base);
3501 node_info.end_line = ctxt->input->line;
3502 node_info.node = ctxt->node;
3503 xmlParserAddNodeInfo(ctxt, &node_info);
3504 }
3505 return;
3506 }
3507
3508 /*
3509 * Check for an Empty Element from DTD definition
3510 */
3511 if ((info != NULL) && (info->empty)) {
3512 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3513 ctxt->sax->endElement(ctxt->userData, name);
3514 oldname = htmlnamePop(ctxt);
3515#ifdef DEBUG
3516 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3517#endif
3518 if (oldname != NULL)
3519 xmlFree(oldname);
3520 return;
3521 }
3522
3523 /*
3524 * Parse the content of the element:
3525 */
3526 currentNode = xmlStrdup(ctxt->name);
3527 depth = ctxt->nameNr;
3528 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003529 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003530 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003531 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003532 if (ctxt->nameNr < depth) break;
3533 }
3534
Owen Taylor3473f882001-02-23 17:55:21 +00003535 /*
3536 * Capture end position and add node
3537 */
3538 if ( currentNode != NULL && ctxt->record_info ) {
3539 node_info.end_pos = ctxt->input->consumed +
3540 (CUR_PTR - ctxt->input->base);
3541 node_info.end_line = ctxt->input->line;
3542 node_info.node = ctxt->node;
3543 xmlParserAddNodeInfo(ctxt, &node_info);
3544 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003545 if (!IS_CHAR(CUR)) {
3546 htmlAutoCloseOnEnd(ctxt);
3547 }
3548
Owen Taylor3473f882001-02-23 17:55:21 +00003549 if (currentNode != NULL)
3550 xmlFree(currentNode);
3551}
3552
3553/**
3554 * htmlParseDocument :
3555 * @ctxt: an HTML parser context
3556 *
3557 * parse an HTML document (and build a tree if using the standard SAX
3558 * interface).
3559 *
3560 * Returns 0, -1 in case of error. the parser context is augmented
3561 * as a result of the parsing.
3562 */
3563
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003564static int
Owen Taylor3473f882001-02-23 17:55:21 +00003565htmlParseDocument(htmlParserCtxtPtr ctxt) {
3566 xmlDtdPtr dtd;
3567
Daniel Veillardd0463562001-10-13 09:15:48 +00003568 xmlInitParser();
3569
Owen Taylor3473f882001-02-23 17:55:21 +00003570 htmlDefaultSAXHandlerInit();
3571 ctxt->html = 1;
3572
3573 GROW;
3574 /*
3575 * SAX: beginning of the document processing.
3576 */
3577 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3578 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3579
3580 /*
3581 * Wipe out everything which is before the first '<'
3582 */
3583 SKIP_BLANKS;
3584 if (CUR == 0) {
3585 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3586 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3587 ctxt->wellFormed = 0;
3588 }
3589
3590 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3591 ctxt->sax->startDocument(ctxt->userData);
3592
3593
3594 /*
3595 * Parse possible comments before any content
3596 */
3597 while ((CUR == '<') && (NXT(1) == '!') &&
3598 (NXT(2) == '-') && (NXT(3) == '-')) {
3599 htmlParseComment(ctxt);
3600 SKIP_BLANKS;
3601 }
3602
3603
3604 /*
3605 * Then possibly doc type declaration(s) and more Misc
3606 * (doctypedecl Misc*)?
3607 */
3608 if ((CUR == '<') && (NXT(1) == '!') &&
3609 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3610 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3611 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3612 (UPP(8) == 'E')) {
3613 htmlParseDocTypeDecl(ctxt);
3614 }
3615 SKIP_BLANKS;
3616
3617 /*
3618 * Parse possible comments before any content
3619 */
3620 while ((CUR == '<') && (NXT(1) == '!') &&
3621 (NXT(2) == '-') && (NXT(3) == '-')) {
3622 htmlParseComment(ctxt);
3623 SKIP_BLANKS;
3624 }
3625
3626 /*
3627 * Time to start parsing the tree itself
3628 */
3629 htmlParseContent(ctxt);
3630
3631 /*
3632 * autoclose
3633 */
3634 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003635 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003636
3637
3638 /*
3639 * SAX: end of the document processing.
3640 */
3641 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3642 ctxt->sax->endDocument(ctxt->userData);
3643
3644 if (ctxt->myDoc != NULL) {
3645 dtd = xmlGetIntSubset(ctxt->myDoc);
3646 if (dtd == NULL)
3647 ctxt->myDoc->intSubset =
3648 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3649 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3650 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3651 }
3652 if (! ctxt->wellFormed) return(-1);
3653 return(0);
3654}
3655
3656
3657/************************************************************************
3658 * *
3659 * Parser contexts handling *
3660 * *
3661 ************************************************************************/
3662
3663/**
3664 * htmlInitParserCtxt:
3665 * @ctxt: an HTML parser context
3666 *
3667 * Initialize a parser context
3668 */
3669
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003670static void
Owen Taylor3473f882001-02-23 17:55:21 +00003671htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3672{
3673 htmlSAXHandler *sax;
3674
3675 if (ctxt == NULL) return;
3676 memset(ctxt, 0, sizeof(htmlParserCtxt));
3677
3678 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3679 if (sax == NULL) {
3680 xmlGenericError(xmlGenericErrorContext,
3681 "htmlInitParserCtxt: out of memory\n");
3682 }
3683 else
3684 memset(sax, 0, sizeof(htmlSAXHandler));
3685
3686 /* Allocate the Input stack */
3687 ctxt->inputTab = (htmlParserInputPtr *)
3688 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3689 if (ctxt->inputTab == NULL) {
3690 xmlGenericError(xmlGenericErrorContext,
3691 "htmlInitParserCtxt: out of memory\n");
3692 ctxt->inputNr = 0;
3693 ctxt->inputMax = 0;
3694 ctxt->input = NULL;
3695 return;
3696 }
3697 ctxt->inputNr = 0;
3698 ctxt->inputMax = 5;
3699 ctxt->input = NULL;
3700 ctxt->version = NULL;
3701 ctxt->encoding = NULL;
3702 ctxt->standalone = -1;
3703 ctxt->instate = XML_PARSER_START;
3704
3705 /* Allocate the Node stack */
3706 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3707 if (ctxt->nodeTab == NULL) {
3708 xmlGenericError(xmlGenericErrorContext,
3709 "htmlInitParserCtxt: out of memory\n");
3710 ctxt->nodeNr = 0;
3711 ctxt->nodeMax = 0;
3712 ctxt->node = NULL;
3713 ctxt->inputNr = 0;
3714 ctxt->inputMax = 0;
3715 ctxt->input = NULL;
3716 return;
3717 }
3718 ctxt->nodeNr = 0;
3719 ctxt->nodeMax = 10;
3720 ctxt->node = NULL;
3721
3722 /* Allocate the Name stack */
3723 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3724 if (ctxt->nameTab == NULL) {
3725 xmlGenericError(xmlGenericErrorContext,
3726 "htmlInitParserCtxt: out of memory\n");
3727 ctxt->nameNr = 0;
3728 ctxt->nameMax = 10;
3729 ctxt->name = NULL;
3730 ctxt->nodeNr = 0;
3731 ctxt->nodeMax = 0;
3732 ctxt->node = NULL;
3733 ctxt->inputNr = 0;
3734 ctxt->inputMax = 0;
3735 ctxt->input = NULL;
3736 return;
3737 }
3738 ctxt->nameNr = 0;
3739 ctxt->nameMax = 10;
3740 ctxt->name = NULL;
3741
3742 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3743 else {
3744 ctxt->sax = sax;
3745 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3746 }
3747 ctxt->userData = ctxt;
3748 ctxt->myDoc = NULL;
3749 ctxt->wellFormed = 1;
3750 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003751 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003752 ctxt->html = 1;
3753 ctxt->record_info = 0;
3754 ctxt->validate = 0;
3755 ctxt->nbChars = 0;
3756 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003757 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003758 xmlInitNodeInfoSeq(&ctxt->node_seq);
3759}
3760
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a thin wrapper: the work is entirely delegated to
 * xmlFreeParserCtxt().
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
3774
3775/**
3776 * htmlCreateDocParserCtxt :
3777 * @cur: a pointer to an array of xmlChar
3778 * @encoding: a free form C string describing the HTML document encoding, or NULL
3779 *
3780 * Create a parser context for an HTML document.
3781 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003782 * TODO: check the need to add encoding handling there
3783 *
Owen Taylor3473f882001-02-23 17:55:21 +00003784 * Returns the new parser context or NULL
3785 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003786static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003787htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003788 htmlParserCtxtPtr ctxt;
3789 htmlParserInputPtr input;
3790 /* htmlCharEncoding enc; */
3791
3792 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3793 if (ctxt == NULL) {
3794 perror("malloc");
3795 return(NULL);
3796 }
3797 htmlInitParserCtxt(ctxt);
3798 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3799 if (input == NULL) {
3800 perror("malloc");
3801 xmlFree(ctxt);
3802 return(NULL);
3803 }
3804 memset(input, 0, sizeof(htmlParserInput));
3805
3806 input->line = 1;
3807 input->col = 1;
3808 input->base = cur;
3809 input->cur = cur;
3810
3811 inputPush(ctxt, input);
3812 return(ctxt);
3813}
3814
3815/************************************************************************
3816 * *
3817 * Progressive parsing interfaces *
3818 * *
3819 ************************************************************************/
3820
3821/**
3822 * htmlParseLookupSequence:
3823 * @ctxt: an HTML parser context
3824 * @first: the first char to lookup
3825 * @next: the next char to lookup or zero
3826 * @third: the next char to lookup or zero
3827 *
3828 * Try to find if a sequence (first, next, third) or just (first next) or
3829 * (first) is available in the input stream.
3830 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3831 * to avoid rescanning sequences of bytes, it DOES change the state of the
3832 * parser, do not use liberally.
3833 * This is basically similar to xmlParseLookupSequence()
3834 *
3835 * Returns the index to the current parsing point if the full sequence
3836 * is available, -1 otherwise.
3837 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003838static int
Owen Taylor3473f882001-02-23 17:55:21 +00003839htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3840 xmlChar next, xmlChar third) {
3841 int base, len;
3842 htmlParserInputPtr in;
3843 const xmlChar *buf;
3844
3845 in = ctxt->input;
3846 if (in == NULL) return(-1);
3847 base = in->cur - in->base;
3848 if (base < 0) return(-1);
3849 if (ctxt->checkIndex > base)
3850 base = ctxt->checkIndex;
3851 if (in->buf == NULL) {
3852 buf = in->base;
3853 len = in->length;
3854 } else {
3855 buf = in->buf->buffer->content;
3856 len = in->buf->buffer->use;
3857 }
3858 /* take into account the sequence length */
3859 if (third) len -= 2;
3860 else if (next) len --;
3861 for (;base < len;base++) {
3862 if (buf[base] == first) {
3863 if (third != 0) {
3864 if ((buf[base + 1] != next) ||
3865 (buf[base + 2] != third)) continue;
3866 } else if (next != 0) {
3867 if (buf[base + 1] != next) continue;
3868 }
3869 ctxt->checkIndex = 0;
3870#ifdef DEBUG_PUSH
3871 if (next == 0)
3872 xmlGenericError(xmlGenericErrorContext,
3873 "HPP: lookup '%c' found at %d\n",
3874 first, base);
3875 else if (third == 0)
3876 xmlGenericError(xmlGenericErrorContext,
3877 "HPP: lookup '%c%c' found at %d\n",
3878 first, next, base);
3879 else
3880 xmlGenericError(xmlGenericErrorContext,
3881 "HPP: lookup '%c%c%c' found at %d\n",
3882 first, next, third, base);
3883#endif
3884 return(base - (in->cur - in->base));
3885 }
3886 }
3887 ctxt->checkIndex = base;
3888#ifdef DEBUG_PUSH
3889 if (next == 0)
3890 xmlGenericError(xmlGenericErrorContext,
3891 "HPP: lookup '%c' failed\n", first);
3892 else if (third == 0)
3893 xmlGenericError(xmlGenericErrorContext,
3894 "HPP: lookup '%c%c' failed\n", first, next);
3895 else
3896 xmlGenericError(xmlGenericErrorContext,
3897 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3898#endif
3899 return(-1);
3900}
3901
3902/**
3903 * htmlParseTryOrFinish:
3904 * @ctxt: an HTML parser context
3905 * @terminate: last chunk indicator
3906 *
3907 * Try to progress on parsing
3908 *
3909 * Returns zero if no parsing was possible
3910 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003911static int
Owen Taylor3473f882001-02-23 17:55:21 +00003912htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3913 int ret = 0;
3914 htmlParserInputPtr in;
3915 int avail = 0;
3916 xmlChar cur, next;
3917
3918#ifdef DEBUG_PUSH
3919 switch (ctxt->instate) {
3920 case XML_PARSER_EOF:
3921 xmlGenericError(xmlGenericErrorContext,
3922 "HPP: try EOF\n"); break;
3923 case XML_PARSER_START:
3924 xmlGenericError(xmlGenericErrorContext,
3925 "HPP: try START\n"); break;
3926 case XML_PARSER_MISC:
3927 xmlGenericError(xmlGenericErrorContext,
3928 "HPP: try MISC\n");break;
3929 case XML_PARSER_COMMENT:
3930 xmlGenericError(xmlGenericErrorContext,
3931 "HPP: try COMMENT\n");break;
3932 case XML_PARSER_PROLOG:
3933 xmlGenericError(xmlGenericErrorContext,
3934 "HPP: try PROLOG\n");break;
3935 case XML_PARSER_START_TAG:
3936 xmlGenericError(xmlGenericErrorContext,
3937 "HPP: try START_TAG\n");break;
3938 case XML_PARSER_CONTENT:
3939 xmlGenericError(xmlGenericErrorContext,
3940 "HPP: try CONTENT\n");break;
3941 case XML_PARSER_CDATA_SECTION:
3942 xmlGenericError(xmlGenericErrorContext,
3943 "HPP: try CDATA_SECTION\n");break;
3944 case XML_PARSER_END_TAG:
3945 xmlGenericError(xmlGenericErrorContext,
3946 "HPP: try END_TAG\n");break;
3947 case XML_PARSER_ENTITY_DECL:
3948 xmlGenericError(xmlGenericErrorContext,
3949 "HPP: try ENTITY_DECL\n");break;
3950 case XML_PARSER_ENTITY_VALUE:
3951 xmlGenericError(xmlGenericErrorContext,
3952 "HPP: try ENTITY_VALUE\n");break;
3953 case XML_PARSER_ATTRIBUTE_VALUE:
3954 xmlGenericError(xmlGenericErrorContext,
3955 "HPP: try ATTRIBUTE_VALUE\n");break;
3956 case XML_PARSER_DTD:
3957 xmlGenericError(xmlGenericErrorContext,
3958 "HPP: try DTD\n");break;
3959 case XML_PARSER_EPILOG:
3960 xmlGenericError(xmlGenericErrorContext,
3961 "HPP: try EPILOG\n");break;
3962 case XML_PARSER_PI:
3963 xmlGenericError(xmlGenericErrorContext,
3964 "HPP: try PI\n");break;
3965 case XML_PARSER_SYSTEM_LITERAL:
3966 xmlGenericError(xmlGenericErrorContext,
3967 "HPP: try SYSTEM_LITERAL\n");break;
3968 }
3969#endif
3970
3971 while (1) {
3972
3973 in = ctxt->input;
3974 if (in == NULL) break;
3975 if (in->buf == NULL)
3976 avail = in->length - (in->cur - in->base);
3977 else
3978 avail = in->buf->buffer->use - (in->cur - in->base);
3979 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003980 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003981 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3982 /*
3983 * SAX: end of the document processing.
3984 */
3985 ctxt->instate = XML_PARSER_EOF;
3986 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3987 ctxt->sax->endDocument(ctxt->userData);
3988 }
3989 }
3990 if (avail < 1)
3991 goto done;
3992 switch (ctxt->instate) {
3993 case XML_PARSER_EOF:
3994 /*
3995 * Document parsing is done !
3996 */
3997 goto done;
3998 case XML_PARSER_START:
3999 /*
4000 * Very first chars read from the document flow.
4001 */
4002 cur = in->cur[0];
4003 if (IS_BLANK(cur)) {
4004 SKIP_BLANKS;
4005 if (in->buf == NULL)
4006 avail = in->length - (in->cur - in->base);
4007 else
4008 avail = in->buf->buffer->use - (in->cur - in->base);
4009 }
4010 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4011 ctxt->sax->setDocumentLocator(ctxt->userData,
4012 &xmlDefaultSAXLocator);
4013 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4014 (!ctxt->disableSAX))
4015 ctxt->sax->startDocument(ctxt->userData);
4016
4017 cur = in->cur[0];
4018 next = in->cur[1];
4019 if ((cur == '<') && (next == '!') &&
4020 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4021 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4022 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4023 (UPP(8) == 'E')) {
4024 if ((!terminate) &&
4025 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4026 goto done;
4027#ifdef DEBUG_PUSH
4028 xmlGenericError(xmlGenericErrorContext,
4029 "HPP: Parsing internal subset\n");
4030#endif
4031 htmlParseDocTypeDecl(ctxt);
4032 ctxt->instate = XML_PARSER_PROLOG;
4033#ifdef DEBUG_PUSH
4034 xmlGenericError(xmlGenericErrorContext,
4035 "HPP: entering PROLOG\n");
4036#endif
4037 } else {
4038 ctxt->instate = XML_PARSER_MISC;
4039 }
4040#ifdef DEBUG_PUSH
4041 xmlGenericError(xmlGenericErrorContext,
4042 "HPP: entering MISC\n");
4043#endif
4044 break;
4045 case XML_PARSER_MISC:
4046 SKIP_BLANKS;
4047 if (in->buf == NULL)
4048 avail = in->length - (in->cur - in->base);
4049 else
4050 avail = in->buf->buffer->use - (in->cur - in->base);
4051 if (avail < 2)
4052 goto done;
4053 cur = in->cur[0];
4054 next = in->cur[1];
4055 if ((cur == '<') && (next == '!') &&
4056 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4057 if ((!terminate) &&
4058 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4059 goto done;
4060#ifdef DEBUG_PUSH
4061 xmlGenericError(xmlGenericErrorContext,
4062 "HPP: Parsing Comment\n");
4063#endif
4064 htmlParseComment(ctxt);
4065 ctxt->instate = XML_PARSER_MISC;
4066 } else if ((cur == '<') && (next == '!') &&
4067 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4068 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4069 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4070 (UPP(8) == 'E')) {
4071 if ((!terminate) &&
4072 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4073 goto done;
4074#ifdef DEBUG_PUSH
4075 xmlGenericError(xmlGenericErrorContext,
4076 "HPP: Parsing internal subset\n");
4077#endif
4078 htmlParseDocTypeDecl(ctxt);
4079 ctxt->instate = XML_PARSER_PROLOG;
4080#ifdef DEBUG_PUSH
4081 xmlGenericError(xmlGenericErrorContext,
4082 "HPP: entering PROLOG\n");
4083#endif
4084 } else if ((cur == '<') && (next == '!') &&
4085 (avail < 9)) {
4086 goto done;
4087 } else {
4088 ctxt->instate = XML_PARSER_START_TAG;
4089#ifdef DEBUG_PUSH
4090 xmlGenericError(xmlGenericErrorContext,
4091 "HPP: entering START_TAG\n");
4092#endif
4093 }
4094 break;
4095 case XML_PARSER_PROLOG:
4096 SKIP_BLANKS;
4097 if (in->buf == NULL)
4098 avail = in->length - (in->cur - in->base);
4099 else
4100 avail = in->buf->buffer->use - (in->cur - in->base);
4101 if (avail < 2)
4102 goto done;
4103 cur = in->cur[0];
4104 next = in->cur[1];
4105 if ((cur == '<') && (next == '!') &&
4106 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4107 if ((!terminate) &&
4108 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4109 goto done;
4110#ifdef DEBUG_PUSH
4111 xmlGenericError(xmlGenericErrorContext,
4112 "HPP: Parsing Comment\n");
4113#endif
4114 htmlParseComment(ctxt);
4115 ctxt->instate = XML_PARSER_PROLOG;
4116 } else if ((cur == '<') && (next == '!') &&
4117 (avail < 4)) {
4118 goto done;
4119 } else {
4120 ctxt->instate = XML_PARSER_START_TAG;
4121#ifdef DEBUG_PUSH
4122 xmlGenericError(xmlGenericErrorContext,
4123 "HPP: entering START_TAG\n");
4124#endif
4125 }
4126 break;
4127 case XML_PARSER_EPILOG:
4128 if (in->buf == NULL)
4129 avail = in->length - (in->cur - in->base);
4130 else
4131 avail = in->buf->buffer->use - (in->cur - in->base);
4132 if (avail < 1)
4133 goto done;
4134 cur = in->cur[0];
4135 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004136 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004137 goto done;
4138 }
4139 if (avail < 2)
4140 goto done;
4141 next = in->cur[1];
4142 if ((cur == '<') && (next == '!') &&
4143 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4144 if ((!terminate) &&
4145 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4146 goto done;
4147#ifdef DEBUG_PUSH
4148 xmlGenericError(xmlGenericErrorContext,
4149 "HPP: Parsing Comment\n");
4150#endif
4151 htmlParseComment(ctxt);
4152 ctxt->instate = XML_PARSER_EPILOG;
4153 } else if ((cur == '<') && (next == '!') &&
4154 (avail < 4)) {
4155 goto done;
4156 } else {
4157 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004158 ctxt->wellFormed = 0;
4159 ctxt->instate = XML_PARSER_EOF;
4160#ifdef DEBUG_PUSH
4161 xmlGenericError(xmlGenericErrorContext,
4162 "HPP: entering EOF\n");
4163#endif
4164 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4165 ctxt->sax->endDocument(ctxt->userData);
4166 goto done;
4167 }
4168 break;
4169 case XML_PARSER_START_TAG: {
4170 xmlChar *name, *oldname;
4171 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004172 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004173
4174 if (avail < 2)
4175 goto done;
4176 cur = in->cur[0];
4177 if (cur != '<') {
4178 ctxt->instate = XML_PARSER_CONTENT;
4179#ifdef DEBUG_PUSH
4180 xmlGenericError(xmlGenericErrorContext,
4181 "HPP: entering CONTENT\n");
4182#endif
4183 break;
4184 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004185 if (in->cur[1] == '/') {
4186 ctxt->instate = XML_PARSER_END_TAG;
4187 ctxt->checkIndex = 0;
4188#ifdef DEBUG_PUSH
4189 xmlGenericError(xmlGenericErrorContext,
4190 "HPP: entering END_TAG\n");
4191#endif
4192 break;
4193 }
Owen Taylor3473f882001-02-23 17:55:21 +00004194 if ((!terminate) &&
4195 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4196 goto done;
4197
4198 oldname = xmlStrdup(ctxt->name);
4199 htmlParseStartTag(ctxt);
4200 name = ctxt->name;
4201#ifdef DEBUG
4202 if (oldname == NULL)
4203 xmlGenericError(xmlGenericErrorContext,
4204 "Start of element %s\n", name);
4205 else if (name == NULL)
4206 xmlGenericError(xmlGenericErrorContext,
4207 "Start of element failed, was %s\n",
4208 oldname);
4209 else
4210 xmlGenericError(xmlGenericErrorContext,
4211 "Start of element %s, was %s\n",
4212 name, oldname);
4213#endif
4214 if (((depth == ctxt->nameNr) &&
4215 (xmlStrEqual(oldname, ctxt->name))) ||
4216 (name == NULL)) {
4217 if (CUR == '>')
4218 NEXT;
4219 if (oldname != NULL)
4220 xmlFree(oldname);
4221 break;
4222 }
4223 if (oldname != NULL)
4224 xmlFree(oldname);
4225
4226 /*
4227 * Lookup the info for that element.
4228 */
4229 info = htmlTagLookup(name);
4230 if (info == NULL) {
4231 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4232 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4233 name);
4234 ctxt->wellFormed = 0;
4235 } else if (info->depr) {
4236 /***************************
4237 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4238 ctxt->sax->warning(ctxt->userData,
4239 "Tag %s is deprecated\n",
4240 name);
4241 ***************************/
4242 }
4243
4244 /*
4245 * Check for an Empty Element labelled the XML/SGML way
4246 */
4247 if ((CUR == '/') && (NXT(1) == '>')) {
4248 SKIP(2);
4249 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4250 ctxt->sax->endElement(ctxt->userData, name);
4251 oldname = htmlnamePop(ctxt);
4252#ifdef DEBUG
4253 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4254 oldname);
4255#endif
4256 if (oldname != NULL)
4257 xmlFree(oldname);
4258 ctxt->instate = XML_PARSER_CONTENT;
4259#ifdef DEBUG_PUSH
4260 xmlGenericError(xmlGenericErrorContext,
4261 "HPP: entering CONTENT\n");
4262#endif
4263 break;
4264 }
4265
4266 if (CUR == '>') {
4267 NEXT;
4268 } else {
4269 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4270 ctxt->sax->error(ctxt->userData,
4271 "Couldn't find end of Start Tag %s\n",
4272 name);
4273 ctxt->wellFormed = 0;
4274
4275 /*
4276 * end of parsing of this node.
4277 */
4278 if (xmlStrEqual(name, ctxt->name)) {
4279 nodePop(ctxt);
4280 oldname = htmlnamePop(ctxt);
4281#ifdef DEBUG
4282 xmlGenericError(xmlGenericErrorContext,
4283 "End of start tag problem: popping out %s\n", oldname);
4284#endif
4285 if (oldname != NULL)
4286 xmlFree(oldname);
4287 }
4288
4289 ctxt->instate = XML_PARSER_CONTENT;
4290#ifdef DEBUG_PUSH
4291 xmlGenericError(xmlGenericErrorContext,
4292 "HPP: entering CONTENT\n");
4293#endif
4294 break;
4295 }
4296
4297 /*
4298 * Check for an Empty Element from DTD definition
4299 */
4300 if ((info != NULL) && (info->empty)) {
4301 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4302 ctxt->sax->endElement(ctxt->userData, name);
4303 oldname = htmlnamePop(ctxt);
4304#ifdef DEBUG
4305 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4306#endif
4307 if (oldname != NULL)
4308 xmlFree(oldname);
4309 }
4310 ctxt->instate = XML_PARSER_CONTENT;
4311#ifdef DEBUG_PUSH
4312 xmlGenericError(xmlGenericErrorContext,
4313 "HPP: entering CONTENT\n");
4314#endif
4315 break;
4316 }
4317 case XML_PARSER_CONTENT: {
4318 long cons;
4319 /*
4320 * Handle preparsed entities and charRef
4321 */
4322 if (ctxt->token != 0) {
4323 xmlChar chr[2] = { 0 , 0 } ;
4324
4325 chr[0] = (xmlChar) ctxt->token;
4326 htmlCheckParagraph(ctxt);
4327 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4328 ctxt->sax->characters(ctxt->userData, chr, 1);
4329 ctxt->token = 0;
4330 ctxt->checkIndex = 0;
4331 }
4332 if ((avail == 1) && (terminate)) {
4333 cur = in->cur[0];
4334 if ((cur != '<') && (cur != '&')) {
4335 if (ctxt->sax != NULL) {
4336 if (IS_BLANK(cur)) {
4337 if (ctxt->sax->ignorableWhitespace != NULL)
4338 ctxt->sax->ignorableWhitespace(
4339 ctxt->userData, &cur, 1);
4340 } else {
4341 htmlCheckParagraph(ctxt);
4342 if (ctxt->sax->characters != NULL)
4343 ctxt->sax->characters(
4344 ctxt->userData, &cur, 1);
4345 }
4346 }
4347 ctxt->token = 0;
4348 ctxt->checkIndex = 0;
4349 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004350 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004351 }
Owen Taylor3473f882001-02-23 17:55:21 +00004352 }
4353 if (avail < 2)
4354 goto done;
4355 cur = in->cur[0];
4356 next = in->cur[1];
4357 cons = ctxt->nbChars;
4358 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4359 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4360 /*
4361 * Handle SCRIPT/STYLE separately
4362 */
4363 if ((!terminate) &&
4364 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4365 goto done;
4366 htmlParseScript(ctxt);
4367 if ((cur == '<') && (next == '/')) {
4368 ctxt->instate = XML_PARSER_END_TAG;
4369 ctxt->checkIndex = 0;
4370#ifdef DEBUG_PUSH
4371 xmlGenericError(xmlGenericErrorContext,
4372 "HPP: entering END_TAG\n");
4373#endif
4374 break;
4375 }
4376 } else {
4377 /*
4378 * Sometimes DOCTYPE arrives in the middle of the document
4379 */
4380 if ((cur == '<') && (next == '!') &&
4381 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4382 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4383 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4384 (UPP(8) == 'E')) {
4385 if ((!terminate) &&
4386 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4387 goto done;
4388 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4389 ctxt->sax->error(ctxt->userData,
4390 "Misplaced DOCTYPE declaration\n");
4391 ctxt->wellFormed = 0;
4392 htmlParseDocTypeDecl(ctxt);
4393 } else if ((cur == '<') && (next == '!') &&
4394 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4395 if ((!terminate) &&
4396 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4397 goto done;
4398#ifdef DEBUG_PUSH
4399 xmlGenericError(xmlGenericErrorContext,
4400 "HPP: Parsing Comment\n");
4401#endif
4402 htmlParseComment(ctxt);
4403 ctxt->instate = XML_PARSER_CONTENT;
4404 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4405 goto done;
4406 } else if ((cur == '<') && (next == '/')) {
4407 ctxt->instate = XML_PARSER_END_TAG;
4408 ctxt->checkIndex = 0;
4409#ifdef DEBUG_PUSH
4410 xmlGenericError(xmlGenericErrorContext,
4411 "HPP: entering END_TAG\n");
4412#endif
4413 break;
4414 } else if (cur == '<') {
4415 ctxt->instate = XML_PARSER_START_TAG;
4416 ctxt->checkIndex = 0;
4417#ifdef DEBUG_PUSH
4418 xmlGenericError(xmlGenericErrorContext,
4419 "HPP: entering START_TAG\n");
4420#endif
4421 break;
4422 } else if (cur == '&') {
4423 if ((!terminate) &&
4424 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4425 goto done;
4426#ifdef DEBUG_PUSH
4427 xmlGenericError(xmlGenericErrorContext,
4428 "HPP: Parsing Reference\n");
4429#endif
4430 /* TODO: check generation of subtrees if noent !!! */
4431 htmlParseReference(ctxt);
4432 } else {
4433 /* TODO Avoid the extra copy, handle directly !!!!!! */
4434 /*
4435 * Goal of the following test is :
4436 * - minimize calls to the SAX 'character' callback
4437 * when they are mergeable
4438 */
4439 if ((ctxt->inputNr == 1) &&
4440 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4441 if ((!terminate) &&
4442 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4443 goto done;
4444 }
4445 ctxt->checkIndex = 0;
4446#ifdef DEBUG_PUSH
4447 xmlGenericError(xmlGenericErrorContext,
4448 "HPP: Parsing char data\n");
4449#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004450 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004451 }
4452 }
4453 if (cons == ctxt->nbChars) {
4454 if (ctxt->node != NULL) {
4455 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4456 ctxt->sax->error(ctxt->userData,
4457 "detected an error in element content\n");
4458 ctxt->wellFormed = 0;
4459 }
4460 NEXT;
4461 break;
4462 }
4463
4464 break;
4465 }
4466 case XML_PARSER_END_TAG:
4467 if (avail < 2)
4468 goto done;
4469 if ((!terminate) &&
4470 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4471 goto done;
4472 htmlParseEndTag(ctxt);
4473 if (ctxt->nameNr == 0) {
4474 ctxt->instate = XML_PARSER_EPILOG;
4475 } else {
4476 ctxt->instate = XML_PARSER_CONTENT;
4477 }
4478 ctxt->checkIndex = 0;
4479#ifdef DEBUG_PUSH
4480 xmlGenericError(xmlGenericErrorContext,
4481 "HPP: entering CONTENT\n");
4482#endif
4483 break;
4484 case XML_PARSER_CDATA_SECTION:
4485 xmlGenericError(xmlGenericErrorContext,
4486 "HPP: internal error, state == CDATA\n");
4487 ctxt->instate = XML_PARSER_CONTENT;
4488 ctxt->checkIndex = 0;
4489#ifdef DEBUG_PUSH
4490 xmlGenericError(xmlGenericErrorContext,
4491 "HPP: entering CONTENT\n");
4492#endif
4493 break;
4494 case XML_PARSER_DTD:
4495 xmlGenericError(xmlGenericErrorContext,
4496 "HPP: internal error, state == DTD\n");
4497 ctxt->instate = XML_PARSER_CONTENT;
4498 ctxt->checkIndex = 0;
4499#ifdef DEBUG_PUSH
4500 xmlGenericError(xmlGenericErrorContext,
4501 "HPP: entering CONTENT\n");
4502#endif
4503 break;
4504 case XML_PARSER_COMMENT:
4505 xmlGenericError(xmlGenericErrorContext,
4506 "HPP: internal error, state == COMMENT\n");
4507 ctxt->instate = XML_PARSER_CONTENT;
4508 ctxt->checkIndex = 0;
4509#ifdef DEBUG_PUSH
4510 xmlGenericError(xmlGenericErrorContext,
4511 "HPP: entering CONTENT\n");
4512#endif
4513 break;
4514 case XML_PARSER_PI:
4515 xmlGenericError(xmlGenericErrorContext,
4516 "HPP: internal error, state == PI\n");
4517 ctxt->instate = XML_PARSER_CONTENT;
4518 ctxt->checkIndex = 0;
4519#ifdef DEBUG_PUSH
4520 xmlGenericError(xmlGenericErrorContext,
4521 "HPP: entering CONTENT\n");
4522#endif
4523 break;
4524 case XML_PARSER_ENTITY_DECL:
4525 xmlGenericError(xmlGenericErrorContext,
4526 "HPP: internal error, state == ENTITY_DECL\n");
4527 ctxt->instate = XML_PARSER_CONTENT;
4528 ctxt->checkIndex = 0;
4529#ifdef DEBUG_PUSH
4530 xmlGenericError(xmlGenericErrorContext,
4531 "HPP: entering CONTENT\n");
4532#endif
4533 break;
4534 case XML_PARSER_ENTITY_VALUE:
4535 xmlGenericError(xmlGenericErrorContext,
4536 "HPP: internal error, state == ENTITY_VALUE\n");
4537 ctxt->instate = XML_PARSER_CONTENT;
4538 ctxt->checkIndex = 0;
4539#ifdef DEBUG_PUSH
4540 xmlGenericError(xmlGenericErrorContext,
4541 "HPP: entering DTD\n");
4542#endif
4543 break;
4544 case XML_PARSER_ATTRIBUTE_VALUE:
4545 xmlGenericError(xmlGenericErrorContext,
4546 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4547 ctxt->instate = XML_PARSER_START_TAG;
4548 ctxt->checkIndex = 0;
4549#ifdef DEBUG_PUSH
4550 xmlGenericError(xmlGenericErrorContext,
4551 "HPP: entering START_TAG\n");
4552#endif
4553 break;
4554 case XML_PARSER_SYSTEM_LITERAL:
4555 xmlGenericError(xmlGenericErrorContext,
4556 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4557 ctxt->instate = XML_PARSER_CONTENT;
4558 ctxt->checkIndex = 0;
4559#ifdef DEBUG_PUSH
4560 xmlGenericError(xmlGenericErrorContext,
4561 "HPP: entering CONTENT\n");
4562#endif
4563 break;
4564 case XML_PARSER_IGNORE:
4565 xmlGenericError(xmlGenericErrorContext,
4566 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4567 ctxt->instate = XML_PARSER_CONTENT;
4568 ctxt->checkIndex = 0;
4569#ifdef DEBUG_PUSH
4570 xmlGenericError(xmlGenericErrorContext,
4571 "HPP: entering CONTENT\n");
4572#endif
4573 break;
4574 }
4575 }
4576done:
4577 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004578 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004579 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4580 /*
4581 * SAX: end of the document processing.
4582 */
4583 ctxt->instate = XML_PARSER_EOF;
4584 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4585 ctxt->sax->endDocument(ctxt->userData);
4586 }
4587 }
4588 if ((ctxt->myDoc != NULL) &&
4589 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4590 (ctxt->instate == XML_PARSER_EPILOG))) {
4591 xmlDtdPtr dtd;
4592 dtd = xmlGetIntSubset(ctxt->myDoc);
4593 if (dtd == NULL)
4594 ctxt->myDoc->intSubset =
4595 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4596 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4597 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4598 }
4599#ifdef DEBUG_PUSH
4600 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4601#endif
4602 return(ret);
4603}
4604
4605/**
Owen Taylor3473f882001-02-23 17:55:21 +00004606 * htmlParseChunk:
4607 * @ctxt: an XML parser context
4608 * @chunk: an char array
4609 * @size: the size in byte of the chunk
4610 * @terminate: last chunk indicator
4611 *
4612 * Parse a Chunk of memory
4613 *
4614 * Returns zero if no error, the xmlParserErrors otherwise.
4615 */
4616int
4617htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4618 int terminate) {
4619 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4620 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4621 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4622 int cur = ctxt->input->cur - ctxt->input->base;
4623
4624 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4625 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4626 ctxt->input->cur = ctxt->input->base + cur;
4627#ifdef DEBUG_PUSH
4628 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4629#endif
4630
4631 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4632 htmlParseTryOrFinish(ctxt, terminate);
4633 } else if (ctxt->instate != XML_PARSER_EOF) {
4634 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4635 htmlParseTryOrFinish(ctxt, terminate);
4636 }
4637 if (terminate) {
4638 if ((ctxt->instate != XML_PARSER_EOF) &&
4639 (ctxt->instate != XML_PARSER_EPILOG) &&
4640 (ctxt->instate != XML_PARSER_MISC)) {
4641 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004642 ctxt->wellFormed = 0;
4643 }
4644 if (ctxt->instate != XML_PARSER_EOF) {
4645 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4646 ctxt->sax->endDocument(ctxt->userData);
4647 }
4648 ctxt->instate = XML_PARSER_EOF;
4649 }
4650 return((xmlParserErrors) ctxt->errNo);
4651}
4652
4653/************************************************************************
4654 * *
4655 * User entry points *
4656 * *
4657 ************************************************************************/
4658
4659/**
4660 * htmlCreatePushParserCtxt :
4661 * @sax: a SAX handler
4662 * @user_data: The user data returned on SAX callbacks
4663 * @chunk: a pointer to an array of chars
4664 * @size: number of chars in the array
4665 * @filename: an optional file name or URI
4666 * @enc: an optional encoding
4667 *
4668 * Create a parser context for using the HTML parser in push mode
4669 * To allow content encoding detection, @size should be >= 4
4670 * The value of @filename is used for fetching external entities
4671 * and error/warning reports.
4672 *
4673 * Returns the new parser context or NULL
4674 */
4675htmlParserCtxtPtr
4676htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4677 const char *chunk, int size, const char *filename,
4678 xmlCharEncoding enc) {
4679 htmlParserCtxtPtr ctxt;
4680 htmlParserInputPtr inputStream;
4681 xmlParserInputBufferPtr buf;
4682
Daniel Veillardd0463562001-10-13 09:15:48 +00004683 xmlInitParser();
4684
Owen Taylor3473f882001-02-23 17:55:21 +00004685 buf = xmlAllocParserInputBuffer(enc);
4686 if (buf == NULL) return(NULL);
4687
4688 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4689 if (ctxt == NULL) {
4690 xmlFree(buf);
4691 return(NULL);
4692 }
4693 memset(ctxt, 0, sizeof(htmlParserCtxt));
4694 htmlInitParserCtxt(ctxt);
4695 if (sax != NULL) {
4696 if (ctxt->sax != &htmlDefaultSAXHandler)
4697 xmlFree(ctxt->sax);
4698 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4699 if (ctxt->sax == NULL) {
4700 xmlFree(buf);
4701 xmlFree(ctxt);
4702 return(NULL);
4703 }
4704 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4705 if (user_data != NULL)
4706 ctxt->userData = user_data;
4707 }
4708 if (filename == NULL) {
4709 ctxt->directory = NULL;
4710 } else {
4711 ctxt->directory = xmlParserGetDirectory(filename);
4712 }
4713
4714 inputStream = htmlNewInputStream(ctxt);
4715 if (inputStream == NULL) {
4716 xmlFreeParserCtxt(ctxt);
4717 return(NULL);
4718 }
4719
4720 if (filename == NULL)
4721 inputStream->filename = NULL;
4722 else
4723 inputStream->filename = xmlMemStrdup(filename);
4724 inputStream->buf = buf;
4725 inputStream->base = inputStream->buf->buffer->content;
4726 inputStream->cur = inputStream->buf->buffer->content;
4727
4728 inputPush(ctxt, inputStream);
4729
4730 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4731 (ctxt->input->buf != NULL)) {
4732 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4733#ifdef DEBUG_PUSH
4734 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4735#endif
4736 }
4737
4738 return(ctxt);
4739}
4740
4741/**
4742 * htmlSAXParseDoc :
4743 * @cur: a pointer to an array of xmlChar
4744 * @encoding: a free form C string describing the HTML document encoding, or NULL
4745 * @sax: the SAX handler block
4746 * @userData: if using SAX, this pointer will be provided on callbacks.
4747 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004748 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4749 * to handle parse events. If sax is NULL, fallback to the default DOM
4750 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004751 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004752 * Returns the resulting document tree unless SAX is NULL or the document is
4753 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004754 */
4755
4756htmlDocPtr
4757htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4758 htmlDocPtr ret;
4759 htmlParserCtxtPtr ctxt;
4760
Daniel Veillardd0463562001-10-13 09:15:48 +00004761 xmlInitParser();
4762
Owen Taylor3473f882001-02-23 17:55:21 +00004763 if (cur == NULL) return(NULL);
4764
4765
4766 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4767 if (ctxt == NULL) return(NULL);
4768 if (sax != NULL) {
4769 ctxt->sax = sax;
4770 ctxt->userData = userData;
4771 }
4772
4773 htmlParseDocument(ctxt);
4774 ret = ctxt->myDoc;
4775 if (sax != NULL) {
4776 ctxt->sax = NULL;
4777 ctxt->userData = NULL;
4778 }
4779 htmlFreeParserCtxt(ctxt);
4780
4781 return(ret);
4782}
4783
4784/**
4785 * htmlParseDoc :
4786 * @cur: a pointer to an array of xmlChar
4787 * @encoding: a free form C string describing the HTML document encoding, or NULL
4788 *
4789 * parse an HTML in-memory document and build a tree.
4790 *
4791 * Returns the resulting document tree
4792 */
4793
4794htmlDocPtr
4795htmlParseDoc(xmlChar *cur, const char *encoding) {
4796 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4797}
4798
4799
4800/**
4801 * htmlCreateFileParserCtxt :
4802 * @filename: the filename
4803 * @encoding: a free form C string describing the HTML document encoding, or NULL
4804 *
4805 * Create a parser context for a file content.
4806 * Automatic support for ZLIB/Compress compressed document is provided
4807 * by default if found at compile-time.
4808 *
4809 * Returns the new parser context or NULL
4810 */
4811htmlParserCtxtPtr
4812htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4813{
4814 htmlParserCtxtPtr ctxt;
4815 htmlParserInputPtr inputStream;
4816 xmlParserInputBufferPtr buf;
4817 /* htmlCharEncoding enc; */
4818 xmlChar *content, *content_line = (xmlChar *) "charset=";
4819
4820 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4821 if (buf == NULL) return(NULL);
4822
4823 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4824 if (ctxt == NULL) {
4825 perror("malloc");
4826 return(NULL);
4827 }
4828 memset(ctxt, 0, sizeof(htmlParserCtxt));
4829 htmlInitParserCtxt(ctxt);
4830 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4831 if (inputStream == NULL) {
4832 perror("malloc");
4833 xmlFree(ctxt);
4834 return(NULL);
4835 }
4836 memset(inputStream, 0, sizeof(htmlParserInput));
4837
4838 inputStream->filename = xmlMemStrdup(filename);
4839 inputStream->line = 1;
4840 inputStream->col = 1;
4841 inputStream->buf = buf;
4842 inputStream->directory = NULL;
4843
4844 inputStream->base = inputStream->buf->buffer->content;
4845 inputStream->cur = inputStream->buf->buffer->content;
4846 inputStream->free = NULL;
4847
4848 inputPush(ctxt, inputStream);
4849
4850 /* set encoding */
4851 if (encoding) {
4852 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4853 if (content) {
4854 strcpy ((char *)content, (char *)content_line);
4855 strcat ((char *)content, (char *)encoding);
4856 htmlCheckEncoding (ctxt, content);
4857 xmlFree (content);
4858 }
4859 }
4860
4861 return(ctxt);
4862}
4863
4864/**
4865 * htmlSAXParseFile :
4866 * @filename: the filename
4867 * @encoding: a free form C string describing the HTML document encoding, or NULL
4868 * @sax: the SAX handler block
4869 * @userData: if using SAX, this pointer will be provided on callbacks.
4870 *
4871 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4872 * compressed document is provided by default if found at compile-time.
4873 * It use the given SAX function block to handle the parsing callback.
4874 * If sax is NULL, fallback to the default DOM tree building routines.
4875 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004876 * Returns the resulting document tree unless SAX is NULL or the document is
4877 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004878 */
4879
4880htmlDocPtr
4881htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4882 void *userData) {
4883 htmlDocPtr ret;
4884 htmlParserCtxtPtr ctxt;
4885 htmlSAXHandlerPtr oldsax = NULL;
4886
Daniel Veillardd0463562001-10-13 09:15:48 +00004887 xmlInitParser();
4888
Owen Taylor3473f882001-02-23 17:55:21 +00004889 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4890 if (ctxt == NULL) return(NULL);
4891 if (sax != NULL) {
4892 oldsax = ctxt->sax;
4893 ctxt->sax = sax;
4894 ctxt->userData = userData;
4895 }
4896
4897 htmlParseDocument(ctxt);
4898
4899 ret = ctxt->myDoc;
4900 if (sax != NULL) {
4901 ctxt->sax = oldsax;
4902 ctxt->userData = NULL;
4903 }
4904 htmlFreeParserCtxt(ctxt);
4905
4906 return(ret);
4907}
4908
4909/**
4910 * htmlParseFile :
4911 * @filename: the filename
4912 * @encoding: a free form C string describing the HTML document encoding, or NULL
4913 *
4914 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4915 * compressed document is provided by default if found at compile-time.
4916 *
4917 * Returns the resulting document tree
4918 */
4919
4920htmlDocPtr
4921htmlParseFile(const char *filename, const char *encoding) {
4922 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4923}
4924
4925/**
4926 * htmlHandleOmittedElem:
4927 * @val: int 0 or 1
4928 *
4929 * Set and return the previous value for handling HTML omitted tags.
4930 *
4931 * Returns the last value for 0 for no handling, 1 for auto insertion.
4932 */
4933
4934int
4935htmlHandleOmittedElem(int val) {
4936 int old = htmlOmittedDefaultValue;
4937
4938 htmlOmittedDefaultValue = val;
4939 return(old);
4940}
4941
4942#endif /* LIBXML_HTML_ENABLED */