/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000043#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000044
45#define HTML_MAX_NAMELEN 1000
46#define HTML_PARSER_BIG_BUFFER_SIZE 1000
47#define HTML_PARSER_BUFFER_SIZE 100
48
49/* #define DEBUG */
50/* #define DEBUG_PUSH */
51
Daniel Veillard22090732001-07-16 00:06:07 +000052static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000053
Daniel Veillard56a4cb82001-03-24 17:00:36 +000054xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
55 xmlChar end, xmlChar end2, xmlChar end3);
56
57/************************************************************************
58 * *
Owen Taylor3473f882001-02-23 17:55:21 +000059 * Parser stacks related functions and macros *
60 * *
61 ************************************************************************/
62
63/*
64 * Generic function for accessing stacks in the Parser Context
65 */
66
67#define PUSH_AND_POP(scope, type, name) \
68scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
69 if (ctxt->name##Nr >= ctxt->name##Max) { \
70 ctxt->name##Max *= 2; \
71 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
72 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
73 if (ctxt->name##Tab == NULL) { \
74 xmlGenericError(xmlGenericErrorContext, \
75 "realloc failed !\n"); \
76 return(0); \
77 } \
78 } \
79 ctxt->name##Tab[ctxt->name##Nr] = value; \
80 ctxt->name = value; \
81 return(ctxt->name##Nr++); \
82} \
83scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
84 type ret; \
85 if (ctxt->name##Nr < 0) return(0); \
86 ctxt->name##Nr--; \
87 if (ctxt->name##Nr < 0) return(0); \
88 if (ctxt->name##Nr > 0) \
89 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
90 else \
91 ctxt->name = NULL; \
92 ret = ctxt->name##Tab[ctxt->name##Nr]; \
93 ctxt->name##Tab[ctxt->name##Nr] = 0; \
94 return(ret); \
95} \
96
Daniel Veillard56a4cb82001-03-24 17:00:36 +000097/* PUSH_AND_POP(static, xmlNodePtr, node) */
98PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000099
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt' to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they look at raw xmlChar values and should only be used to compare
 * against or skip over ASCII:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value, as an int.
 *   CURRENT  same as CUR.
 *   RAW      the current xmlChar value, or -1 when ctxt->token is set.
 *   NXT(n)   the n'th next xmlChar.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase.
 *   SKIP(n)  skip n xmlChar, updating ctxt->nbChars accordingly.
 *
 * Character level macros:
 *
 *   NEXT              skip to the next character using xmlNextChar().
 *   NEXTL(l)          skip l bytes, counted as one character, updating
 *                     the line and column of the input and clearing
 *                     ctxt->token.
 *   CUR_CHAR(l)       current character through htmlCurrentChar(), its
 *                     length is stored in l.
 *   CUR_SCHAR(s, l)   same for the string s, using xmlStringCurrentChar().
 *   COPY_BUF(l,b,i,v) store the character v of length l at b[i] and
 *                     advance i by the number of bytes written.
 *
 * Input buffer management:
 *
 *   SHRINK       calls xmlParserInputShrink() on the current input.
 *   GROW         calls xmlParserInputGrow() asking for INPUT_CHUNK.
 *   SKIP_BLANKS  calls htmlSkipBlankChars().
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* Wrapped in do/while so it is a single statement inside if/else. */
#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
176
177/**
178 * htmlCurrentChar:
179 * @ctxt: the HTML parser context
180 * @len: pointer to the length of the char read
181 *
182 * The current char value, if using UTF-8 this may actaully span multiple
183 * bytes in the input buffer. Implement the end of line normalization:
184 * 2.11 End-of-Line Handling
185 * If the encoding is unspecified, in the case we find an ISO-Latin-1
186 * char, then the encoding converter is plugged in automatically.
187 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000188 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000189 */
190
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000191static int
Owen Taylor3473f882001-02-23 17:55:21 +0000192htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
193 if (ctxt->instate == XML_PARSER_EOF)
194 return(0);
195
196 if (ctxt->token != 0) {
197 *len = 0;
198 return(ctxt->token);
199 }
200 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
201 /*
202 * We are supposed to handle UTF8, check it's valid
203 * From rfc2044: encoding of the Unicode values on UTF-8:
204 *
205 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
206 * 0000 0000-0000 007F 0xxxxxxx
207 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
208 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
209 *
210 * Check for the 0x110000 limit too
211 */
212 const unsigned char *cur = ctxt->input->cur;
213 unsigned char c;
214 unsigned int val;
215
216 c = *cur;
217 if (c & 0x80) {
218 if (cur[1] == 0)
219 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
220 if ((cur[1] & 0xc0) != 0x80)
221 goto encoding_error;
222 if ((c & 0xe0) == 0xe0) {
223
224 if (cur[2] == 0)
225 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
226 if ((cur[2] & 0xc0) != 0x80)
227 goto encoding_error;
228 if ((c & 0xf0) == 0xf0) {
229 if (cur[3] == 0)
230 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
231 if (((c & 0xf8) != 0xf0) ||
232 ((cur[3] & 0xc0) != 0x80))
233 goto encoding_error;
234 /* 4-byte code */
235 *len = 4;
236 val = (cur[0] & 0x7) << 18;
237 val |= (cur[1] & 0x3f) << 12;
238 val |= (cur[2] & 0x3f) << 6;
239 val |= cur[3] & 0x3f;
240 } else {
241 /* 3-byte code */
242 *len = 3;
243 val = (cur[0] & 0xf) << 12;
244 val |= (cur[1] & 0x3f) << 6;
245 val |= cur[2] & 0x3f;
246 }
247 } else {
248 /* 2-byte code */
249 *len = 2;
250 val = (cur[0] & 0x1f) << 6;
251 val |= cur[1] & 0x3f;
252 }
253 if (!IS_CHAR(val)) {
254 ctxt->errNo = XML_ERR_INVALID_ENCODING;
255 if ((ctxt->sax != NULL) &&
256 (ctxt->sax->error != NULL))
257 ctxt->sax->error(ctxt->userData,
258 "Char 0x%X out of allowed range\n", val);
259 ctxt->wellFormed = 0;
260 ctxt->disableSAX = 1;
261 }
262 return(val);
263 } else {
264 /* 1-byte code */
265 *len = 1;
266 return((int) *ctxt->input->cur);
267 }
268 }
269 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000270 * Assume it's a fixed length encoding (1) with
Owen Taylor3473f882001-02-23 17:55:21 +0000271 * a compatibke encoding for the ASCII set, since
272 * XML constructs only use < 128 chars
273 */
274 *len = 1;
275 if ((int) *ctxt->input->cur < 0x80)
276 return((int) *ctxt->input->cur);
277
278 /*
279 * Humm this is bad, do an automatic flow conversion
280 */
281 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
282 ctxt->charset = XML_CHAR_ENCODING_UTF8;
283 return(xmlCurrentChar(ctxt, len));
284
285encoding_error:
286 /*
287 * If we detect an UTF8 error that probably mean that the
288 * input encoding didn't get properly advertized in the
289 * declaration header. Report the error and switch the encoding
290 * to ISO-Latin-1 (if you don't like this policy, just declare the
291 * encoding !)
292 */
293 ctxt->errNo = XML_ERR_INVALID_ENCODING;
294 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
295 ctxt->sax->error(ctxt->userData,
296 "Input is not proper UTF-8, indicate encoding !\n");
297 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
298 ctxt->input->cur[0], ctxt->input->cur[1],
299 ctxt->input->cur[2], ctxt->input->cur[3]);
300 }
301
302 ctxt->charset = XML_CHAR_ENCODING_8859_1;
303 *len = 1;
304 return((int) *ctxt->input->cur);
305}
306
307/**
Owen Taylor3473f882001-02-23 17:55:21 +0000308 * htmlSkipBlankChars:
309 * @ctxt: the HTML parser context
310 *
311 * skip all blanks character found at that point in the input streams.
312 *
313 * Returns the number of space chars skipped
314 */
315
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000316static int
Owen Taylor3473f882001-02-23 17:55:21 +0000317htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
318 int res = 0;
319
320 while (IS_BLANK(*(ctxt->input->cur))) {
321 if ((*ctxt->input->cur == 0) &&
322 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
323 xmlPopInput(ctxt);
324 } else {
325 if (*(ctxt->input->cur) == '\n') {
326 ctxt->input->line++; ctxt->input->col = 1;
327 } else ctxt->input->col++;
328 ctxt->input->cur++;
329 ctxt->nbChars++;
330 if (*ctxt->input->cur == 0)
331 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
332 }
333 res++;
334 }
335 return(res);
336}
337
338
339
340/************************************************************************
341 * *
342 * The list of HTML elements and their properties *
343 * *
344 ************************************************************************/
345
346/*
347 * Start Tag: 1 means the start tag can be ommited
348 * End Tag: 1 means the end tag can be ommited
349 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000350 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000351 * Depr: this element is deprecated
352 * DTD: 1 means that this element is valid only in the Loose DTD
353 * 2 means that this element is valid only in the Frameset DTD
354 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000355 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000356 */
Daniel Veillard22090732001-07-16 00:06:07 +0000357static const htmlElemDesc
358html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000359{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
360{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
361{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
362{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
363{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
364{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
365{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
366{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
367{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
368{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
369{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
370{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
371{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
372{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
373{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
374{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
375{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
376{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
377{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
378{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
379{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
380{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
381{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
382{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
383{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
384{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
385{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
386{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
387{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
388{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
389{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
390{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
391{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
392{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
393{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
394{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
400{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
401{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
402{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
403{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
404{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
405{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
406{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
407{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
408{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
409{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
410{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
411{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
412{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
413{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
414{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
415{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
416{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
417{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
418{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
419{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
420{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
421{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
422{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
423{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
424{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
425{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
426{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
427{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
428{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
429{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
430{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
431{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
432{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
433{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
434{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
435{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
436{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
437{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
438{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
439{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
440{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
441{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
442{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
443{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
444{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
445{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
446{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
447{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
448{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
449{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000450};
451
452/*
Owen Taylor3473f882001-02-23 17:55:21 +0000453 * start tags that imply the end of current element
454 */
Daniel Veillard22090732001-07-16 00:06:07 +0000455static const char *htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000456"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
457 "dl", "ul", "ol", "menu", "dir", "address", "pre",
458 "listing", "xmp", "head", NULL,
459"head", "p", NULL,
460"title", "p", NULL,
461"body", "head", "style", "link", "title", "p", NULL,
462"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
463 "pre", "listing", "xmp", "head", "li", NULL,
464"hr", "p", "head", NULL,
465"h1", "p", "head", NULL,
466"h2", "p", "head", NULL,
467"h3", "p", "head", NULL,
468"h4", "p", "head", NULL,
469"h5", "p", "head", NULL,
470"h6", "p", "head", NULL,
471"dir", "p", "head", NULL,
472"address", "p", "head", "ul", NULL,
473"pre", "p", "head", "ul", NULL,
474"listing", "p", "head", NULL,
475"xmp", "p", "head", NULL,
476"blockquote", "p", "head", NULL,
477"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
478 "xmp", "head", NULL,
479"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
480 "head", "dd", NULL,
481"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
482 "head", "dt", NULL,
483"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
484 "listing", "xmp", NULL,
485"ol", "p", "head", "ul", NULL,
486"menu", "p", "head", "ul", NULL,
487"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
488"div", "p", "head", NULL,
489"noscript", "p", "head", NULL,
490"center", "font", "b", "i", "p", "head", NULL,
491"a", "a", NULL,
492"caption", "p", NULL,
493"colgroup", "caption", "colgroup", "col", "p", NULL,
494"col", "caption", "col", "p", NULL,
495"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
496 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +0000497"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
498"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000499"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
500"thead", "caption", "col", "colgroup", NULL,
501"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
502 "tbody", "p", NULL,
503"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
504 "tfoot", "tbody", "p", NULL,
505"optgroup", "option", NULL,
506"option", "option", NULL,
507"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
508 "pre", "listing", "xmp", "a", NULL,
509NULL
510};
511
512/*
513 * The list of HTML elements which are supposed not to have
514 * CDATA content and where a p element will be implied
515 *
516 * TODO: extend that list by reading the HTML SGML DtD on
517 * implied paragraph
518 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000519static const char *htmlNoContentElements[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000520 "html",
521 "head",
522 "body",
523 NULL
524};
525
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array is not NULL terminated, its size must be derived with sizeof.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest" */
    "onchange",
    "onselect"
};
551
Daniel Veillarda2bc3682001-05-03 08:27:20 +0000552/*
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000553 * This table is used by the htmlparser to know what to do with
554 * broken html pages. By assigning different priorities to different
555 * elements the parser can decide how to handle extra endtags.
556 * Endtags are only allowed to close elements with lower or equal
557 * priority.
558 */
Daniel Veillarda2bc3682001-05-03 08:27:20 +0000559
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000560typedef struct {
561 const char *name;
562 int priority;
563} elementPriority;
564
Daniel Veillard22090732001-07-16 00:06:07 +0000565static const elementPriority htmlEndPriority[] = {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000566 {"div", 150},
567 {"td", 160},
568 {"th", 160},
569 {"tr", 170},
570 {"thead", 180},
571 {"tbody", 180},
572 {"tfoot", 180},
573 {"table", 190},
574 {"head", 200},
575 {"body", 200},
576 {"html", 220},
577 {NULL, 100} /* Default priority */
578};
Owen Taylor3473f882001-02-23 17:55:21 +0000579
/*
 * Index of the rows of htmlStartClose: each slot points at the first
 * string of a row, unused slots are NULL. Filled by htmlInitAutoClose().
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
582
583/************************************************************************
584 * *
585 * functions to handle HTML specific data *
586 * *
587 ************************************************************************/
588
589/**
590 * htmlInitAutoClose:
591 *
592 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
593 * This is not reentrant. Call xmlInitParser() once before processing in
594 * case of use in multithreaded programs.
595 */
596void
597htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000598 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000599
600 if (htmlStartCloseIndexinitialized) return;
601
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000602 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
603 indx = 0;
604 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
605 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000606 while (htmlStartClose[i] != NULL) i++;
607 i++;
608 }
609 htmlStartCloseIndexinitialized = 1;
610}
611
612/**
613 * htmlTagLookup:
614 * @tag: The tag name in lowercase
615 *
616 * Lookup the HTML tag in the ElementTable
617 *
618 * Returns the related htmlElemDescPtr or NULL if not found.
619 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000620const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000621htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000622 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000623
624 for (i = 0; i < (sizeof(html40ElementTable) /
625 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000626 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000627 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000628 }
629 return(NULL);
630}
631
632/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000633 * htmlGetEndPriority:
634 * @name: The name of the element to look up the priority for.
635 *
636 * Return value: The "endtag" priority.
637 **/
638static int
639htmlGetEndPriority (const xmlChar *name) {
640 int i = 0;
641
642 while ((htmlEndPriority[i].name != NULL) &&
643 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
644 i++;
645
646 return(htmlEndPriority[i].priority);
647}
648
649/**
Owen Taylor3473f882001-02-23 17:55:21 +0000650 * htmlCheckAutoClose:
651 * @newtag: The new tag name
652 * @oldtag: The old tag name
653 *
654 * Checks wether the new tag is one of the registered valid tags for closing old.
655 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
656 *
657 * Returns 0 if no, 1 if yes.
658 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000659static int
Owen Taylor3473f882001-02-23 17:55:21 +0000660htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000661 int i, indx;
662 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000663
664 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
665
666 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000667 for (indx = 0; indx < 100;indx++) {
668 closed = htmlStartCloseIndex[indx];
669 if (closed == NULL) return(0);
670 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000671 }
672
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000673 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000674 i++;
675 while (htmlStartClose[i] != NULL) {
676 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
677 return(1);
678 }
679 i++;
680 }
681 return(0);
682}
683
684/**
685 * htmlAutoCloseOnClose:
686 * @ctxt: an HTML parser context
687 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000688 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000689 *
690 * The HTmL DtD allows an ending tag to implicitely close other tags.
691 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000692static void
Owen Taylor3473f882001-02-23 17:55:21 +0000693htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000694 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000695 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000696 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000697
698#ifdef DEBUG
699 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
700 for (i = 0;i < ctxt->nameNr;i++)
701 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
702#endif
703
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000704 priority = htmlGetEndPriority (newtag);
705
Owen Taylor3473f882001-02-23 17:55:21 +0000706 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000707
Owen Taylor3473f882001-02-23 17:55:21 +0000708 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000709 /*
710 * A missplaced endtagad can only close elements with lower
711 * or equal priority, so if we find an element with higher
712 * priority before we find an element with
713 * matching name, we just ignore this endtag
714 */
715 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000716 }
717 if (i < 0) return;
718
719 while (!xmlStrEqual(newtag, ctxt->name)) {
720 info = htmlTagLookup(ctxt->name);
721 if ((info == NULL) || (info->endTag == 1)) {
722#ifdef DEBUG
723 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
724#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000725 } else if (info->endTag == 3) {
726#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000727 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000728
Daniel Veillard56098d42001-04-24 12:51:09 +0000729#endif
730 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
731 ctxt->sax->error(ctxt->userData,
732 "Opening and ending tag mismatch: %s and %s\n",
733 newtag, ctxt->name);
734 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000735 }
736 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
737 ctxt->sax->endElement(ctxt->userData, ctxt->name);
738 oldname = htmlnamePop(ctxt);
739 if (oldname != NULL) {
740#ifdef DEBUG
741 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
742#endif
743 xmlFree(oldname);
744 }
745 }
746}
747
748/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000749 * htmlAutoCloseOnEnd:
750 * @ctxt: an HTML parser context
751 *
752 * Close all remaining tags at the end of the stream
753 */
754static void
755htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
756 xmlChar *oldname;
757 int i;
758
759 if (ctxt->nameNr == 0)
760 return;
761#ifdef DEBUG
762 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
763#endif
764
765 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
766#ifdef DEBUG
767 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
768#endif
769 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
770 ctxt->sax->endElement(ctxt->userData, ctxt->name);
771 oldname = htmlnamePop(ctxt);
772 if (oldname != NULL) {
773#ifdef DEBUG
774 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
775#endif
776 xmlFree(oldname);
777 }
778 }
779}
780
781/**
Owen Taylor3473f882001-02-23 17:55:21 +0000782 * htmlAutoClose:
783 * @ctxt: an HTML parser context
784 * @newtag: The new tag name or NULL
785 *
786 * The HTmL DtD allows a tag to implicitely close other tags.
787 * The list is kept in htmlStartClose array. This function is
788 * called when a new tag has been detected and generates the
789 * appropriates closes if possible/needed.
790 * If newtag is NULL this mean we are at the end of the resource
791 * and we should check
792 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000793static void
Owen Taylor3473f882001-02-23 17:55:21 +0000794htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
795 xmlChar *oldname;
796 while ((newtag != NULL) && (ctxt->name != NULL) &&
797 (htmlCheckAutoClose(newtag, ctxt->name))) {
798#ifdef DEBUG
799 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
800#endif
801 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
802 ctxt->sax->endElement(ctxt->userData, ctxt->name);
803 oldname = htmlnamePop(ctxt);
804 if (oldname != NULL) {
805#ifdef DEBUG
806 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
807#endif
808 xmlFree(oldname);
809 }
810 }
811 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000812 htmlAutoCloseOnEnd(ctxt);
813 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000814 }
815 while ((newtag == NULL) && (ctxt->name != NULL) &&
816 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
817 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
818 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
819#ifdef DEBUG
820 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
821#endif
822 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
823 ctxt->sax->endElement(ctxt->userData, ctxt->name);
824 oldname = htmlnamePop(ctxt);
825 if (oldname != NULL) {
826#ifdef DEBUG
827 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
828#endif
829 xmlFree(oldname);
830 }
831 }
832
833}
834
835/**
836 * htmlAutoCloseTag:
837 * @doc: the HTML document
838 * @name: The tag name
839 * @elem: the HTML element
840 *
841 * The HTmL DtD allows a tag to implicitely close other tags.
842 * The list is kept in htmlStartClose array. This function checks
843 * if the element or one of it's children would autoclose the
844 * given tag.
845 *
846 * Returns 1 if autoclose, 0 otherwise
847 */
848int
849htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
850 htmlNodePtr child;
851
852 if (elem == NULL) return(1);
853 if (xmlStrEqual(name, elem->name)) return(0);
854 if (htmlCheckAutoClose(elem->name, name)) return(1);
855 child = elem->children;
856 while (child != NULL) {
857 if (htmlAutoCloseTag(doc, name, child)) return(1);
858 child = child->next;
859 }
860 return(0);
861}
862
863/**
864 * htmlIsAutoClosed:
865 * @doc: the HTML document
866 * @elem: the HTML element
867 *
868 * The HTmL DtD allows a tag to implicitely close other tags.
869 * The list is kept in htmlStartClose array. This function checks
870 * if a tag is autoclosed by one of it's child
871 *
872 * Returns 1 if autoclosed, 0 otherwise
873 */
874int
875htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
876 htmlNodePtr child;
877
878 if (elem == NULL) return(1);
879 child = elem->children;
880 while (child != NULL) {
881 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
882 child = child->next;
883 }
884 return(0);
885}
886
887/**
888 * htmlCheckImplied:
889 * @ctxt: an HTML parser context
890 * @newtag: The new tag name
891 *
892 * The HTML DtD allows a tag to exists only implicitely
893 * called when a new tag has been detected and generates the
894 * appropriates implicit tags if missing
895 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000896static void
Owen Taylor3473f882001-02-23 17:55:21 +0000897htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
898 if (!htmlOmittedDefaultValue)
899 return;
900 if (xmlStrEqual(newtag, BAD_CAST"html"))
901 return;
902 if (ctxt->nameNr <= 0) {
903#ifdef DEBUG
904 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
905#endif
906 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
907 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
908 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
909 }
910 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
911 return;
912 if ((ctxt->nameNr <= 1) &&
913 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
914 (xmlStrEqual(newtag, BAD_CAST"style")) ||
915 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
916 (xmlStrEqual(newtag, BAD_CAST"link")) ||
917 (xmlStrEqual(newtag, BAD_CAST"title")) ||
918 (xmlStrEqual(newtag, BAD_CAST"base")))) {
919 /*
920 * dropped OBJECT ... i you put it first BODY will be
921 * assumed !
922 */
923#ifdef DEBUG
924 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
925#endif
926 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
927 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
928 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
929 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
930 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
931 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
932 int i;
933 for (i = 0;i < ctxt->nameNr;i++) {
934 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
935 return;
936 }
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
938 return;
939 }
940 }
941
942#ifdef DEBUG
943 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
944#endif
945 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
946 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
947 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
948 }
949}
950
951/**
952 * htmlCheckParagraph
953 * @ctxt: an HTML parser context
954 *
955 * Check whether a p element need to be implied before inserting
956 * characters in the current element.
957 *
958 * Returns 1 if a paragraph has been inserted, 0 if not and -1
959 * in case of error.
960 */
961
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000962static int
Owen Taylor3473f882001-02-23 17:55:21 +0000963htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
964 const xmlChar *tag;
965 int i;
966
967 if (ctxt == NULL)
968 return(-1);
969 tag = ctxt->name;
970 if (tag == NULL) {
971 htmlAutoClose(ctxt, BAD_CAST"p");
972 htmlCheckImplied(ctxt, BAD_CAST"p");
973 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
974 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
975 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
976 return(1);
977 }
978 if (!htmlOmittedDefaultValue)
979 return(0);
980 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
981 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
982#ifdef DEBUG
983 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
984#endif
985 htmlAutoClose(ctxt, BAD_CAST"p");
986 htmlCheckImplied(ctxt, BAD_CAST"p");
987 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
988 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
989 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
990 return(1);
991 }
992 }
993 return(0);
994}
995
996/**
997 * htmlIsScriptAttribute:
998 * @name: an attribute name
999 *
1000 * Check if an attribute is of content type Script
1001 *
1002 * Returns 1 is the attribute is a script 0 otherwise
1003 */
1004int
1005htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001006 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001007
1008 if (name == NULL)
1009 return(0);
1010 /*
1011 * all script attributes start with 'on'
1012 */
1013 if ((name[0] != 'o') || (name[1] != 'n'))
1014 return(0);
1015 for (i = 0;
1016 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1017 i++) {
1018 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1019 return(1);
1020 }
1021 return(0);
1022}
1023
1024/************************************************************************
1025 * *
1026 * The list of HTML predefined entities *
1027 * *
1028 ************************************************************************/
1029
1030
/*
 * Table of the predefined HTML entities. Each entry holds, in order,
 * the numeric character value, the entity name (without '&' and ';')
 * and a human readable description.
 *
 * The entries MUST stay sorted by increasing value:
 * htmlEntityValueLookup() stops scanning at the first entry whose value
 * is greater than the one being looked up.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * The 4 absolute ones (quot, amp, lt, gt), plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Whether they should be replaced really depends on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references.
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1313
1314/************************************************************************
1315 * *
1316 * Commodity functions to handle entities *
1317 * *
1318 ************************************************************************/
1319
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size.
 * The result of xmlRealloc() goes through a temporary so that, when the
 * allocation fails, the previous block is still reachable and can be
 * released instead of being leaked.
 *
 * On failure the macro makes the *enclosing function* return NULL, so it
 * can only be used inside functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
                               buffer##_size * sizeof(xmlChar));	\
    if (buffer##_tmp == NULL) {						\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1331
1332/**
1333 * htmlEntityLookup:
1334 * @name: the entity name
1335 *
1336 * Lookup the given entity in EntitiesTable
1337 *
1338 * TODO: the linear scan is really ugly, an hash table is really needed.
1339 *
1340 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1341 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001342const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001343htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001344 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001345
1346 for (i = 0;i < (sizeof(html40EntitiesTable)/
1347 sizeof(html40EntitiesTable[0]));i++) {
1348 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1349#ifdef DEBUG
1350 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1351#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001352 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001353 }
1354 }
1355 return(NULL);
1356}
1357
1358/**
1359 * htmlEntityValueLookup:
1360 * @value: the entity's unicode value
1361 *
1362 * Lookup the given entity in EntitiesTable
1363 *
1364 * TODO: the linear scan is really ugly, an hash table is really needed.
1365 *
1366 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1367 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001368const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001369htmlEntityValueLookup(unsigned int value) {
1370 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001371#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001372 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001373#endif
1374
1375 for (i = 0;i < (sizeof(html40EntitiesTable)/
1376 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001377 if (html40EntitiesTable[i].value >= value) {
1378 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001379 break;
1380#ifdef DEBUG
1381 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1382#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001383 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001384 }
1385#ifdef DEBUG
1386 if (lv > html40EntitiesTable[i].value) {
1387 xmlGenericError(xmlGenericErrorContext,
1388 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1389 lv, html40EntitiesTable[i].value);
1390 }
1391 lv = html40EntitiesTable[i].value;
1392#endif
1393 }
1394 return(NULL);
1395}
1396
1397/**
1398 * UTF8ToHtml:
1399 * @out: a pointer to an array of bytes to store the result
1400 * @outlen: the length of @out
1401 * @in: a pointer to an array of UTF-8 chars
1402 * @inlen: the length of @in
1403 *
1404 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1405 * plus HTML entities block of chars out.
1406 *
1407 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1408 * The value of @inlen after return is the number of octets consumed
1409 * as the return value is positive, else unpredictiable.
1410 * The value of @outlen after return is the number of octets consumed.
1411 */
1412int
1413UTF8ToHtml(unsigned char* out, int *outlen,
1414 const unsigned char* in, int *inlen) {
1415 const unsigned char* processed = in;
1416 const unsigned char* outend;
1417 const unsigned char* outstart = out;
1418 const unsigned char* instart = in;
1419 const unsigned char* inend;
1420 unsigned int c, d;
1421 int trailing;
1422
1423 if (in == NULL) {
1424 /*
1425 * initialization nothing to do
1426 */
1427 *outlen = 0;
1428 *inlen = 0;
1429 return(0);
1430 }
1431 inend = in + (*inlen);
1432 outend = out + (*outlen);
1433 while (in < inend) {
1434 d = *in++;
1435 if (d < 0x80) { c= d; trailing= 0; }
1436 else if (d < 0xC0) {
1437 /* trailing byte in leading position */
1438 *outlen = out - outstart;
1439 *inlen = processed - instart;
1440 return(-2);
1441 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1442 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1443 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1444 else {
1445 /* no chance for this in Ascii */
1446 *outlen = out - outstart;
1447 *inlen = processed - instart;
1448 return(-2);
1449 }
1450
1451 if (inend - in < trailing) {
1452 break;
1453 }
1454
1455 for ( ; trailing; trailing--) {
1456 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1457 break;
1458 c <<= 6;
1459 c |= d & 0x3F;
1460 }
1461
1462 /* assertion: c is a single UTF-4 value */
1463 if (c < 0x80) {
1464 if (out + 1 >= outend)
1465 break;
1466 *out++ = c;
1467 } else {
1468 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001469 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001470
1471 /*
1472 * Try to lookup a predefined HTML entity for it
1473 */
1474
1475 ent = htmlEntityValueLookup(c);
1476 if (ent == NULL) {
1477 /* no chance for this in Ascii */
1478 *outlen = out - outstart;
1479 *inlen = processed - instart;
1480 return(-2);
1481 }
1482 len = strlen(ent->name);
1483 if (out + 2 + len >= outend)
1484 break;
1485 *out++ = '&';
1486 memcpy(out, ent->name, len);
1487 out += len;
1488 *out++ = ';';
1489 }
1490 processed = in;
1491 }
1492 *outlen = out - outstart;
1493 *inlen = processed - instart;
1494 return(0);
1495}
1496
1497/**
1498 * htmlEncodeEntities:
1499 * @out: a pointer to an array of bytes to store the result
1500 * @outlen: the length of @out
1501 * @in: a pointer to an array of UTF-8 chars
1502 * @inlen: the length of @in
1503 * @quoteChar: the quote character to escape (' or ") or zero.
1504 *
1505 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1506 * plus HTML entities block of chars out.
1507 *
1508 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1509 * The value of @inlen after return is the number of octets consumed
1510 * as the return value is positive, else unpredictiable.
1511 * The value of @outlen after return is the number of octets consumed.
1512 */
1513int
1514htmlEncodeEntities(unsigned char* out, int *outlen,
1515 const unsigned char* in, int *inlen, int quoteChar) {
1516 const unsigned char* processed = in;
1517 const unsigned char* outend = out + (*outlen);
1518 const unsigned char* outstart = out;
1519 const unsigned char* instart = in;
1520 const unsigned char* inend = in + (*inlen);
1521 unsigned int c, d;
1522 int trailing;
1523
1524 while (in < inend) {
1525 d = *in++;
1526 if (d < 0x80) { c= d; trailing= 0; }
1527 else if (d < 0xC0) {
1528 /* trailing byte in leading position */
1529 *outlen = out - outstart;
1530 *inlen = processed - instart;
1531 return(-2);
1532 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1533 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1534 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1535 else {
1536 /* no chance for this in Ascii */
1537 *outlen = out - outstart;
1538 *inlen = processed - instart;
1539 return(-2);
1540 }
1541
1542 if (inend - in < trailing)
1543 break;
1544
1545 while (trailing--) {
1546 if (((d= *in++) & 0xC0) != 0x80) {
1547 *outlen = out - outstart;
1548 *inlen = processed - instart;
1549 return(-2);
1550 }
1551 c <<= 6;
1552 c |= d & 0x3F;
1553 }
1554
1555 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001556 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1557 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001558 if (out >= outend)
1559 break;
1560 *out++ = c;
1561 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001562 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001563 const char *cp;
1564 char nbuf[16];
1565 int len;
1566
1567 /*
1568 * Try to lookup a predefined HTML entity for it
1569 */
1570 ent = htmlEntityValueLookup(c);
1571 if (ent == NULL) {
1572 sprintf(nbuf, "#%u", c);
1573 cp = nbuf;
1574 }
1575 else
1576 cp = ent->name;
1577 len = strlen(cp);
1578 if (out + 2 + len > outend)
1579 break;
1580 *out++ = '&';
1581 memcpy(out, cp, len);
1582 out += len;
1583 *out++ = ';';
1584 }
1585 processed = in;
1586 }
1587 *outlen = out - outstart;
1588 *inlen = processed - instart;
1589 return(0);
1590}
1591
1592/**
1593 * htmlDecodeEntities:
1594 * @ctxt: the parser context
1595 * @len: the len to decode (in bytes !), -1 for no size limit
1596 * @end: an end marker xmlChar, 0 if none
1597 * @end2: an end marker xmlChar, 0 if none
1598 * @end3: an end marker xmlChar, 0 if none
1599 *
1600 * Subtitute the HTML entities by their value
1601 *
1602 * DEPRECATED !!!!
1603 *
1604 * Returns A newly allocated string with the substitution done. The caller
1605 * must deallocate it !
1606 */
1607xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001608htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1609 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001610 static int deprecated = 0;
1611 if (!deprecated) {
1612 xmlGenericError(xmlGenericErrorContext,
1613 "htmlDecodeEntities() deprecated function reached\n");
1614 deprecated = 1;
1615 }
1616 return(NULL);
1617#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001618 xmlChar *name = NULL;
1619 xmlChar *buffer = NULL;
1620 unsigned int buffer_size = 0;
1621 unsigned int nbchars = 0;
1622 htmlEntityDescPtr ent;
1623 unsigned int max = (unsigned int) len;
1624 int c,l;
1625
1626 if (ctxt->depth > 40) {
1627 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1628 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1629 ctxt->sax->error(ctxt->userData,
1630 "Detected entity reference loop\n");
1631 ctxt->wellFormed = 0;
1632 ctxt->disableSAX = 1;
1633 return(NULL);
1634 }
1635
1636 /*
1637 * allocate a translation buffer.
1638 */
1639 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1640 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1641 if (buffer == NULL) {
1642 perror("xmlDecodeEntities: malloc failed");
1643 return(NULL);
1644 }
1645
1646 /*
1647 * Ok loop until we reach one of the ending char or a size limit.
1648 */
1649 c = CUR_CHAR(l);
1650 while ((nbchars < max) && (c != end) &&
1651 (c != end2) && (c != end3)) {
1652
1653 if (c == 0) break;
1654 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1655 int val = htmlParseCharRef(ctxt);
1656 COPY_BUF(0,buffer,nbchars,val);
1657 NEXTL(l);
1658 } else if ((c == '&') && (ctxt->token != '&')) {
1659 ent = htmlParseEntityRef(ctxt, &name);
1660 if (name != NULL) {
1661 if (ent != NULL) {
1662 int val = ent->value;
1663 COPY_BUF(0,buffer,nbchars,val);
1664 NEXTL(l);
1665 } else {
1666 const xmlChar *cur = name;
1667
1668 buffer[nbchars++] = '&';
1669 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1670 growBuffer(buffer);
1671 }
1672 while (*cur != 0) {
1673 buffer[nbchars++] = *cur++;
1674 }
1675 buffer[nbchars++] = ';';
1676 }
1677 }
1678 } else {
1679 COPY_BUF(l,buffer,nbchars,c);
1680 NEXTL(l);
1681 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1682 growBuffer(buffer);
1683 }
1684 }
1685 c = CUR_CHAR(l);
1686 }
1687 buffer[nbchars++] = 0;
1688 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001689#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001690}
1691
1692/************************************************************************
1693 * *
1694 * Commodity functions to handle streams *
1695 * *
1696 ************************************************************************/
1697
1698/**
Owen Taylor3473f882001-02-23 17:55:21 +00001699 * htmlNewInputStream:
1700 * @ctxt: an HTML parser context
1701 *
1702 * Create a new input stream structure
1703 * Returns the new input stream or NULL
1704 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001705static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001706htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1707 htmlParserInputPtr input;
1708
1709 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1710 if (input == NULL) {
1711 ctxt->errNo = XML_ERR_NO_MEMORY;
1712 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1713 ctxt->sax->error(ctxt->userData,
1714 "malloc: couldn't allocate a new input stream\n");
1715 return(NULL);
1716 }
1717 memset(input, 0, sizeof(htmlParserInput));
1718 input->filename = NULL;
1719 input->directory = NULL;
1720 input->base = NULL;
1721 input->cur = NULL;
1722 input->buf = NULL;
1723 input->line = 1;
1724 input->col = 1;
1725 input->buf = NULL;
1726 input->free = NULL;
1727 input->version = NULL;
1728 input->consumed = 0;
1729 input->length = 0;
1730 return(input);
1731}
1732
1733
1734/************************************************************************
1735 * *
1736 * Commodity functions, cleanup needed ? *
1737 * *
1738 ************************************************************************/
1739
1740/**
1741 * areBlanks:
1742 * @ctxt: an HTML parser context
1743 * @str: a xmlChar *
1744 * @len: the size of @str
1745 *
1746 * Is this a sequence of blank chars that one can ignore ?
1747 *
1748 * Returns 1 if ignorable 0 otherwise.
1749 */
1750
1751static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1752 int i;
1753 xmlNodePtr lastChild;
1754
1755 for (i = 0;i < len;i++)
1756 if (!(IS_BLANK(str[i]))) return(0);
1757
1758 if (CUR == 0) return(1);
1759 if (CUR != '<') return(0);
1760 if (ctxt->name == NULL)
1761 return(1);
1762 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1763 return(1);
1764 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1765 return(1);
1766 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1767 return(1);
1768 if (ctxt->node == NULL) return(0);
1769 lastChild = xmlGetLastChild(ctxt->node);
1770 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001771 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1772 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001773 } else if (xmlNodeIsText(lastChild)) {
1774 return(0);
1775 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1776 return(0);
1777 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1778 return(0);
1779 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1780 return(0);
1781 }
1782 return(1);
1783}
1784
1785/**
Owen Taylor3473f882001-02-23 17:55:21 +00001786 * htmlNewDocNoDtD:
1787 * @URI: URI for the dtd, or NULL
1788 * @ExternalID: the external ID of the DTD, or NULL
1789 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001790 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1791 * are NULL
1792 *
Owen Taylor3473f882001-02-23 17:55:21 +00001793 * Returns a new document, do not intialize the DTD if not provided
1794 */
1795htmlDocPtr
1796htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1797 xmlDocPtr cur;
1798
1799 /*
1800 * Allocate a new document and fill the fields.
1801 */
1802 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1803 if (cur == NULL) {
1804 xmlGenericError(xmlGenericErrorContext,
1805 "xmlNewDoc : malloc failed\n");
1806 return(NULL);
1807 }
1808 memset(cur, 0, sizeof(xmlDoc));
1809
1810 cur->type = XML_HTML_DOCUMENT_NODE;
1811 cur->version = NULL;
1812 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001813 cur->doc = cur;
1814 cur->name = NULL;
1815 cur->children = NULL;
1816 cur->extSubset = NULL;
1817 cur->oldNs = NULL;
1818 cur->encoding = NULL;
1819 cur->standalone = 1;
1820 cur->compression = 0;
1821 cur->ids = NULL;
1822 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001823 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001824 if ((ExternalID != NULL) ||
1825 (URI != NULL))
1826 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001827 return(cur);
1828}
1829
1830/**
1831 * htmlNewDoc:
1832 * @URI: URI for the dtd, or NULL
1833 * @ExternalID: the external ID of the DTD, or NULL
1834 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001835 * Creates a new HTML document
1836 *
Owen Taylor3473f882001-02-23 17:55:21 +00001837 * Returns a new document
1838 */
1839htmlDocPtr
1840htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1841 if ((URI == NULL) && (ExternalID == NULL))
1842 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001843 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1844 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001845
1846 return(htmlNewDocNoDtD(URI, ExternalID));
1847}
1848
1849
1850/************************************************************************
1851 * *
1852 * The parser itself *
1853 * Relates to http://www.w3.org/TR/html40 *
1854 * *
1855 ************************************************************************/
1856
1857/************************************************************************
1858 * *
1859 * The parser itself *
1860 * *
1861 ************************************************************************/
1862
1863/**
1864 * htmlParseHTMLName:
1865 * @ctxt: an HTML parser context
1866 *
1867 * parse an HTML tag or attribute name, note that we convert it to lowercase
1868 * since HTML names are not case-sensitive.
1869 *
1870 * Returns the Tag Name parsed or NULL
1871 */
1872
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001873static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001874htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1875 xmlChar *ret = NULL;
1876 int i = 0;
1877 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1878
1879 if (!IS_LETTER(CUR) && (CUR != '_') &&
1880 (CUR != ':')) return(NULL);
1881
1882 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1883 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1884 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1885 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1886 else loc[i] = CUR;
1887 i++;
1888
1889 NEXT;
1890 }
1891
1892 ret = xmlStrndup(loc, i);
1893
1894 return(ret);
1895}
1896
1897/**
1898 * htmlParseName:
1899 * @ctxt: an HTML parser context
1900 *
1901 * parse an HTML name, this routine is case sensistive.
1902 *
1903 * Returns the Name parsed or NULL
1904 */
1905
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001906static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001907htmlParseName(htmlParserCtxtPtr ctxt) {
1908 xmlChar buf[HTML_MAX_NAMELEN];
1909 int len = 0;
1910
1911 GROW;
1912 if (!IS_LETTER(CUR) && (CUR != '_')) {
1913 return(NULL);
1914 }
1915
1916 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1917 (CUR == '.') || (CUR == '-') ||
1918 (CUR == '_') || (CUR == ':') ||
1919 (IS_COMBINING(CUR)) ||
1920 (IS_EXTENDER(CUR))) {
1921 buf[len++] = CUR;
1922 NEXT;
1923 if (len >= HTML_MAX_NAMELEN) {
1924 xmlGenericError(xmlGenericErrorContext,
1925 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1926 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1927 (CUR == '.') || (CUR == '-') ||
1928 (CUR == '_') || (CUR == ':') ||
1929 (IS_COMBINING(CUR)) ||
1930 (IS_EXTENDER(CUR)))
1931 NEXT;
1932 break;
1933 }
1934 }
1935 return(xmlStrndup(buf, len));
1936}
1937
1938/**
1939 * htmlParseHTMLAttribute:
1940 * @ctxt: an HTML parser context
1941 * @stop: a char stop value
1942 *
1943 * parse an HTML attribute value till the stop (quote), if
1944 * stop is 0 then it stops at the first space
1945 *
1946 * Returns the attribute parsed or NULL
1947 */
1948
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001949static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001950htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1951 xmlChar *buffer = NULL;
1952 int buffer_size = 0;
1953 xmlChar *out = NULL;
1954 xmlChar *name = NULL;
1955
1956 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001957 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001958
1959 /*
1960 * allocate a translation buffer.
1961 */
1962 buffer_size = HTML_PARSER_BUFFER_SIZE;
1963 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1964 if (buffer == NULL) {
1965 perror("htmlParseHTMLAttribute: malloc failed");
1966 return(NULL);
1967 }
1968 out = buffer;
1969
1970 /*
1971 * Ok loop until we reach one of the ending chars
1972 */
1973 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1974 if ((stop == 0) && (IS_BLANK(CUR))) break;
1975 if (CUR == '&') {
1976 if (NXT(1) == '#') {
1977 unsigned int c;
1978 int bits;
1979
1980 c = htmlParseCharRef(ctxt);
1981 if (c < 0x80)
1982 { *out++ = c; bits= -6; }
1983 else if (c < 0x800)
1984 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1985 else if (c < 0x10000)
1986 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1987 else
1988 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1989
1990 for ( ; bits >= 0; bits-= 6) {
1991 *out++ = ((c >> bits) & 0x3F) | 0x80;
1992 }
1993 } else {
1994 ent = htmlParseEntityRef(ctxt, &name);
1995 if (name == NULL) {
1996 *out++ = '&';
1997 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001998 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001999
2000 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002001 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002002 }
2003 } else if (ent == NULL) {
2004 *out++ = '&';
2005 cur = name;
2006 while (*cur != 0) {
2007 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002008 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002009
2010 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002011 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002012 }
2013 *out++ = *cur++;
2014 }
2015 xmlFree(name);
2016 } else {
2017 unsigned int c;
2018 int bits;
2019
2020 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002021 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002022
2023 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002024 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002025 }
2026 c = (xmlChar)ent->value;
2027 if (c < 0x80)
2028 { *out++ = c; bits= -6; }
2029 else if (c < 0x800)
2030 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2031 else if (c < 0x10000)
2032 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2033 else
2034 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2035
2036 for ( ; bits >= 0; bits-= 6) {
2037 *out++ = ((c >> bits) & 0x3F) | 0x80;
2038 }
2039 xmlFree(name);
2040 }
2041 }
2042 } else {
2043 unsigned int c;
2044 int bits, l;
2045
2046 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002047 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002048
2049 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002050 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002051 }
2052 c = CUR_CHAR(l);
2053 if (c < 0x80)
2054 { *out++ = c; bits= -6; }
2055 else if (c < 0x800)
2056 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2057 else if (c < 0x10000)
2058 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2059 else
2060 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2061
2062 for ( ; bits >= 0; bits-= 6) {
2063 *out++ = ((c >> bits) & 0x3F) | 0x80;
2064 }
2065 NEXT;
2066 }
2067 }
2068 *out++ = 0;
2069 return(buffer);
2070}
2071
2072/**
Owen Taylor3473f882001-02-23 17:55:21 +00002073 * htmlParseEntityRef:
2074 * @ctxt: an HTML parser context
2075 * @str: location to store the entity name
2076 *
2077 * parse an HTML ENTITY references
2078 *
2079 * [68] EntityRef ::= '&' Name ';'
2080 *
2081 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2082 * if non-NULL *str will have to be freed by the caller.
2083 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002084const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002085htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2086 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002087 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002088 *str = NULL;
2089
2090 if (CUR == '&') {
2091 NEXT;
2092 name = htmlParseName(ctxt);
2093 if (name == NULL) {
2094 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2095 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2096 ctxt->wellFormed = 0;
2097 } else {
2098 GROW;
2099 if (CUR == ';') {
2100 *str = name;
2101
2102 /*
2103 * Lookup the entity in the table.
2104 */
2105 ent = htmlEntityLookup(name);
2106 if (ent != NULL) /* OK that's ugly !!! */
2107 NEXT;
2108 } else {
2109 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2110 ctxt->sax->error(ctxt->userData,
2111 "htmlParseEntityRef: expecting ';'\n");
2112 *str = name;
2113 }
2114 }
2115 }
2116 return(ent);
2117}
2118
2119/**
2120 * htmlParseAttValue:
2121 * @ctxt: an HTML parser context
2122 *
2123 * parse a value for an attribute
2124 * Note: the parser won't do substitution of entities here, this
2125 * will be handled later in xmlStringGetNodeList, unless it was
2126 * asked for ctxt->replaceEntities != 0
2127 *
2128 * Returns the AttValue parsed or NULL.
2129 */
2130
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002131static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002132htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2133 xmlChar *ret = NULL;
2134
2135 if (CUR == '"') {
2136 NEXT;
2137 ret = htmlParseHTMLAttribute(ctxt, '"');
2138 if (CUR != '"') {
2139 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2140 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2141 ctxt->wellFormed = 0;
2142 } else
2143 NEXT;
2144 } else if (CUR == '\'') {
2145 NEXT;
2146 ret = htmlParseHTMLAttribute(ctxt, '\'');
2147 if (CUR != '\'') {
2148 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2149 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2150 ctxt->wellFormed = 0;
2151 } else
2152 NEXT;
2153 } else {
2154 /*
2155 * That's an HTMLism, the attribute value may not be quoted
2156 */
2157 ret = htmlParseHTMLAttribute(ctxt, 0);
2158 if (ret == NULL) {
2159 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2160 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2161 ctxt->wellFormed = 0;
2162 }
2163 }
2164 return(ret);
2165}
2166
2167/**
2168 * htmlParseSystemLiteral:
2169 * @ctxt: an HTML parser context
2170 *
2171 * parse an HTML Literal
2172 *
2173 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2174 *
2175 * Returns the SystemLiteral parsed or NULL
2176 */
2177
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002178static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002179htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2180 const xmlChar *q;
2181 xmlChar *ret = NULL;
2182
2183 if (CUR == '"') {
2184 NEXT;
2185 q = CUR_PTR;
2186 while ((IS_CHAR(CUR)) && (CUR != '"'))
2187 NEXT;
2188 if (!IS_CHAR(CUR)) {
2189 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2190 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2191 ctxt->wellFormed = 0;
2192 } else {
2193 ret = xmlStrndup(q, CUR_PTR - q);
2194 NEXT;
2195 }
2196 } else if (CUR == '\'') {
2197 NEXT;
2198 q = CUR_PTR;
2199 while ((IS_CHAR(CUR)) && (CUR != '\''))
2200 NEXT;
2201 if (!IS_CHAR(CUR)) {
2202 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2203 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2204 ctxt->wellFormed = 0;
2205 } else {
2206 ret = xmlStrndup(q, CUR_PTR - q);
2207 NEXT;
2208 }
2209 } else {
2210 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2211 ctxt->sax->error(ctxt->userData,
2212 "SystemLiteral \" or ' expected\n");
2213 ctxt->wellFormed = 0;
2214 }
2215
2216 return(ret);
2217}
2218
2219/**
2220 * htmlParsePubidLiteral:
2221 * @ctxt: an HTML parser context
2222 *
2223 * parse an HTML public literal
2224 *
2225 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2226 *
2227 * Returns the PubidLiteral parsed or NULL.
2228 */
2229
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002230static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002231htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2232 const xmlChar *q;
2233 xmlChar *ret = NULL;
2234 /*
2235 * Name ::= (Letter | '_') (NameChar)*
2236 */
2237 if (CUR == '"') {
2238 NEXT;
2239 q = CUR_PTR;
2240 while (IS_PUBIDCHAR(CUR)) NEXT;
2241 if (CUR != '"') {
2242 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2243 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2244 ctxt->wellFormed = 0;
2245 } else {
2246 ret = xmlStrndup(q, CUR_PTR - q);
2247 NEXT;
2248 }
2249 } else if (CUR == '\'') {
2250 NEXT;
2251 q = CUR_PTR;
2252 while ((IS_LETTER(CUR)) && (CUR != '\''))
2253 NEXT;
2254 if (!IS_LETTER(CUR)) {
2255 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2256 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2257 ctxt->wellFormed = 0;
2258 } else {
2259 ret = xmlStrndup(q, CUR_PTR - q);
2260 NEXT;
2261 }
2262 } else {
2263 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2264 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2265 ctxt->wellFormed = 0;
2266 }
2267
2268 return(ret);
2269}
2270
2271/**
2272 * htmlParseScript:
2273 * @ctxt: an HTML parser context
2274 *
2275 * parse the content of an HTML SCRIPT or STYLE element
2276 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2277 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2278 * http://www.w3.org/TR/html4/types.html#type-script
2279 * http://www.w3.org/TR/html4/types.html#h-6.15
2280 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2281 *
2282 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2283 * element and the value of intrinsic event attributes. User agents must
2284 * not evaluate script data as HTML markup but instead must pass it on as
2285 * data to a script engine.
2286 * NOTES:
2287 * - The content is passed like CDATA
2288 * - the attributes for style and scripting "onXXX" are also described
2289 * as CDATA but SGML allows entities references in attributes so their
2290 * processing is identical as other attributes
2291 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002292static void
Owen Taylor3473f882001-02-23 17:55:21 +00002293htmlParseScript(htmlParserCtxtPtr ctxt) {
2294 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2295 int nbchar = 0;
2296 xmlChar cur;
2297
2298 SHRINK;
2299 cur = CUR;
2300 while (IS_CHAR(cur)) {
2301 if ((cur == '<') && (NXT(1) == '/')) {
2302 /*
2303 * One should break here, the specification is clear:
2304 * Authors should therefore escape "</" within the content.
2305 * Escape mechanisms are specific to each scripting or
2306 * style sheet language.
2307 */
2308 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2309 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2310 break; /* while */
2311 }
2312 buf[nbchar++] = cur;
2313 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2314 if (ctxt->sax->cdataBlock!= NULL) {
2315 /*
2316 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2317 */
2318 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2319 }
2320 nbchar = 0;
2321 }
2322 NEXT;
2323 cur = CUR;
2324 }
2325 if (!(IS_CHAR(cur))) {
2326 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2327 ctxt->sax->error(ctxt->userData,
2328 "Invalid char in CDATA 0x%X\n", cur);
2329 ctxt->wellFormed = 0;
2330 NEXT;
2331 }
2332
2333 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2334 if (ctxt->sax->cdataBlock!= NULL) {
2335 /*
2336 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2337 */
2338 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2339 }
2340 }
2341}
2342
2343
2344/**
2345 * htmlParseCharData:
2346 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002347 *
2348 * parse a CharData section.
2349 * if we are within a CDATA section ']]>' marks an end of section.
2350 *
2351 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2352 */
2353
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002354static void
2355htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002356 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2357 int nbchar = 0;
2358 int cur, l;
2359
2360 SHRINK;
2361 cur = CUR_CHAR(l);
2362 while (((cur != '<') || (ctxt->token == '<')) &&
2363 ((cur != '&') || (ctxt->token == '&')) &&
2364 (IS_CHAR(cur))) {
2365 COPY_BUF(l,buf,nbchar,cur);
2366 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2367 /*
2368 * Ok the segment is to be consumed as chars.
2369 */
2370 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2371 if (areBlanks(ctxt, buf, nbchar)) {
2372 if (ctxt->sax->ignorableWhitespace != NULL)
2373 ctxt->sax->ignorableWhitespace(ctxt->userData,
2374 buf, nbchar);
2375 } else {
2376 htmlCheckParagraph(ctxt);
2377 if (ctxt->sax->characters != NULL)
2378 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2379 }
2380 }
2381 nbchar = 0;
2382 }
2383 NEXTL(l);
2384 cur = CUR_CHAR(l);
2385 }
2386 if (nbchar != 0) {
2387 /*
2388 * Ok the segment is to be consumed as chars.
2389 */
2390 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2391 if (areBlanks(ctxt, buf, nbchar)) {
2392 if (ctxt->sax->ignorableWhitespace != NULL)
2393 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2394 } else {
2395 htmlCheckParagraph(ctxt);
2396 if (ctxt->sax->characters != NULL)
2397 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2398 }
2399 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002400 } else {
2401 /*
2402 * Loop detection
2403 */
2404 if (cur == 0)
2405 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002406 }
2407}
2408
2409/**
2410 * htmlParseExternalID:
2411 * @ctxt: an HTML parser context
2412 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002413 *
2414 * Parse an External ID or a Public ID
2415 *
Owen Taylor3473f882001-02-23 17:55:21 +00002416 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2417 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2418 *
2419 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2420 *
2421 * Returns the function returns SystemLiteral and in the second
2422 * case publicID receives PubidLiteral, is strict is off
2423 * it is possible to return NULL and have publicID set.
2424 */
2425
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002426static xmlChar *
2427htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002428 xmlChar *URI = NULL;
2429
2430 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2431 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2432 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2433 SKIP(6);
2434 if (!IS_BLANK(CUR)) {
2435 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2436 ctxt->sax->error(ctxt->userData,
2437 "Space required after 'SYSTEM'\n");
2438 ctxt->wellFormed = 0;
2439 }
2440 SKIP_BLANKS;
2441 URI = htmlParseSystemLiteral(ctxt);
2442 if (URI == NULL) {
2443 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2444 ctxt->sax->error(ctxt->userData,
2445 "htmlParseExternalID: SYSTEM, no URI\n");
2446 ctxt->wellFormed = 0;
2447 }
2448 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2449 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2450 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2451 SKIP(6);
2452 if (!IS_BLANK(CUR)) {
2453 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2454 ctxt->sax->error(ctxt->userData,
2455 "Space required after 'PUBLIC'\n");
2456 ctxt->wellFormed = 0;
2457 }
2458 SKIP_BLANKS;
2459 *publicID = htmlParsePubidLiteral(ctxt);
2460 if (*publicID == NULL) {
2461 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2462 ctxt->sax->error(ctxt->userData,
2463 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2464 ctxt->wellFormed = 0;
2465 }
2466 SKIP_BLANKS;
2467 if ((CUR == '"') || (CUR == '\'')) {
2468 URI = htmlParseSystemLiteral(ctxt);
2469 }
2470 }
2471 return(URI);
2472}
2473
2474/**
2475 * htmlParseComment:
2476 * @ctxt: an HTML parser context
2477 *
2478 * Parse an XML (SGML) comment <!-- .... -->
2479 *
2480 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2481 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002482static void
Owen Taylor3473f882001-02-23 17:55:21 +00002483htmlParseComment(htmlParserCtxtPtr ctxt) {
2484 xmlChar *buf = NULL;
2485 int len;
2486 int size = HTML_PARSER_BUFFER_SIZE;
2487 int q, ql;
2488 int r, rl;
2489 int cur, l;
2490 xmlParserInputState state;
2491
2492 /*
2493 * Check that there is a comment right here.
2494 */
2495 if ((RAW != '<') || (NXT(1) != '!') ||
2496 (NXT(2) != '-') || (NXT(3) != '-')) return;
2497
2498 state = ctxt->instate;
2499 ctxt->instate = XML_PARSER_COMMENT;
2500 SHRINK;
2501 SKIP(4);
2502 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2503 if (buf == NULL) {
2504 xmlGenericError(xmlGenericErrorContext,
2505 "malloc of %d byte failed\n", size);
2506 ctxt->instate = state;
2507 return;
2508 }
2509 q = CUR_CHAR(ql);
2510 NEXTL(ql);
2511 r = CUR_CHAR(rl);
2512 NEXTL(rl);
2513 cur = CUR_CHAR(l);
2514 len = 0;
2515 while (IS_CHAR(cur) &&
2516 ((cur != '>') ||
2517 (r != '-') || (q != '-'))) {
2518 if (len + 5 >= size) {
2519 size *= 2;
2520 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2521 if (buf == NULL) {
2522 xmlGenericError(xmlGenericErrorContext,
2523 "realloc of %d byte failed\n", size);
2524 ctxt->instate = state;
2525 return;
2526 }
2527 }
2528 COPY_BUF(ql,buf,len,q);
2529 q = r;
2530 ql = rl;
2531 r = cur;
2532 rl = l;
2533 NEXTL(l);
2534 cur = CUR_CHAR(l);
2535 if (cur == 0) {
2536 SHRINK;
2537 GROW;
2538 cur = CUR_CHAR(l);
2539 }
2540 }
2541 buf[len] = 0;
2542 if (!IS_CHAR(cur)) {
2543 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2544 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2545 ctxt->sax->error(ctxt->userData,
2546 "Comment not terminated \n<!--%.50s\n", buf);
2547 ctxt->wellFormed = 0;
2548 xmlFree(buf);
2549 } else {
2550 NEXT;
2551 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2552 (!ctxt->disableSAX))
2553 ctxt->sax->comment(ctxt->userData, buf);
2554 xmlFree(buf);
2555 }
2556 ctxt->instate = state;
2557}
2558
2559/**
2560 * htmlParseCharRef:
2561 * @ctxt: an HTML parser context
2562 *
2563 * parse Reference declarations
2564 *
2565 * [66] CharRef ::= '&#' [0-9]+ ';' |
2566 * '&#x' [0-9a-fA-F]+ ';'
2567 *
2568 * Returns the value parsed (as an int)
2569 */
2570int
2571htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2572 int val = 0;
2573
2574 if ((CUR == '&') && (NXT(1) == '#') &&
2575 (NXT(2) == 'x')) {
2576 SKIP(3);
2577 while (CUR != ';') {
2578 if ((CUR >= '0') && (CUR <= '9'))
2579 val = val * 16 + (CUR - '0');
2580 else if ((CUR >= 'a') && (CUR <= 'f'))
2581 val = val * 16 + (CUR - 'a') + 10;
2582 else if ((CUR >= 'A') && (CUR <= 'F'))
2583 val = val * 16 + (CUR - 'A') + 10;
2584 else {
2585 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2586 ctxt->sax->error(ctxt->userData,
2587 "htmlParseCharRef: invalid hexadecimal value\n");
2588 ctxt->wellFormed = 0;
2589 return(0);
2590 }
2591 NEXT;
2592 }
2593 if (CUR == ';')
2594 NEXT;
2595 } else if ((CUR == '&') && (NXT(1) == '#')) {
2596 SKIP(2);
2597 while (CUR != ';') {
2598 if ((CUR >= '0') && (CUR <= '9'))
2599 val = val * 10 + (CUR - '0');
2600 else {
2601 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2602 ctxt->sax->error(ctxt->userData,
2603 "htmlParseCharRef: invalid decimal value\n");
2604 ctxt->wellFormed = 0;
2605 return(0);
2606 }
2607 NEXT;
2608 }
2609 if (CUR == ';')
2610 NEXT;
2611 } else {
2612 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2613 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2614 ctxt->wellFormed = 0;
2615 }
2616 /*
2617 * Check the value IS_CHAR ...
2618 */
2619 if (IS_CHAR(val)) {
2620 return(val);
2621 } else {
2622 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2623 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2624 val);
2625 ctxt->wellFormed = 0;
2626 }
2627 return(0);
2628}
2629
2630
2631/**
2632 * htmlParseDocTypeDecl :
2633 * @ctxt: an HTML parser context
2634 *
2635 * parse a DOCTYPE declaration
2636 *
2637 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2638 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2639 */
2640
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002641static void
Owen Taylor3473f882001-02-23 17:55:21 +00002642htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2643 xmlChar *name;
2644 xmlChar *ExternalID = NULL;
2645 xmlChar *URI = NULL;
2646
2647 /*
2648 * We know that '<!DOCTYPE' has been detected.
2649 */
2650 SKIP(9);
2651
2652 SKIP_BLANKS;
2653
2654 /*
2655 * Parse the DOCTYPE name.
2656 */
2657 name = htmlParseName(ctxt);
2658 if (name == NULL) {
2659 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2660 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2661 ctxt->wellFormed = 0;
2662 }
2663 /*
2664 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2665 */
2666
2667 SKIP_BLANKS;
2668
2669 /*
2670 * Check for SystemID and ExternalID
2671 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002672 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002673 SKIP_BLANKS;
2674
2675 /*
2676 * We should be at the end of the DOCTYPE declaration.
2677 */
2678 if (CUR != '>') {
2679 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002680 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002681 ctxt->wellFormed = 0;
2682 /* We shouldn't try to resynchronize ... */
2683 }
2684 NEXT;
2685
2686 /*
2687 * Create or update the document accordingly to the DOCTYPE
2688 */
2689 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2690 (!ctxt->disableSAX))
2691 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2692
2693 /*
2694 * Cleanup, since we don't use all those identifiers
2695 */
2696 if (URI != NULL) xmlFree(URI);
2697 if (ExternalID != NULL) xmlFree(ExternalID);
2698 if (name != NULL) xmlFree(name);
2699}
2700
2701/**
2702 * htmlParseAttribute:
2703 * @ctxt: an HTML parser context
2704 * @value: a xmlChar ** used to store the value of the attribute
2705 *
2706 * parse an attribute
2707 *
2708 * [41] Attribute ::= Name Eq AttValue
2709 *
2710 * [25] Eq ::= S? '=' S?
2711 *
2712 * With namespace:
2713 *
2714 * [NS 11] Attribute ::= QName Eq AttValue
2715 *
2716 * Also the case QName == xmlns:??? is handled independently as a namespace
2717 * definition.
2718 *
2719 * Returns the attribute name, and the value in *value.
2720 */
2721
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002722static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002723htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2724 xmlChar *name, *val = NULL;
2725
2726 *value = NULL;
2727 name = htmlParseHTMLName(ctxt);
2728 if (name == NULL) {
2729 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2730 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2731 ctxt->wellFormed = 0;
2732 return(NULL);
2733 }
2734
2735 /*
2736 * read the value
2737 */
2738 SKIP_BLANKS;
2739 if (CUR == '=') {
2740 NEXT;
2741 SKIP_BLANKS;
2742 val = htmlParseAttValue(ctxt);
2743 /******
2744 } else {
2745 * TODO : some attribute must have values, some may not
2746 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2747 ctxt->sax->warning(ctxt->userData,
2748 "No value for attribute %s\n", name); */
2749 }
2750
2751 *value = val;
2752 return(name);
2753}
2754
2755/**
2756 * htmlCheckEncoding:
2757 * @ctxt: an HTML parser context
2758 * @attvalue: the attribute value
2759 *
2760 * Checks an http-equiv attribute from a Meta tag to detect
2761 * the encoding
2762 * If a new encoding is detected the parser is switched to decode
2763 * it and pass UTF8
2764 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002765static void
Owen Taylor3473f882001-02-23 17:55:21 +00002766htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2767 const xmlChar *encoding;
2768
2769 if ((ctxt == NULL) || (attvalue == NULL))
2770 return;
2771
2772 /* do not change encoding */
2773 if (ctxt->input->encoding != NULL)
2774 return;
2775
2776 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2777 if (encoding != NULL) {
2778 encoding += 8;
2779 } else {
2780 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2781 if (encoding != NULL)
2782 encoding += 9;
2783 }
2784 if (encoding != NULL) {
2785 xmlCharEncoding enc;
2786 xmlCharEncodingHandlerPtr handler;
2787
2788 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2789
2790 if (ctxt->input->encoding != NULL)
2791 xmlFree((xmlChar *) ctxt->input->encoding);
2792 ctxt->input->encoding = xmlStrdup(encoding);
2793
2794 enc = xmlParseCharEncoding((const char *) encoding);
2795 /*
2796 * registered set of known encodings
2797 */
2798 if (enc != XML_CHAR_ENCODING_ERROR) {
2799 xmlSwitchEncoding(ctxt, enc);
2800 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2801 } else {
2802 /*
2803 * fallback for unknown encodings
2804 */
2805 handler = xmlFindCharEncodingHandler((const char *) encoding);
2806 if (handler != NULL) {
2807 xmlSwitchToEncoding(ctxt, handler);
2808 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2809 } else {
2810 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2811 }
2812 }
2813
2814 if ((ctxt->input->buf != NULL) &&
2815 (ctxt->input->buf->encoder != NULL) &&
2816 (ctxt->input->buf->raw != NULL) &&
2817 (ctxt->input->buf->buffer != NULL)) {
2818 int nbchars;
2819 int processed;
2820
2821 /*
2822 * convert as much as possible to the parser reading buffer.
2823 */
2824 processed = ctxt->input->cur - ctxt->input->base;
2825 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2826 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2827 ctxt->input->buf->buffer,
2828 ctxt->input->buf->raw);
2829 if (nbchars < 0) {
2830 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2831 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2832 ctxt->sax->error(ctxt->userData,
2833 "htmlCheckEncoding: encoder error\n");
2834 }
2835 ctxt->input->base =
2836 ctxt->input->cur = ctxt->input->buf->buffer->content;
2837 }
2838 }
2839}
2840
2841/**
2842 * htmlCheckMeta:
2843 * @ctxt: an HTML parser context
2844 * @atts: the attributes values
2845 *
2846 * Checks an attributes from a Meta tag
2847 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002848static void
Owen Taylor3473f882001-02-23 17:55:21 +00002849htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2850 int i;
2851 const xmlChar *att, *value;
2852 int http = 0;
2853 const xmlChar *content = NULL;
2854
2855 if ((ctxt == NULL) || (atts == NULL))
2856 return;
2857
2858 i = 0;
2859 att = atts[i++];
2860 while (att != NULL) {
2861 value = atts[i++];
2862 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2863 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2864 http = 1;
2865 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2866 content = value;
2867 att = atts[i++];
2868 }
2869 if ((http) && (content != NULL))
2870 htmlCheckEncoding(ctxt, content);
2871
2872}
2873
2874/**
2875 * htmlParseStartTag:
2876 * @ctxt: an HTML parser context
2877 *
2878 * parse a start of tag either for rule element or
2879 * EmptyElement. In both case we don't parse the tag closing chars.
2880 *
2881 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2882 *
2883 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2884 *
2885 * With namespace:
2886 *
2887 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2888 *
2889 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2890 *
2891 */
2892
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002893static void
Owen Taylor3473f882001-02-23 17:55:21 +00002894htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2895 xmlChar *name;
2896 xmlChar *attname;
2897 xmlChar *attvalue;
2898 const xmlChar **atts = NULL;
2899 int nbatts = 0;
2900 int maxatts = 0;
2901 int meta = 0;
2902 int i;
2903
2904 if (CUR != '<') return;
2905 NEXT;
2906
2907 GROW;
2908 name = htmlParseHTMLName(ctxt);
2909 if (name == NULL) {
2910 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2911 ctxt->sax->error(ctxt->userData,
2912 "htmlParseStartTag: invalid element name\n");
2913 ctxt->wellFormed = 0;
2914 /* Dump the bogus tag like browsers do */
2915 while ((IS_CHAR(CUR)) && (CUR != '>'))
2916 NEXT;
2917 return;
2918 }
2919 if (xmlStrEqual(name, BAD_CAST"meta"))
2920 meta = 1;
2921
2922 /*
2923 * Check for auto-closure of HTML elements.
2924 */
2925 htmlAutoClose(ctxt, name);
2926
2927 /*
2928 * Check for implied HTML elements.
2929 */
2930 htmlCheckImplied(ctxt, name);
2931
2932 /*
2933 * Avoid html at any level > 0, head at any level != 1
2934 * or any attempt to recurse body
2935 */
2936 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2937 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2938 ctxt->sax->error(ctxt->userData,
2939 "htmlParseStartTag: misplaced <html> tag\n");
2940 ctxt->wellFormed = 0;
2941 xmlFree(name);
2942 return;
2943 }
2944 if ((ctxt->nameNr != 1) &&
2945 (xmlStrEqual(name, BAD_CAST"head"))) {
2946 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2947 ctxt->sax->error(ctxt->userData,
2948 "htmlParseStartTag: misplaced <head> tag\n");
2949 ctxt->wellFormed = 0;
2950 xmlFree(name);
2951 return;
2952 }
2953 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002954 int indx;
2955 for (indx = 0;indx < ctxt->nameNr;indx++) {
2956 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002957 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2958 ctxt->sax->error(ctxt->userData,
2959 "htmlParseStartTag: misplaced <body> tag\n");
2960 ctxt->wellFormed = 0;
2961 xmlFree(name);
2962 return;
2963 }
2964 }
2965 }
2966
2967 /*
2968 * Now parse the attributes, it ends up with the ending
2969 *
2970 * (S Attribute)* S?
2971 */
2972 SKIP_BLANKS;
2973 while ((IS_CHAR(CUR)) &&
2974 (CUR != '>') &&
2975 ((CUR != '/') || (NXT(1) != '>'))) {
2976 long cons = ctxt->nbChars;
2977
2978 GROW;
2979 attname = htmlParseAttribute(ctxt, &attvalue);
2980 if (attname != NULL) {
2981
2982 /*
2983 * Well formedness requires at most one declaration of an attribute
2984 */
2985 for (i = 0; i < nbatts;i += 2) {
2986 if (xmlStrEqual(atts[i], attname)) {
2987 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2988 ctxt->sax->error(ctxt->userData,
2989 "Attribute %s redefined\n",
2990 attname);
2991 ctxt->wellFormed = 0;
2992 xmlFree(attname);
2993 if (attvalue != NULL)
2994 xmlFree(attvalue);
2995 goto failed;
2996 }
2997 }
2998
2999 /*
3000 * Add the pair to atts
3001 */
3002 if (atts == NULL) {
3003 maxatts = 10;
3004 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3005 if (atts == NULL) {
3006 xmlGenericError(xmlGenericErrorContext,
3007 "malloc of %ld byte failed\n",
3008 maxatts * (long)sizeof(xmlChar *));
3009 if (name != NULL) xmlFree(name);
3010 return;
3011 }
3012 } else if (nbatts + 4 > maxatts) {
3013 maxatts *= 2;
3014 atts = (const xmlChar **) xmlRealloc((void *) atts,
3015 maxatts * sizeof(xmlChar *));
3016 if (atts == NULL) {
3017 xmlGenericError(xmlGenericErrorContext,
3018 "realloc of %ld byte failed\n",
3019 maxatts * (long)sizeof(xmlChar *));
3020 if (name != NULL) xmlFree(name);
3021 return;
3022 }
3023 }
3024 atts[nbatts++] = attname;
3025 atts[nbatts++] = attvalue;
3026 atts[nbatts] = NULL;
3027 atts[nbatts + 1] = NULL;
3028 }
3029 else {
3030 /* Dump the bogus attribute string up to the next blank or
3031 * the end of the tag. */
3032 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3033 && ((CUR != '/') || (NXT(1) != '>')))
3034 NEXT;
3035 }
3036
3037failed:
3038 SKIP_BLANKS;
3039 if (cons == ctxt->nbChars) {
3040 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3041 ctxt->sax->error(ctxt->userData,
3042 "htmlParseStartTag: problem parsing attributes\n");
3043 ctxt->wellFormed = 0;
3044 break;
3045 }
3046 }
3047
3048 /*
3049 * Handle specific association to the META tag
3050 */
3051 if (meta)
3052 htmlCheckMeta(ctxt, atts);
3053
3054 /*
3055 * SAX: Start of Element !
3056 */
3057 htmlnamePush(ctxt, xmlStrdup(name));
3058#ifdef DEBUG
3059 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3060#endif
3061 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3062 ctxt->sax->startElement(ctxt->userData, name, atts);
3063
3064 if (atts != NULL) {
3065 for (i = 0;i < nbatts;i++) {
3066 if (atts[i] != NULL)
3067 xmlFree((xmlChar *) atts[i]);
3068 }
3069 xmlFree((void *) atts);
3070 }
3071 if (name != NULL) xmlFree(name);
3072}
3073
3074/**
3075 * htmlParseEndTag:
3076 * @ctxt: an HTML parser context
3077 *
3078 * parse an end of tag
3079 *
3080 * [42] ETag ::= '</' Name S? '>'
3081 *
3082 * With namespace
3083 *
3084 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003085 *
3086 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003087 */
3088
Daniel Veillardf420ac52001-07-04 16:04:09 +00003089static int
Owen Taylor3473f882001-02-23 17:55:21 +00003090htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3091 xmlChar *name;
3092 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003093 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003094
3095 if ((CUR != '<') || (NXT(1) != '/')) {
3096 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3097 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3098 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003099 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003100 }
3101 SKIP(2);
3102
3103 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003104 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003105
3106 /*
3107 * We should definitely be at the ending "S? '>'" part
3108 */
3109 SKIP_BLANKS;
3110 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3111 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3112 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3113 ctxt->wellFormed = 0;
3114 } else
3115 NEXT;
3116
3117 /*
3118 * If the name read is not one of the element in the parsing stack
3119 * then return, it's just an error.
3120 */
3121 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3122 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3123 }
3124 if (i < 0) {
3125 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3126 ctxt->sax->error(ctxt->userData,
3127 "Unexpected end tag : %s\n", name);
3128 xmlFree(name);
3129 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003130 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003131 }
3132
3133
3134 /*
3135 * Check for auto-closure of HTML elements.
3136 */
3137
3138 htmlAutoCloseOnClose(ctxt, name);
3139
3140 /*
3141 * Well formedness constraints, opening and closing must match.
3142 * With the exception that the autoclose may have popped stuff out
3143 * of the stack.
3144 */
3145 if (!xmlStrEqual(name, ctxt->name)) {
3146#ifdef DEBUG
3147 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3148#endif
3149 if ((ctxt->name != NULL) &&
3150 (!xmlStrEqual(ctxt->name, name))) {
3151 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3152 ctxt->sax->error(ctxt->userData,
3153 "Opening and ending tag mismatch: %s and %s\n",
3154 name, ctxt->name);
3155 ctxt->wellFormed = 0;
3156 }
3157 }
3158
3159 /*
3160 * SAX: End of Tag
3161 */
3162 oldname = ctxt->name;
3163 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3164 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3165 ctxt->sax->endElement(ctxt->userData, name);
3166 oldname = htmlnamePop(ctxt);
3167 if (oldname != NULL) {
3168#ifdef DEBUG
3169 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3170#endif
3171 xmlFree(oldname);
3172#ifdef DEBUG
3173 } else {
3174 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3175#endif
3176 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003177 ret = 1;
3178 } else {
3179 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003180 }
3181
3182 if (name != NULL)
3183 xmlFree(name);
3184
Daniel Veillardf420ac52001-07-04 16:04:09 +00003185 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003186}
3187
3188
3189/**
3190 * htmlParseReference:
3191 * @ctxt: an HTML parser context
3192 *
3193 * parse and handle entity references in content,
3194 * this will end-up in a call to character() since this is either a
3195 * CharRef, or a predefined entity.
3196 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003197static void
Owen Taylor3473f882001-02-23 17:55:21 +00003198htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003199 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003200 xmlChar out[6];
3201 xmlChar *name;
3202 if (CUR != '&') return;
3203
3204 if (NXT(1) == '#') {
3205 unsigned int c;
3206 int bits, i = 0;
3207
3208 c = htmlParseCharRef(ctxt);
3209 if (c == 0)
3210 return;
3211
3212 if (c < 0x80) { out[i++]= c; bits= -6; }
3213 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3214 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3215 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3216
3217 for ( ; bits >= 0; bits-= 6) {
3218 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3219 }
3220 out[i] = 0;
3221
3222 htmlCheckParagraph(ctxt);
3223 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3224 ctxt->sax->characters(ctxt->userData, out, i);
3225 } else {
3226 ent = htmlParseEntityRef(ctxt, &name);
3227 if (name == NULL) {
3228 htmlCheckParagraph(ctxt);
3229 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3230 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3231 return;
3232 }
3233 if ((ent == NULL) || (ent->value <= 0)) {
3234 htmlCheckParagraph(ctxt);
3235 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3236 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3237 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3238 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3239 }
3240 } else {
3241 unsigned int c;
3242 int bits, i = 0;
3243
3244 c = ent->value;
3245 if (c < 0x80)
3246 { out[i++]= c; bits= -6; }
3247 else if (c < 0x800)
3248 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3249 else if (c < 0x10000)
3250 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3251 else
3252 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3253
3254 for ( ; bits >= 0; bits-= 6) {
3255 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3256 }
3257 out[i] = 0;
3258
3259 htmlCheckParagraph(ctxt);
3260 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3261 ctxt->sax->characters(ctxt->userData, out, i);
3262 }
3263 xmlFree(name);
3264 }
3265}
3266
3267/**
3268 * htmlParseContent:
3269 * @ctxt: an HTML parser context
3270 * @name: the node name
3271 *
3272 * Parse a content: comment, sub-element, reference or text.
3273 *
3274 */
3275
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003276static void
Owen Taylor3473f882001-02-23 17:55:21 +00003277htmlParseContent(htmlParserCtxtPtr ctxt) {
3278 xmlChar *currentNode;
3279 int depth;
3280
3281 currentNode = xmlStrdup(ctxt->name);
3282 depth = ctxt->nameNr;
3283 while (1) {
3284 long cons = ctxt->nbChars;
3285
3286 GROW;
3287 /*
3288 * Our tag or one of it's parent or children is ending.
3289 */
3290 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003291 if (htmlParseEndTag(ctxt) &&
3292 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3293 if (currentNode != NULL)
3294 xmlFree(currentNode);
3295 return;
3296 }
3297 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003298 }
3299
3300 /*
3301 * Has this node been popped out during parsing of
3302 * the next element
3303 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003304 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3305 (!xmlStrEqual(currentNode, ctxt->name)))
3306 {
Owen Taylor3473f882001-02-23 17:55:21 +00003307 if (currentNode != NULL) xmlFree(currentNode);
3308 return;
3309 }
3310
Daniel Veillardf9533d12001-03-03 10:04:57 +00003311 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3312 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003313 /*
3314 * Handle SCRIPT/STYLE separately
3315 */
3316 htmlParseScript(ctxt);
3317 } else {
3318 /*
3319 * Sometimes DOCTYPE arrives in the middle of the document
3320 */
3321 if ((CUR == '<') && (NXT(1) == '!') &&
3322 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3323 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3324 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3325 (UPP(8) == 'E')) {
3326 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3327 ctxt->sax->error(ctxt->userData,
3328 "Misplaced DOCTYPE declaration\n");
3329 ctxt->wellFormed = 0;
3330 htmlParseDocTypeDecl(ctxt);
3331 }
3332
3333 /*
3334 * First case : a comment
3335 */
3336 if ((CUR == '<') && (NXT(1) == '!') &&
3337 (NXT(2) == '-') && (NXT(3) == '-')) {
3338 htmlParseComment(ctxt);
3339 }
3340
3341 /*
3342 * Second case : a sub-element.
3343 */
3344 else if (CUR == '<') {
3345 htmlParseElement(ctxt);
3346 }
3347
3348 /*
3349 * Third case : a reference. If if has not been resolved,
3350 * parsing returns it's Name, create the node
3351 */
3352 else if (CUR == '&') {
3353 htmlParseReference(ctxt);
3354 }
3355
3356 /*
3357 * Fourth : end of the resource
3358 */
3359 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003360 htmlAutoCloseOnEnd(ctxt);
3361 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003362 }
3363
3364 /*
3365 * Last case, text. Note that References are handled directly.
3366 */
3367 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003368 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003369 }
3370
3371 if (cons == ctxt->nbChars) {
3372 if (ctxt->node != NULL) {
3373 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3374 ctxt->sax->error(ctxt->userData,
3375 "detected an error in element content\n");
3376 ctxt->wellFormed = 0;
3377 }
3378 break;
3379 }
3380 }
3381 GROW;
3382 }
3383 if (currentNode != NULL) xmlFree(currentNode);
3384}
3385
3386/**
3387 * htmlParseElement:
3388 * @ctxt: an HTML parser context
3389 *
3390 * parse an HTML element, this is highly recursive
3391 *
3392 * [39] element ::= EmptyElemTag | STag content ETag
3393 *
3394 * [41] Attribute ::= Name Eq AttValue
3395 */
3396
3397void
3398htmlParseElement(htmlParserCtxtPtr ctxt) {
3399 xmlChar *name;
3400 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003401 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003402 htmlParserNodeInfo node_info;
3403 xmlChar *oldname;
3404 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003405 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003406
3407 /* Capture start position */
3408 if (ctxt->record_info) {
3409 node_info.begin_pos = ctxt->input->consumed +
3410 (CUR_PTR - ctxt->input->base);
3411 node_info.begin_line = ctxt->input->line;
3412 }
3413
3414 oldname = xmlStrdup(ctxt->name);
3415 htmlParseStartTag(ctxt);
3416 name = ctxt->name;
3417#ifdef DEBUG
3418 if (oldname == NULL)
3419 xmlGenericError(xmlGenericErrorContext,
3420 "Start of element %s\n", name);
3421 else if (name == NULL)
3422 xmlGenericError(xmlGenericErrorContext,
3423 "Start of element failed, was %s\n", oldname);
3424 else
3425 xmlGenericError(xmlGenericErrorContext,
3426 "Start of element %s, was %s\n", name, oldname);
3427#endif
3428 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3429 (name == NULL)) {
3430 if (CUR == '>')
3431 NEXT;
3432 if (oldname != NULL)
3433 xmlFree(oldname);
3434 return;
3435 }
3436 if (oldname != NULL)
3437 xmlFree(oldname);
3438
3439 /*
3440 * Lookup the info for that element.
3441 */
3442 info = htmlTagLookup(name);
3443 if (info == NULL) {
3444 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3445 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3446 name);
3447 ctxt->wellFormed = 0;
3448 } else if (info->depr) {
3449/***************************
3450 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3451 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3452 name);
3453 ***************************/
3454 }
3455
3456 /*
3457 * Check for an Empty Element labelled the XML/SGML way
3458 */
3459 if ((CUR == '/') && (NXT(1) == '>')) {
3460 SKIP(2);
3461 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3462 ctxt->sax->endElement(ctxt->userData, name);
3463 oldname = htmlnamePop(ctxt);
3464#ifdef DEBUG
3465 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3466#endif
3467 if (oldname != NULL)
3468 xmlFree(oldname);
3469 return;
3470 }
3471
3472 if (CUR == '>') {
3473 NEXT;
3474 } else {
3475 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3476 ctxt->sax->error(ctxt->userData,
3477 "Couldn't find end of Start Tag %s\n",
3478 name);
3479 ctxt->wellFormed = 0;
3480
3481 /*
3482 * end of parsing of this node.
3483 */
3484 if (xmlStrEqual(name, ctxt->name)) {
3485 nodePop(ctxt);
3486 oldname = htmlnamePop(ctxt);
3487#ifdef DEBUG
3488 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3489#endif
3490 if (oldname != NULL)
3491 xmlFree(oldname);
3492 }
3493
3494 /*
3495 * Capture end position and add node
3496 */
3497 if ( currentNode != NULL && ctxt->record_info ) {
3498 node_info.end_pos = ctxt->input->consumed +
3499 (CUR_PTR - ctxt->input->base);
3500 node_info.end_line = ctxt->input->line;
3501 node_info.node = ctxt->node;
3502 xmlParserAddNodeInfo(ctxt, &node_info);
3503 }
3504 return;
3505 }
3506
3507 /*
3508 * Check for an Empty Element from DTD definition
3509 */
3510 if ((info != NULL) && (info->empty)) {
3511 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3512 ctxt->sax->endElement(ctxt->userData, name);
3513 oldname = htmlnamePop(ctxt);
3514#ifdef DEBUG
3515 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3516#endif
3517 if (oldname != NULL)
3518 xmlFree(oldname);
3519 return;
3520 }
3521
3522 /*
3523 * Parse the content of the element:
3524 */
3525 currentNode = xmlStrdup(ctxt->name);
3526 depth = ctxt->nameNr;
3527 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003528 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003529 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003530 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003531 if (ctxt->nameNr < depth) break;
3532 }
3533
Owen Taylor3473f882001-02-23 17:55:21 +00003534 /*
3535 * Capture end position and add node
3536 */
3537 if ( currentNode != NULL && ctxt->record_info ) {
3538 node_info.end_pos = ctxt->input->consumed +
3539 (CUR_PTR - ctxt->input->base);
3540 node_info.end_line = ctxt->input->line;
3541 node_info.node = ctxt->node;
3542 xmlParserAddNodeInfo(ctxt, &node_info);
3543 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003544 if (!IS_CHAR(CUR)) {
3545 htmlAutoCloseOnEnd(ctxt);
3546 }
3547
Owen Taylor3473f882001-02-23 17:55:21 +00003548 if (currentNode != NULL)
3549 xmlFree(currentNode);
3550}
3551
3552/**
3553 * htmlParseDocument :
3554 * @ctxt: an HTML parser context
3555 *
3556 * parse an HTML document (and build a tree if using the standard SAX
3557 * interface).
3558 *
3559 * Returns 0, -1 in case of error. the parser context is augmented
3560 * as a result of the parsing.
3561 */
3562
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003563static int
Owen Taylor3473f882001-02-23 17:55:21 +00003564htmlParseDocument(htmlParserCtxtPtr ctxt) {
3565 xmlDtdPtr dtd;
3566
Daniel Veillardd0463562001-10-13 09:15:48 +00003567 xmlInitParser();
3568
Owen Taylor3473f882001-02-23 17:55:21 +00003569 htmlDefaultSAXHandlerInit();
3570 ctxt->html = 1;
3571
3572 GROW;
3573 /*
3574 * SAX: beginning of the document processing.
3575 */
3576 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3577 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3578
3579 /*
3580 * Wipe out everything which is before the first '<'
3581 */
3582 SKIP_BLANKS;
3583 if (CUR == 0) {
3584 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3585 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3586 ctxt->wellFormed = 0;
3587 }
3588
3589 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3590 ctxt->sax->startDocument(ctxt->userData);
3591
3592
3593 /*
3594 * Parse possible comments before any content
3595 */
3596 while ((CUR == '<') && (NXT(1) == '!') &&
3597 (NXT(2) == '-') && (NXT(3) == '-')) {
3598 htmlParseComment(ctxt);
3599 SKIP_BLANKS;
3600 }
3601
3602
3603 /*
3604 * Then possibly doc type declaration(s) and more Misc
3605 * (doctypedecl Misc*)?
3606 */
3607 if ((CUR == '<') && (NXT(1) == '!') &&
3608 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3609 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3610 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3611 (UPP(8) == 'E')) {
3612 htmlParseDocTypeDecl(ctxt);
3613 }
3614 SKIP_BLANKS;
3615
3616 /*
3617 * Parse possible comments before any content
3618 */
3619 while ((CUR == '<') && (NXT(1) == '!') &&
3620 (NXT(2) == '-') && (NXT(3) == '-')) {
3621 htmlParseComment(ctxt);
3622 SKIP_BLANKS;
3623 }
3624
3625 /*
3626 * Time to start parsing the tree itself
3627 */
3628 htmlParseContent(ctxt);
3629
3630 /*
3631 * autoclose
3632 */
3633 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003634 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003635
3636
3637 /*
3638 * SAX: end of the document processing.
3639 */
3640 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3641 ctxt->sax->endDocument(ctxt->userData);
3642
3643 if (ctxt->myDoc != NULL) {
3644 dtd = xmlGetIntSubset(ctxt->myDoc);
3645 if (dtd == NULL)
3646 ctxt->myDoc->intSubset =
3647 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3648 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3649 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3650 }
3651 if (! ctxt->wellFormed) return(-1);
3652 return(0);
3653}
3654
3655
3656/************************************************************************
3657 * *
3658 * Parser contexts handling *
3659 * *
3660 ************************************************************************/
3661
3662/**
3663 * xmlInitParserCtxt:
3664 * @ctxt: an HTML parser context
3665 *
3666 * Initialize a parser context
3667 */
3668
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003669static void
Owen Taylor3473f882001-02-23 17:55:21 +00003670htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3671{
3672 htmlSAXHandler *sax;
3673
3674 if (ctxt == NULL) return;
3675 memset(ctxt, 0, sizeof(htmlParserCtxt));
3676
3677 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3678 if (sax == NULL) {
3679 xmlGenericError(xmlGenericErrorContext,
3680 "htmlInitParserCtxt: out of memory\n");
3681 }
3682 else
3683 memset(sax, 0, sizeof(htmlSAXHandler));
3684
3685 /* Allocate the Input stack */
3686 ctxt->inputTab = (htmlParserInputPtr *)
3687 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3688 if (ctxt->inputTab == NULL) {
3689 xmlGenericError(xmlGenericErrorContext,
3690 "htmlInitParserCtxt: out of memory\n");
3691 ctxt->inputNr = 0;
3692 ctxt->inputMax = 0;
3693 ctxt->input = NULL;
3694 return;
3695 }
3696 ctxt->inputNr = 0;
3697 ctxt->inputMax = 5;
3698 ctxt->input = NULL;
3699 ctxt->version = NULL;
3700 ctxt->encoding = NULL;
3701 ctxt->standalone = -1;
3702 ctxt->instate = XML_PARSER_START;
3703
3704 /* Allocate the Node stack */
3705 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3706 if (ctxt->nodeTab == NULL) {
3707 xmlGenericError(xmlGenericErrorContext,
3708 "htmlInitParserCtxt: out of memory\n");
3709 ctxt->nodeNr = 0;
3710 ctxt->nodeMax = 0;
3711 ctxt->node = NULL;
3712 ctxt->inputNr = 0;
3713 ctxt->inputMax = 0;
3714 ctxt->input = NULL;
3715 return;
3716 }
3717 ctxt->nodeNr = 0;
3718 ctxt->nodeMax = 10;
3719 ctxt->node = NULL;
3720
3721 /* Allocate the Name stack */
3722 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3723 if (ctxt->nameTab == NULL) {
3724 xmlGenericError(xmlGenericErrorContext,
3725 "htmlInitParserCtxt: out of memory\n");
3726 ctxt->nameNr = 0;
3727 ctxt->nameMax = 10;
3728 ctxt->name = NULL;
3729 ctxt->nodeNr = 0;
3730 ctxt->nodeMax = 0;
3731 ctxt->node = NULL;
3732 ctxt->inputNr = 0;
3733 ctxt->inputMax = 0;
3734 ctxt->input = NULL;
3735 return;
3736 }
3737 ctxt->nameNr = 0;
3738 ctxt->nameMax = 10;
3739 ctxt->name = NULL;
3740
3741 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3742 else {
3743 ctxt->sax = sax;
3744 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3745 }
3746 ctxt->userData = ctxt;
3747 ctxt->myDoc = NULL;
3748 ctxt->wellFormed = 1;
3749 ctxt->replaceEntities = 0;
3750 ctxt->html = 1;
3751 ctxt->record_info = 0;
3752 ctxt->validate = 0;
3753 ctxt->nbChars = 0;
3754 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003755 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003756 xmlInitNodeInfoSeq(&ctxt->node_seq);
3757}
3758
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a thin wrapper: the whole job is delegated to
 * xmlFreeParserCtxt(), the HTML and XML parsers sharing the same
 * context release routine.
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
3772
3773/**
3774 * htmlCreateDocParserCtxt :
3775 * @cur: a pointer to an array of xmlChar
3776 * @encoding: a free form C string describing the HTML document encoding, or NULL
3777 *
3778 * Create a parser context for an HTML document.
3779 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003780 * TODO: check the need to add encoding handling there
3781 *
Owen Taylor3473f882001-02-23 17:55:21 +00003782 * Returns the new parser context or NULL
3783 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003784static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003785htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003786 htmlParserCtxtPtr ctxt;
3787 htmlParserInputPtr input;
3788 /* htmlCharEncoding enc; */
3789
3790 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3791 if (ctxt == NULL) {
3792 perror("malloc");
3793 return(NULL);
3794 }
3795 htmlInitParserCtxt(ctxt);
3796 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3797 if (input == NULL) {
3798 perror("malloc");
3799 xmlFree(ctxt);
3800 return(NULL);
3801 }
3802 memset(input, 0, sizeof(htmlParserInput));
3803
3804 input->line = 1;
3805 input->col = 1;
3806 input->base = cur;
3807 input->cur = cur;
3808
3809 inputPush(ctxt, input);
3810 return(ctxt);
3811}
3812
3813/************************************************************************
3814 * *
3815 * Progressive parsing interfaces *
3816 * *
3817 ************************************************************************/
3818
3819/**
3820 * htmlParseLookupSequence:
3821 * @ctxt: an HTML parser context
3822 * @first: the first char to lookup
3823 * @next: the next char to lookup or zero
3824 * @third: the next char to lookup or zero
3825 *
3826 * Try to find if a sequence (first, next, third) or just (first next) or
3827 * (first) is available in the input stream.
3828 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3829 * to avoid rescanning sequences of bytes, it DOES change the state of the
3830 * parser, do not use liberally.
3831 * This is basically similar to xmlParseLookupSequence()
3832 *
3833 * Returns the index to the current parsing point if the full sequence
3834 * is available, -1 otherwise.
3835 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003836static int
Owen Taylor3473f882001-02-23 17:55:21 +00003837htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3838 xmlChar next, xmlChar third) {
3839 int base, len;
3840 htmlParserInputPtr in;
3841 const xmlChar *buf;
3842
3843 in = ctxt->input;
3844 if (in == NULL) return(-1);
3845 base = in->cur - in->base;
3846 if (base < 0) return(-1);
3847 if (ctxt->checkIndex > base)
3848 base = ctxt->checkIndex;
3849 if (in->buf == NULL) {
3850 buf = in->base;
3851 len = in->length;
3852 } else {
3853 buf = in->buf->buffer->content;
3854 len = in->buf->buffer->use;
3855 }
3856 /* take into account the sequence length */
3857 if (third) len -= 2;
3858 else if (next) len --;
3859 for (;base < len;base++) {
3860 if (buf[base] == first) {
3861 if (third != 0) {
3862 if ((buf[base + 1] != next) ||
3863 (buf[base + 2] != third)) continue;
3864 } else if (next != 0) {
3865 if (buf[base + 1] != next) continue;
3866 }
3867 ctxt->checkIndex = 0;
3868#ifdef DEBUG_PUSH
3869 if (next == 0)
3870 xmlGenericError(xmlGenericErrorContext,
3871 "HPP: lookup '%c' found at %d\n",
3872 first, base);
3873 else if (third == 0)
3874 xmlGenericError(xmlGenericErrorContext,
3875 "HPP: lookup '%c%c' found at %d\n",
3876 first, next, base);
3877 else
3878 xmlGenericError(xmlGenericErrorContext,
3879 "HPP: lookup '%c%c%c' found at %d\n",
3880 first, next, third, base);
3881#endif
3882 return(base - (in->cur - in->base));
3883 }
3884 }
3885 ctxt->checkIndex = base;
3886#ifdef DEBUG_PUSH
3887 if (next == 0)
3888 xmlGenericError(xmlGenericErrorContext,
3889 "HPP: lookup '%c' failed\n", first);
3890 else if (third == 0)
3891 xmlGenericError(xmlGenericErrorContext,
3892 "HPP: lookup '%c%c' failed\n", first, next);
3893 else
3894 xmlGenericError(xmlGenericErrorContext,
3895 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3896#endif
3897 return(-1);
3898}
3899
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing.
 *
 * This is the engine of the push parser: it loops over a state machine
 * driven by ctxt->instate and consumes as much of the pending input as
 * possible. When @terminate is zero, most states first check with
 * htmlParseLookupSequence() that the end of the construct they are about
 * to parse is already in the buffer, and leave the loop (goto done) when
 * it is not, so parsing can resume once more data has been pushed.
 * When @terminate is non zero those checks are skipped, and reaching the
 * end of the input auto-closes the open elements and fires endDocument.
 *
 * Returns zero if no parsing was possible
 *
 * NOTE(review): ret is initialised to 0 and never assigned afterwards,
 * so as written this function always returns 0.
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    /* trace the state in which the parser is entered */
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /*
         * avail is the number of bytes left between the current position
         * and the end of the input, taken from the input itself when it
         * has no attached buffer and from the buffer fill level otherwise.
         */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        if ((avail == 0) && (terminate)) {
            /* end of the last chunk: close what is still open */
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    SKIP_BLANKS;
                    /* blanks were consumed, recompute what is left */
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                /*
                 * NOTE(review): avail is not re-checked before the
                 * look-ahead below; presumably the input is zero
                 * terminated so the comparisons stop early - confirm.
                 */
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
                }
                /*
                 * NOTE(review): this trace is emitted for both branches
                 * above, i.e. also when the new state is PROLOG.
                 */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering MISC\n");
#endif
                break;
            case XML_PARSER_MISC:
                /*
                 * Before the DOCTYPE: comments and the DOCTYPE itself are
                 * handled here, anything else starts the element content.
                 */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but too few bytes to tell what follows */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /*
                 * After the DOCTYPE: only comments are handled here,
                 * anything else starts the element content.
                 */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    /* "<!" seen but too few bytes to tell what follows */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /*
                 * Entered from END_TAG once the name stack is empty:
                 * blanks and comments are accepted, anything else is
                 * flagged as XML_ERR_DOCUMENT_END and ends the parsing.
                 */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    /* "<!" seen but too few bytes to tell what follows */
                    goto done;
                } else {
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                xmlChar *name, *oldname;
                /* depth of the name stack before the tag is parsed */
                int depth = ctxt->nameNr;
                const htmlElemDesc * info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                if (cur != '<') {
                    /* not a tag after all, handle it as content */
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                if (in->cur[1] == '/') {
                    /* "</" is an end tag, handled by its own state */
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering END_TAG\n");
#endif
                    break;
                }
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;

                /*
                 * Keep a copy of the current element name to detect below
                 * whether htmlParseStartTag() actually opened an element.
                 */
                oldname = xmlStrdup(ctxt->name);
                htmlParseStartTag(ctxt);
                name = ctxt->name;
#ifdef DEBUG
                if (oldname == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s\n", name);
                else if (name == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element failed, was %s\n",
                            oldname);
                else
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s, was %s\n",
                            name, oldname);
#endif
                /*
                 * Same depth and same name as before, or no name at all:
                 * nothing was pushed, just step over a possible '>'.
                 */
                if (((depth == ctxt->nameNr) &&
                     (xmlStrEqual(oldname, ctxt->name))) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    if (oldname != NULL)
                        xmlFree(oldname);
                    break;
                }
                if (oldname != NULL)
                    xmlFree(oldname);

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
                                         name);
                    ctxt->wellFormed = 0;
                } else if (info->depr) {
                    /***************************
                    if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
                        ctxt->sax->warning(ctxt->userData,
                                           "Tag %s is deprecated\n",
                                           name);
                     ***************************/
                }

                /*
                 * Check for an Empty Element labelled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
                            oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData,
                                         "Couldn't find end of Start Tag %s\n",
                                         name);
                    ctxt->wellFormed = 0;

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                        xmlGenericError(xmlGenericErrorContext,
                         "End of start tag problem: popping out %s\n", oldname);
#endif
                        if (oldname != NULL)
                            xmlFree(oldname);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                /* snapshot of ctxt->nbChars used to detect a lack of progress */
                long cons;
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /*
                 * A single trailing char in the last chunk which does not
                 * start a tag or a reference is delivered directly, the
                 * generic code below needing at least 2 bytes.
                 */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        NEXT;
                        break;
                    }
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
                        goto done;
                    htmlParseScript(ctxt);
                    /*
                     * NOTE(review): cur and next were sampled before the
                     * call to htmlParseScript(), so this tests the chars
                     * found at the start of the script content.
                     */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                            goto done;
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "Misplaced DOCTYPE declaration\n");
                        ctxt->wellFormed = 0;
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        /* "<!" seen but too few bytes to tell what follows */
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /* TODO Avoid the extra copy, handle directly !!!!!! */
                        /*
                         * Goal of the following test is :
                         *  - minimize calls to the SAX 'character' callback
                         *    when they are mergeable
                         */
                        if ((ctxt->inputNr == 1) &&
                            (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
                            if ((!terminate) &&
                                (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
                                goto done;
                        }
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt);
                    }
                }
                /*
                 * ctxt->nbChars did not move: report an error if inside a
                 * node and skip one char so the loop cannot spin forever
                 * on the same input.
                 */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "detected an error in element content\n");
                        ctxt->wellFormed = 0;
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* an empty name stack means the last element was closed */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The states below are reported as internal errors when
             * reached here; each one is recovered from by switching back
             * to CONTENT (or START_TAG for ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == CDATA\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == DTD\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == COMMENT\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == PI\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_DECL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_VALUE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
                /*
                 * NOTE(review): the trace says DTD although the state
                 * set just above is CONTENT.
                 */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n");
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                xmlGenericError(xmlGenericErrorContext,
                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                xmlGenericError(xmlGenericErrorContext,
                    "HPP: internal error, state == XML_PARSER_IGNORE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
        }
    }
done:
    /*
     * Same end of input handling as at the top of the loop, for the case
     * where the input was exhausted by the last parsing step.
     */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /*
     * Once the document is finished (or being finished), give it the
     * HTML 4.0 Transitional internal subset if it has none.
     */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}
4602
4603/**
Owen Taylor3473f882001-02-23 17:55:21 +00004604 * htmlParseChunk:
4605 * @ctxt: an XML parser context
4606 * @chunk: an char array
4607 * @size: the size in byte of the chunk
4608 * @terminate: last chunk indicator
4609 *
4610 * Parse a Chunk of memory
4611 *
4612 * Returns zero if no error, the xmlParserErrors otherwise.
4613 */
4614int
4615htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4616 int terminate) {
4617 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4618 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4619 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4620 int cur = ctxt->input->cur - ctxt->input->base;
4621
4622 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4623 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4624 ctxt->input->cur = ctxt->input->base + cur;
4625#ifdef DEBUG_PUSH
4626 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4627#endif
4628
4629 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4630 htmlParseTryOrFinish(ctxt, terminate);
4631 } else if (ctxt->instate != XML_PARSER_EOF) {
4632 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4633 htmlParseTryOrFinish(ctxt, terminate);
4634 }
4635 if (terminate) {
4636 if ((ctxt->instate != XML_PARSER_EOF) &&
4637 (ctxt->instate != XML_PARSER_EPILOG) &&
4638 (ctxt->instate != XML_PARSER_MISC)) {
4639 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004640 ctxt->wellFormed = 0;
4641 }
4642 if (ctxt->instate != XML_PARSER_EOF) {
4643 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4644 ctxt->sax->endDocument(ctxt->userData);
4645 }
4646 ctxt->instate = XML_PARSER_EOF;
4647 }
4648 return((xmlParserErrors) ctxt->errNo);
4649}
4650
4651/************************************************************************
4652 * *
4653 * User entry points *
4654 * *
4655 ************************************************************************/
4656
4657/**
4658 * htmlCreatePushParserCtxt :
4659 * @sax: a SAX handler
4660 * @user_data: The user data returned on SAX callbacks
4661 * @chunk: a pointer to an array of chars
4662 * @size: number of chars in the array
4663 * @filename: an optional file name or URI
4664 * @enc: an optional encoding
4665 *
4666 * Create a parser context for using the HTML parser in push mode
4667 * To allow content encoding detection, @size should be >= 4
4668 * The value of @filename is used for fetching external entities
4669 * and error/warning reports.
4670 *
4671 * Returns the new parser context or NULL
4672 */
4673htmlParserCtxtPtr
4674htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4675 const char *chunk, int size, const char *filename,
4676 xmlCharEncoding enc) {
4677 htmlParserCtxtPtr ctxt;
4678 htmlParserInputPtr inputStream;
4679 xmlParserInputBufferPtr buf;
4680
Daniel Veillardd0463562001-10-13 09:15:48 +00004681 xmlInitParser();
4682
Owen Taylor3473f882001-02-23 17:55:21 +00004683 buf = xmlAllocParserInputBuffer(enc);
4684 if (buf == NULL) return(NULL);
4685
4686 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4687 if (ctxt == NULL) {
4688 xmlFree(buf);
4689 return(NULL);
4690 }
4691 memset(ctxt, 0, sizeof(htmlParserCtxt));
4692 htmlInitParserCtxt(ctxt);
4693 if (sax != NULL) {
4694 if (ctxt->sax != &htmlDefaultSAXHandler)
4695 xmlFree(ctxt->sax);
4696 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4697 if (ctxt->sax == NULL) {
4698 xmlFree(buf);
4699 xmlFree(ctxt);
4700 return(NULL);
4701 }
4702 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4703 if (user_data != NULL)
4704 ctxt->userData = user_data;
4705 }
4706 if (filename == NULL) {
4707 ctxt->directory = NULL;
4708 } else {
4709 ctxt->directory = xmlParserGetDirectory(filename);
4710 }
4711
4712 inputStream = htmlNewInputStream(ctxt);
4713 if (inputStream == NULL) {
4714 xmlFreeParserCtxt(ctxt);
4715 return(NULL);
4716 }
4717
4718 if (filename == NULL)
4719 inputStream->filename = NULL;
4720 else
4721 inputStream->filename = xmlMemStrdup(filename);
4722 inputStream->buf = buf;
4723 inputStream->base = inputStream->buf->buffer->content;
4724 inputStream->cur = inputStream->buf->buffer->content;
4725
4726 inputPush(ctxt, inputStream);
4727
4728 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4729 (ctxt->input->buf != NULL)) {
4730 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4731#ifdef DEBUG_PUSH
4732 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4733#endif
4734 }
4735
4736 return(ctxt);
4737}
4738
4739/**
4740 * htmlSAXParseDoc :
4741 * @cur: a pointer to an array of xmlChar
4742 * @encoding: a free form C string describing the HTML document encoding, or NULL
4743 * @sax: the SAX handler block
4744 * @userData: if using SAX, this pointer will be provided on callbacks.
4745 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004746 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4747 * to handle parse events. If sax is NULL, fallback to the default DOM
4748 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004749 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004750 * Returns the resulting document tree unless SAX is NULL or the document is
4751 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004752 */
4753
4754htmlDocPtr
4755htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4756 htmlDocPtr ret;
4757 htmlParserCtxtPtr ctxt;
4758
Daniel Veillardd0463562001-10-13 09:15:48 +00004759 xmlInitParser();
4760
Owen Taylor3473f882001-02-23 17:55:21 +00004761 if (cur == NULL) return(NULL);
4762
4763
4764 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4765 if (ctxt == NULL) return(NULL);
4766 if (sax != NULL) {
4767 ctxt->sax = sax;
4768 ctxt->userData = userData;
4769 }
4770
4771 htmlParseDocument(ctxt);
4772 ret = ctxt->myDoc;
4773 if (sax != NULL) {
4774 ctxt->sax = NULL;
4775 ctxt->userData = NULL;
4776 }
4777 htmlFreeParserCtxt(ctxt);
4778
4779 return(ret);
4780}
4781
4782/**
4783 * htmlParseDoc :
4784 * @cur: a pointer to an array of xmlChar
4785 * @encoding: a free form C string describing the HTML document encoding, or NULL
4786 *
4787 * parse an HTML in-memory document and build a tree.
4788 *
4789 * Returns the resulting document tree
4790 */
4791
4792htmlDocPtr
4793htmlParseDoc(xmlChar *cur, const char *encoding) {
4794 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4795}
4796
4797
4798/**
4799 * htmlCreateFileParserCtxt :
4800 * @filename: the filename
4801 * @encoding: a free form C string describing the HTML document encoding, or NULL
4802 *
4803 * Create a parser context for a file content.
4804 * Automatic support for ZLIB/Compress compressed document is provided
4805 * by default if found at compile-time.
4806 *
4807 * Returns the new parser context or NULL
4808 */
4809htmlParserCtxtPtr
4810htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4811{
4812 htmlParserCtxtPtr ctxt;
4813 htmlParserInputPtr inputStream;
4814 xmlParserInputBufferPtr buf;
4815 /* htmlCharEncoding enc; */
4816 xmlChar *content, *content_line = (xmlChar *) "charset=";
4817
4818 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4819 if (buf == NULL) return(NULL);
4820
4821 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4822 if (ctxt == NULL) {
4823 perror("malloc");
4824 return(NULL);
4825 }
4826 memset(ctxt, 0, sizeof(htmlParserCtxt));
4827 htmlInitParserCtxt(ctxt);
4828 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4829 if (inputStream == NULL) {
4830 perror("malloc");
4831 xmlFree(ctxt);
4832 return(NULL);
4833 }
4834 memset(inputStream, 0, sizeof(htmlParserInput));
4835
4836 inputStream->filename = xmlMemStrdup(filename);
4837 inputStream->line = 1;
4838 inputStream->col = 1;
4839 inputStream->buf = buf;
4840 inputStream->directory = NULL;
4841
4842 inputStream->base = inputStream->buf->buffer->content;
4843 inputStream->cur = inputStream->buf->buffer->content;
4844 inputStream->free = NULL;
4845
4846 inputPush(ctxt, inputStream);
4847
4848 /* set encoding */
4849 if (encoding) {
4850 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4851 if (content) {
4852 strcpy ((char *)content, (char *)content_line);
4853 strcat ((char *)content, (char *)encoding);
4854 htmlCheckEncoding (ctxt, content);
4855 xmlFree (content);
4856 }
4857 }
4858
4859 return(ctxt);
4860}
4861
4862/**
4863 * htmlSAXParseFile :
4864 * @filename: the filename
4865 * @encoding: a free form C string describing the HTML document encoding, or NULL
4866 * @sax: the SAX handler block
4867 * @userData: if using SAX, this pointer will be provided on callbacks.
4868 *
4869 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4870 * compressed document is provided by default if found at compile-time.
4871 * It use the given SAX function block to handle the parsing callback.
4872 * If sax is NULL, fallback to the default DOM tree building routines.
4873 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004874 * Returns the resulting document tree unless SAX is NULL or the document is
4875 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004876 */
4877
4878htmlDocPtr
4879htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4880 void *userData) {
4881 htmlDocPtr ret;
4882 htmlParserCtxtPtr ctxt;
4883 htmlSAXHandlerPtr oldsax = NULL;
4884
Daniel Veillardd0463562001-10-13 09:15:48 +00004885 xmlInitParser();
4886
Owen Taylor3473f882001-02-23 17:55:21 +00004887 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4888 if (ctxt == NULL) return(NULL);
4889 if (sax != NULL) {
4890 oldsax = ctxt->sax;
4891 ctxt->sax = sax;
4892 ctxt->userData = userData;
4893 }
4894
4895 htmlParseDocument(ctxt);
4896
4897 ret = ctxt->myDoc;
4898 if (sax != NULL) {
4899 ctxt->sax = oldsax;
4900 ctxt->userData = NULL;
4901 }
4902 htmlFreeParserCtxt(ctxt);
4903
4904 return(ret);
4905}
4906
4907/**
4908 * htmlParseFile :
4909 * @filename: the filename
4910 * @encoding: a free form C string describing the HTML document encoding, or NULL
4911 *
4912 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4913 * compressed document is provided by default if found at compile-time.
4914 *
4915 * Returns the resulting document tree
4916 */
4917
4918htmlDocPtr
4919htmlParseFile(const char *filename, const char *encoding) {
4920 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4921}
4922
4923/**
4924 * htmlHandleOmittedElem:
4925 * @val: int 0 or 1
4926 *
4927 * Set and return the previous value for handling HTML omitted tags.
4928 *
4929 * Returns the last value for 0 for no handling, 1 for auto insertion.
4930 */
4931
4932int
4933htmlHandleOmittedElem(int val) {
4934 int old = htmlOmittedDefaultValue;
4935
4936 htmlOmittedDefaultValue = val;
4937 return(old);
4938}
4939
4940#endif /* LIBXML_HTML_ENABLED */