blob: 86d9e5451cf99366a3c37950a35b805c99d5295a [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
43
44#define HTML_MAX_NAMELEN 1000
45#define HTML_PARSER_BIG_BUFFER_SIZE 1000
46#define HTML_PARSER_BUFFER_SIZE 100
47
48/* #define DEBUG */
49/* #define DEBUG_PUSH */
50
Daniel Veillard22090732001-07-16 00:06:07 +000051static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000052
Daniel Veillard56a4cb82001-03-24 17:00:36 +000053xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
54 xmlChar end, xmlChar end2, xmlChar end3);
55
56/************************************************************************
57 * *
Owen Taylor3473f882001-02-23 17:55:21 +000058 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
66#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
68 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
70 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
71 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
73 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
75 return(0); \
76 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
82scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
83 type ret; \
84 if (ctxt->name##Nr < 0) return(0); \
85 ctxt->name##Nr--; \
86 if (ctxt->name##Nr < 0) return(0); \
87 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillard56a4cb82001-03-24 17:00:36 +000096/* PUSH_AND_POP(static, xmlNodePtr, node) */
97PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000098
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. Every macro expects a parser context named ctxt to be
 * in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them. They look at raw xmlChar values and should only be used to compare
 * against ASCII values, otherwise they would break with UTF-8 input:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the xmlChar at CUR_PTR, as an int (CURRENT is identical).
 *   RAW      like CUR, but -1 while ctxt->token is set.
 *   NXT(n)   the n'th next xmlChar.
 *   UPPER    the xmlChar at CUR_PTR converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase.
 *   SKIP(n)  advance CUR_PTR and ctxt->nbChars by n xmlChar; the line and
 *            column counters are not updated. n is evaluated twice.
 *
 * Helpers built on top of other routines:
 *
 *   NEXT         calls xmlNextChar() and increments ctxt->nbChars.
 *   NEXTL(l)     advance by the l bytes of the char just read, updating
 *                line/column, clearing ctxt->token and counting one char.
 *   CUR_CHAR(l)  htmlCurrentChar() on ctxt, storing the length in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar() on s, storing the length in l.
 *   COPY_BUF(l,b,i,v)  store char v of length l at b[i] and advance i.
 *   SHRINK, GROW shrink or grow the current input buffer.
 *   SKIP_BLANKS  htmlSkipBlankChars() on ctxt.
 *
 * Expansions are fully parenthesized (or wrapped in do/while) so they can
 * be used safely inside larger expressions and unbraced if/else bodies.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
187 * Returns the current char value and its lenght
188 */
189
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000190static int
Owen Taylor3473f882001-02-23 17:55:21 +0000191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
269 * Assume it's a fixed lenght encoding (1) with
270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
306/**
Owen Taylor3473f882001-02-23 17:55:21 +0000307 * htmlSkipBlankChars:
308 * @ctxt: the HTML parser context
309 *
310 * skip all blanks character found at that point in the input streams.
311 *
312 * Returns the number of space chars skipped
313 */
314
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000315static int
Owen Taylor3473f882001-02-23 17:55:21 +0000316htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
317 int res = 0;
318
319 while (IS_BLANK(*(ctxt->input->cur))) {
320 if ((*ctxt->input->cur == 0) &&
321 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
322 xmlPopInput(ctxt);
323 } else {
324 if (*(ctxt->input->cur) == '\n') {
325 ctxt->input->line++; ctxt->input->col = 1;
326 } else ctxt->input->col++;
327 ctxt->input->cur++;
328 ctxt->nbChars++;
329 if (*ctxt->input->cur == 0)
330 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
331 }
332 res++;
333 }
334 return(res);
335}
336
337
338
339/************************************************************************
340 * *
341 * The list of HTML elements and their properties *
342 * *
343 ************************************************************************/
344
345/*
346 * Start Tag: 1 means the start tag can be ommited
347 * End Tag: 1 means the end tag can be ommited
348 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000349 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000350 * Depr: this element is deprecated
351 * DTD: 1 means that this element is valid only in the Loose DTD
352 * 2 means that this element is valid only in the Frameset DTD
353 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000354 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000355 */
Daniel Veillard22090732001-07-16 00:06:07 +0000356static const htmlElemDesc
357html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000358{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
359{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
360{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
361{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
362{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
363{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
364{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
365{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
366{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
367{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
368{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
369{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
370{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
371{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
372{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
373{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
374{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
375{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
376{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
377{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
378{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
379{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
380{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
381{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
382{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
383{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
384{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
385{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
386{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
387{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
388{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
389{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
390{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
391{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
392{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
393{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
394{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
399{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
400{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
401{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
402{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
403{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
404{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
405{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
406{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
407{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
408{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
409{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
410{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
411{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
412{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
413{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
414{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
415{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
416{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
417{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
418{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
419{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
420{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
421{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
422{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
423{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
424{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
425{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
426{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
427{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
428{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
429{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
430{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
431{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
432{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
433{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
434{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
435{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
436{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
437{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
438{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
439{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
440{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
441{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
442{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
443{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
444{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
445{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
446{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
447{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
448{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000449};
450
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL terminated groups, itself terminated by
 * an extra NULL. The first name of a group is the start tag being opened,
 * the names following it are the elements that such a start tag closes.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
510
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
524
/*
 * The list of HTML attributes which are of content %Script;
 * i.e. the intrinsic event attributes defined by HTML 4.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", which matches no attribute */
    "onchange",
    "onselect"
};
550
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;   /* element name, NULL on the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000578
/* Pointers to the first name of each group of htmlStartClose */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlStartCloseIndex has been filled in */
static int htmlStartCloseIndexinitialized;
581
582/************************************************************************
583 * *
584 * functions to handle HTML specific data *
585 * *
586 ************************************************************************/
587
588/**
589 * htmlInitAutoClose:
590 *
591 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
592 * This is not reentrant. Call xmlInitParser() once before processing in
593 * case of use in multithreaded programs.
594 */
595void
596htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000597 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000598
599 if (htmlStartCloseIndexinitialized) return;
600
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000601 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
602 indx = 0;
603 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
604 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000605 while (htmlStartClose[i] != NULL) i++;
606 i++;
607 }
608 htmlStartCloseIndexinitialized = 1;
609}
610
611/**
612 * htmlTagLookup:
613 * @tag: The tag name in lowercase
614 *
615 * Lookup the HTML tag in the ElementTable
616 *
617 * Returns the related htmlElemDescPtr or NULL if not found.
618 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000619const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000620htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000621 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000622
623 for (i = 0; i < (sizeof(html40ElementTable) /
624 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000625 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000626 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000627 }
628 return(NULL);
629}
630
631/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000632 * htmlGetEndPriority:
633 * @name: The name of the element to look up the priority for.
634 *
635 * Return value: The "endtag" priority.
636 **/
637static int
638htmlGetEndPriority (const xmlChar *name) {
639 int i = 0;
640
641 while ((htmlEndPriority[i].name != NULL) &&
642 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
643 i++;
644
645 return(htmlEndPriority[i].priority);
646}
647
648/**
Owen Taylor3473f882001-02-23 17:55:21 +0000649 * htmlCheckAutoClose:
650 * @newtag: The new tag name
651 * @oldtag: The old tag name
652 *
653 * Checks wether the new tag is one of the registered valid tags for closing old.
654 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
655 *
656 * Returns 0 if no, 1 if yes.
657 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000658static int
Owen Taylor3473f882001-02-23 17:55:21 +0000659htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000660 int i, indx;
661 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000662
663 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
664
665 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000666 for (indx = 0; indx < 100;indx++) {
667 closed = htmlStartCloseIndex[indx];
668 if (closed == NULL) return(0);
669 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000670 }
671
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000672 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000673 i++;
674 while (htmlStartClose[i] != NULL) {
675 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
676 return(1);
677 }
678 i++;
679 }
680 return(0);
681}
682
683/**
684 * htmlAutoCloseOnClose:
685 * @ctxt: an HTML parser context
686 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000687 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000688 *
689 * The HTmL DtD allows an ending tag to implicitely close other tags.
690 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000691static void
Owen Taylor3473f882001-02-23 17:55:21 +0000692htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000693 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000694 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000695 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000696
697#ifdef DEBUG
698 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
699 for (i = 0;i < ctxt->nameNr;i++)
700 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
701#endif
702
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000703 priority = htmlGetEndPriority (newtag);
704
Owen Taylor3473f882001-02-23 17:55:21 +0000705 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000706
Owen Taylor3473f882001-02-23 17:55:21 +0000707 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000708 /*
709 * A missplaced endtagad can only close elements with lower
710 * or equal priority, so if we find an element with higher
711 * priority before we find an element with
712 * matching name, we just ignore this endtag
713 */
714 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000715 }
716 if (i < 0) return;
717
718 while (!xmlStrEqual(newtag, ctxt->name)) {
719 info = htmlTagLookup(ctxt->name);
720 if ((info == NULL) || (info->endTag == 1)) {
721#ifdef DEBUG
722 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
723#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000724 } else if (info->endTag == 3) {
725#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000726 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000727
Daniel Veillard56098d42001-04-24 12:51:09 +0000728#endif
729 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
730 ctxt->sax->error(ctxt->userData,
731 "Opening and ending tag mismatch: %s and %s\n",
732 newtag, ctxt->name);
733 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000734 }
735 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
736 ctxt->sax->endElement(ctxt->userData, ctxt->name);
737 oldname = htmlnamePop(ctxt);
738 if (oldname != NULL) {
739#ifdef DEBUG
740 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
741#endif
742 xmlFree(oldname);
743 }
744 }
745}
746
747/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000748 * htmlAutoCloseOnEnd:
749 * @ctxt: an HTML parser context
750 *
751 * Close all remaining tags at the end of the stream
752 */
753static void
754htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
755 xmlChar *oldname;
756 int i;
757
758 if (ctxt->nameNr == 0)
759 return;
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
762#endif
763
764 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
765#ifdef DEBUG
766 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
767#endif
768 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
769 ctxt->sax->endElement(ctxt->userData, ctxt->name);
770 oldname = htmlnamePop(ctxt);
771 if (oldname != NULL) {
772#ifdef DEBUG
773 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
774#endif
775 xmlFree(oldname);
776 }
777 }
778}
779
780/**
Owen Taylor3473f882001-02-23 17:55:21 +0000781 * htmlAutoClose:
782 * @ctxt: an HTML parser context
783 * @newtag: The new tag name or NULL
784 *
785 * The HTmL DtD allows a tag to implicitely close other tags.
786 * The list is kept in htmlStartClose array. This function is
787 * called when a new tag has been detected and generates the
788 * appropriates closes if possible/needed.
789 * If newtag is NULL this mean we are at the end of the resource
790 * and we should check
791 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000792static void
Owen Taylor3473f882001-02-23 17:55:21 +0000793htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
794 xmlChar *oldname;
795 while ((newtag != NULL) && (ctxt->name != NULL) &&
796 (htmlCheckAutoClose(newtag, ctxt->name))) {
797#ifdef DEBUG
798 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
799#endif
800 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
801 ctxt->sax->endElement(ctxt->userData, ctxt->name);
802 oldname = htmlnamePop(ctxt);
803 if (oldname != NULL) {
804#ifdef DEBUG
805 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
806#endif
807 xmlFree(oldname);
808 }
809 }
810 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000811 htmlAutoCloseOnEnd(ctxt);
812 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000813 }
814 while ((newtag == NULL) && (ctxt->name != NULL) &&
815 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
816 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
817 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
818#ifdef DEBUG
819 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
820#endif
821 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
822 ctxt->sax->endElement(ctxt->userData, ctxt->name);
823 oldname = htmlnamePop(ctxt);
824 if (oldname != NULL) {
825#ifdef DEBUG
826 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
827#endif
828 xmlFree(oldname);
829 }
830 }
831
832}
833
834/**
835 * htmlAutoCloseTag:
836 * @doc: the HTML document
837 * @name: The tag name
838 * @elem: the HTML element
839 *
840 * The HTmL DtD allows a tag to implicitely close other tags.
841 * The list is kept in htmlStartClose array. This function checks
842 * if the element or one of it's children would autoclose the
843 * given tag.
844 *
845 * Returns 1 if autoclose, 0 otherwise
846 */
847int
848htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
849 htmlNodePtr child;
850
851 if (elem == NULL) return(1);
852 if (xmlStrEqual(name, elem->name)) return(0);
853 if (htmlCheckAutoClose(elem->name, name)) return(1);
854 child = elem->children;
855 while (child != NULL) {
856 if (htmlAutoCloseTag(doc, name, child)) return(1);
857 child = child->next;
858 }
859 return(0);
860}
861
862/**
863 * htmlIsAutoClosed:
864 * @doc: the HTML document
865 * @elem: the HTML element
866 *
867 * The HTmL DtD allows a tag to implicitely close other tags.
868 * The list is kept in htmlStartClose array. This function checks
869 * if a tag is autoclosed by one of it's child
870 *
871 * Returns 1 if autoclosed, 0 otherwise
872 */
873int
874htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
875 htmlNodePtr child;
876
877 if (elem == NULL) return(1);
878 child = elem->children;
879 while (child != NULL) {
880 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
881 child = child->next;
882 }
883 return(0);
884}
885
886/**
887 * htmlCheckImplied:
888 * @ctxt: an HTML parser context
889 * @newtag: The new tag name
890 *
891 * The HTML DtD allows a tag to exists only implicitely
892 * called when a new tag has been detected and generates the
893 * appropriates implicit tags if missing
894 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000895static void
Owen Taylor3473f882001-02-23 17:55:21 +0000896htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
897 if (!htmlOmittedDefaultValue)
898 return;
899 if (xmlStrEqual(newtag, BAD_CAST"html"))
900 return;
901 if (ctxt->nameNr <= 0) {
902#ifdef DEBUG
903 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
904#endif
905 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
906 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
907 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
908 }
909 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
910 return;
911 if ((ctxt->nameNr <= 1) &&
912 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
913 (xmlStrEqual(newtag, BAD_CAST"style")) ||
914 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
915 (xmlStrEqual(newtag, BAD_CAST"link")) ||
916 (xmlStrEqual(newtag, BAD_CAST"title")) ||
917 (xmlStrEqual(newtag, BAD_CAST"base")))) {
918 /*
919 * dropped OBJECT ... i you put it first BODY will be
920 * assumed !
921 */
922#ifdef DEBUG
923 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
924#endif
925 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
926 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
927 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
928 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
929 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
930 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
931 int i;
932 for (i = 0;i < ctxt->nameNr;i++) {
933 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
934 return;
935 }
936 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
937 return;
938 }
939 }
940
941#ifdef DEBUG
942 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
943#endif
944 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
945 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
946 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
947 }
948}
949
950/**
951 * htmlCheckParagraph
952 * @ctxt: an HTML parser context
953 *
954 * Check whether a p element need to be implied before inserting
955 * characters in the current element.
956 *
957 * Returns 1 if a paragraph has been inserted, 0 if not and -1
958 * in case of error.
959 */
960
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000961static int
Owen Taylor3473f882001-02-23 17:55:21 +0000962htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
963 const xmlChar *tag;
964 int i;
965
966 if (ctxt == NULL)
967 return(-1);
968 tag = ctxt->name;
969 if (tag == NULL) {
970 htmlAutoClose(ctxt, BAD_CAST"p");
971 htmlCheckImplied(ctxt, BAD_CAST"p");
972 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
973 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
974 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
975 return(1);
976 }
977 if (!htmlOmittedDefaultValue)
978 return(0);
979 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
980 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
981#ifdef DEBUG
982 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
983#endif
984 htmlAutoClose(ctxt, BAD_CAST"p");
985 htmlCheckImplied(ctxt, BAD_CAST"p");
986 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
987 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
988 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
989 return(1);
990 }
991 }
992 return(0);
993}
994
995/**
996 * htmlIsScriptAttribute:
997 * @name: an attribute name
998 *
999 * Check if an attribute is of content type Script
1000 *
1001 * Returns 1 is the attribute is a script 0 otherwise
1002 */
1003int
1004htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001005 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001006
1007 if (name == NULL)
1008 return(0);
1009 /*
1010 * all script attributes start with 'on'
1011 */
1012 if ((name[0] != 'o') || (name[1] != 'n'))
1013 return(0);
1014 for (i = 0;
1015 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1016 i++) {
1017 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1018 return(1);
1019 }
1020 return(0);
1021}
1022
1023/************************************************************************
1024 * *
1025 * The list of HTML predefined entities *
1026 * *
1027 ************************************************************************/
1028
1029
/*
 * html40EntitiesTable: the predefined HTML entities known to this parser.
 * Each entry is { Unicode code point, entity name, description text }.
 *
 * The entries must stay sorted by ascending code point:
 * htmlEntityValueLookup() scans from the start and gives up at the first
 * entry whose value exceeds the one requested, so a misplaced entry
 * would never be found by value.
 * htmlEntityLookup() matches on the name field with a linear scan.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001030static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001031/*
1032 * the 4 absolute ones, plus apostrophe.
1033 */
1034{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1035{ 38, "amp", "ampersand, U+0026 ISOnum" },
1036{ 39, "apos", "single quote" },
1037{ 60, "lt", "less-than sign, U+003C ISOnum" },
1038{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1039
1040/*
1041 * A bunch still in the 128-255 range
1042 * Replacing them depend really on the charset used.
1043 */
1044{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1045{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1046{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1047{ 163, "pound","pound sign, U+00A3 ISOnum" },
1048{ 164, "curren","currency sign, U+00A4 ISOnum" },
1049{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1050{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1051{ 167, "sect", "section sign, U+00A7 ISOnum" },
1052{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1053{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1054{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1055{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1056{ 172, "not", "not sign, U+00AC ISOnum" },
1057{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1058{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1059{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1060{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1061{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1062{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1063{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1064{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1065{ 181, "micro","micro sign, U+00B5 ISOnum" },
1066{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1067{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1068{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1069{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1070{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1071{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1072{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1073{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1074{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1075{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1076{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1077{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1078{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1079{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1080{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1081{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1082{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1083{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1084{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1085{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1086{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1087{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1088{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1089{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1090{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1091{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1092{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1093{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1094{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1095{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1096{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1097{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1098{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1099{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1100{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1101{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1102{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1103{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1104{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1105{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1106{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1107{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1108{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1109{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1110{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1111{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1112{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1113{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1114{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1115{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1116{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1117{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1118{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1119{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1120{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1121{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1122{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1123{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1124{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1125{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1126{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1127{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1128{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1129{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1130{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1131{ 247, "divide","division sign, U+00F7 ISOnum" },
1132{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1133{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1134{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1135{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1136{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1137{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1138{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1139{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1140
1141{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1142{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1143{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1144{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1145{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1146
1147/*
1148 * Anything below should really be kept as entities references
1149 */
1150{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1151
1152{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1153{ 732, "tilde","small tilde, U+02DC ISOdia" },
1154
1155{ 913, "Alpha","greek capital letter alpha, U+0391" },
1156{ 914, "Beta", "greek capital letter beta, U+0392" },
1157{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1158{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1159{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1160{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1161{ 919, "Eta", "greek capital letter eta, U+0397" },
1162{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1163{ 921, "Iota", "greek capital letter iota, U+0399" },
1164{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001165{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001166{ 924, "Mu", "greek capital letter mu, U+039C" },
1167{ 925, "Nu", "greek capital letter nu, U+039D" },
1168{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1169{ 927, "Omicron","greek capital letter omicron, U+039F" },
1170{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1171{ 929, "Rho", "greek capital letter rho, U+03A1" },
1172{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1173{ 932, "Tau", "greek capital letter tau, U+03A4" },
1174{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1175{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1176{ 935, "Chi", "greek capital letter chi, U+03A7" },
1177{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1178{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1179
1180{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1181{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1182{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1183{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1184{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1185{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1186{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1187{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1188{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1189{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1190{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1191{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1192{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1193{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1194{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1195{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1196{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1197{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1198{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1199{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1200{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1201{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1202{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1203{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1204{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1205{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1206{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1207{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1208
1209{ 8194, "ensp", "en space, U+2002 ISOpub" },
1210{ 8195, "emsp", "em space, U+2003 ISOpub" },
1211{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1212{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1213{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1214{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1215{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1216{ 8211, "ndash","en dash, U+2013 ISOpub" },
1217{ 8212, "mdash","em dash, U+2014 ISOpub" },
1218{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1219{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1220{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1221{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1222{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1223{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1224{ 8224, "dagger","dagger, U+2020 ISOpub" },
1225{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1226
1227{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1228{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1229
1230{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1231
1232{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1233{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1234
1235{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1236{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1237
1238{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1239{ 8260, "frasl","fraction slash, U+2044 NEW" },
1240
1241{ 8364, "euro", "euro sign, U+20AC NEW" },
1242
1243{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1244{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1245{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1246{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1247{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1248{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1249{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1250{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1251{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1252{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1253{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1254{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1255{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1256{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1257{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1258{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1259
1260{ 8704, "forall","for all, U+2200 ISOtech" },
1261{ 8706, "part", "partial differential, U+2202 ISOtech" },
1262{ 8707, "exist","there exists, U+2203 ISOtech" },
1263{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1264{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1265{ 8712, "isin", "element of, U+2208 ISOtech" },
1266{ 8713, "notin","not an element of, U+2209 ISOtech" },
1267{ 8715, "ni", "contains as member, U+220B ISOtech" },
1268{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1269{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1270{ 8722, "minus","minus sign, U+2212 ISOtech" },
1271{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1272{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1273{ 8733, "prop", "proportional to, U+221D ISOtech" },
1274{ 8734, "infin","infinity, U+221E ISOtech" },
1275{ 8736, "ang", "angle, U+2220 ISOamso" },
1276{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1277{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1278{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1279{ 8746, "cup", "union = cup, U+222A ISOtech" },
1280{ 8747, "int", "integral, U+222B ISOtech" },
1281{ 8756, "there4","therefore, U+2234 ISOtech" },
1282{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1283{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1284{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1285{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1286{ 8801, "equiv","identical to, U+2261 ISOtech" },
1287{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1288{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1289{ 8834, "sub", "subset of, U+2282 ISOtech" },
1290{ 8835, "sup", "superset of, U+2283 ISOtech" },
1291{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1292{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1293{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1294{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1295{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1296{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1297{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1298{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1299{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1300{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1301{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1302{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1303{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1304{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1305
1306{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1307{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1308{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1309{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1310
1311};
1312
1313/************************************************************************
1314 * *
1315 * Commodity functions to handle entities *
1316 * *
1317 ************************************************************************/
1318
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> accordingly. It expects
 * both <buffer> and <buffer>_size to be variables in the calling scope and
 * must only be used in functions returning a pointer: on allocation
 * failure the old buffer is released and the enclosing function returns
 * NULL. The reallocation goes through a temporary so that the original
 * block is not lost (and leaked) when xmlRealloc() fails.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                  buffer##_size * sizeof(xmlChar));	\
    if (buffer##_tmp == NULL) {						\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1330
1331/**
1332 * htmlEntityLookup:
1333 * @name: the entity name
1334 *
1335 * Lookup the given entity in EntitiesTable
1336 *
1337 * TODO: the linear scan is really ugly, an hash table is really needed.
1338 *
1339 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1340 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001341const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001342htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001343 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001344
1345 for (i = 0;i < (sizeof(html40EntitiesTable)/
1346 sizeof(html40EntitiesTable[0]));i++) {
1347 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1348#ifdef DEBUG
1349 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1350#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001351 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001352 }
1353 }
1354 return(NULL);
1355}
1356
1357/**
1358 * htmlEntityValueLookup:
1359 * @value: the entity's unicode value
1360 *
1361 * Lookup the given entity in EntitiesTable
1362 *
1363 * TODO: the linear scan is really ugly, an hash table is really needed.
1364 *
1365 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1366 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001367const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001368htmlEntityValueLookup(unsigned int value) {
1369 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001370#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001371 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001372#endif
1373
1374 for (i = 0;i < (sizeof(html40EntitiesTable)/
1375 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001376 if (html40EntitiesTable[i].value >= value) {
1377 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001378 break;
1379#ifdef DEBUG
1380 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1381#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001382 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001383 }
1384#ifdef DEBUG
1385 if (lv > html40EntitiesTable[i].value) {
1386 xmlGenericError(xmlGenericErrorContext,
1387 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1388 lv, html40EntitiesTable[i].value);
1389 }
1390 lv = html40EntitiesTable[i].value;
1391#endif
1392 }
1393 return(NULL);
1394}
1395
1396/**
1397 * UTF8ToHtml:
1398 * @out: a pointer to an array of bytes to store the result
1399 * @outlen: the length of @out
1400 * @in: a pointer to an array of UTF-8 chars
1401 * @inlen: the length of @in
1402 *
1403 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1404 * plus HTML entities block of chars out.
1405 *
1406 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1407 * The value of @inlen after return is the number of octets consumed
1408 * as the return value is positive, else unpredictiable.
1409 * The value of @outlen after return is the number of octets consumed.
1410 */
1411int
1412UTF8ToHtml(unsigned char* out, int *outlen,
1413 const unsigned char* in, int *inlen) {
1414 const unsigned char* processed = in;
1415 const unsigned char* outend;
1416 const unsigned char* outstart = out;
1417 const unsigned char* instart = in;
1418 const unsigned char* inend;
1419 unsigned int c, d;
1420 int trailing;
1421
1422 if (in == NULL) {
1423 /*
1424 * initialization nothing to do
1425 */
1426 *outlen = 0;
1427 *inlen = 0;
1428 return(0);
1429 }
1430 inend = in + (*inlen);
1431 outend = out + (*outlen);
1432 while (in < inend) {
1433 d = *in++;
1434 if (d < 0x80) { c= d; trailing= 0; }
1435 else if (d < 0xC0) {
1436 /* trailing byte in leading position */
1437 *outlen = out - outstart;
1438 *inlen = processed - instart;
1439 return(-2);
1440 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1441 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1442 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1443 else {
1444 /* no chance for this in Ascii */
1445 *outlen = out - outstart;
1446 *inlen = processed - instart;
1447 return(-2);
1448 }
1449
1450 if (inend - in < trailing) {
1451 break;
1452 }
1453
1454 for ( ; trailing; trailing--) {
1455 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1456 break;
1457 c <<= 6;
1458 c |= d & 0x3F;
1459 }
1460
1461 /* assertion: c is a single UTF-4 value */
1462 if (c < 0x80) {
1463 if (out + 1 >= outend)
1464 break;
1465 *out++ = c;
1466 } else {
1467 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001468 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001469
1470 /*
1471 * Try to lookup a predefined HTML entity for it
1472 */
1473
1474 ent = htmlEntityValueLookup(c);
1475 if (ent == NULL) {
1476 /* no chance for this in Ascii */
1477 *outlen = out - outstart;
1478 *inlen = processed - instart;
1479 return(-2);
1480 }
1481 len = strlen(ent->name);
1482 if (out + 2 + len >= outend)
1483 break;
1484 *out++ = '&';
1485 memcpy(out, ent->name, len);
1486 out += len;
1487 *out++ = ';';
1488 }
1489 processed = in;
1490 }
1491 *outlen = out - outstart;
1492 *inlen = processed - instart;
1493 return(0);
1494}
1495
1496/**
1497 * htmlEncodeEntities:
1498 * @out: a pointer to an array of bytes to store the result
1499 * @outlen: the length of @out
1500 * @in: a pointer to an array of UTF-8 chars
1501 * @inlen: the length of @in
1502 * @quoteChar: the quote character to escape (' or ") or zero.
1503 *
1504 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1505 * plus HTML entities block of chars out.
1506 *
1507 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1508 * The value of @inlen after return is the number of octets consumed
1509 * as the return value is positive, else unpredictiable.
1510 * The value of @outlen after return is the number of octets consumed.
1511 */
1512int
1513htmlEncodeEntities(unsigned char* out, int *outlen,
1514 const unsigned char* in, int *inlen, int quoteChar) {
1515 const unsigned char* processed = in;
1516 const unsigned char* outend = out + (*outlen);
1517 const unsigned char* outstart = out;
1518 const unsigned char* instart = in;
1519 const unsigned char* inend = in + (*inlen);
1520 unsigned int c, d;
1521 int trailing;
1522
1523 while (in < inend) {
1524 d = *in++;
1525 if (d < 0x80) { c= d; trailing= 0; }
1526 else if (d < 0xC0) {
1527 /* trailing byte in leading position */
1528 *outlen = out - outstart;
1529 *inlen = processed - instart;
1530 return(-2);
1531 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1532 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1533 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1534 else {
1535 /* no chance for this in Ascii */
1536 *outlen = out - outstart;
1537 *inlen = processed - instart;
1538 return(-2);
1539 }
1540
1541 if (inend - in < trailing)
1542 break;
1543
1544 while (trailing--) {
1545 if (((d= *in++) & 0xC0) != 0x80) {
1546 *outlen = out - outstart;
1547 *inlen = processed - instart;
1548 return(-2);
1549 }
1550 c <<= 6;
1551 c |= d & 0x3F;
1552 }
1553
1554 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001555 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1556 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001557 if (out >= outend)
1558 break;
1559 *out++ = c;
1560 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001561 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001562 const char *cp;
1563 char nbuf[16];
1564 int len;
1565
1566 /*
1567 * Try to lookup a predefined HTML entity for it
1568 */
1569 ent = htmlEntityValueLookup(c);
1570 if (ent == NULL) {
1571 sprintf(nbuf, "#%u", c);
1572 cp = nbuf;
1573 }
1574 else
1575 cp = ent->name;
1576 len = strlen(cp);
1577 if (out + 2 + len > outend)
1578 break;
1579 *out++ = '&';
1580 memcpy(out, cp, len);
1581 out += len;
1582 *out++ = ';';
1583 }
1584 processed = in;
1585 }
1586 *outlen = out - outstart;
1587 *inlen = processed - instart;
1588 return(0);
1589}
1590
1591/**
1592 * htmlDecodeEntities:
1593 * @ctxt: the parser context
1594 * @len: the len to decode (in bytes !), -1 for no size limit
1595 * @end: an end marker xmlChar, 0 if none
1596 * @end2: an end marker xmlChar, 0 if none
1597 * @end3: an end marker xmlChar, 0 if none
1598 *
1599 * Subtitute the HTML entities by their value
1600 *
1601 * DEPRECATED !!!!
1602 *
1603 * Returns A newly allocated string with the substitution done. The caller
1604 * must deallocate it !
1605 */
1606xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001607htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1608 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001609 static int deprecated = 0;
1610 if (!deprecated) {
1611 xmlGenericError(xmlGenericErrorContext,
1612 "htmlDecodeEntities() deprecated function reached\n");
1613 deprecated = 1;
1614 }
1615 return(NULL);
1616#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001617 xmlChar *name = NULL;
1618 xmlChar *buffer = NULL;
1619 unsigned int buffer_size = 0;
1620 unsigned int nbchars = 0;
1621 htmlEntityDescPtr ent;
1622 unsigned int max = (unsigned int) len;
1623 int c,l;
1624
1625 if (ctxt->depth > 40) {
1626 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1627 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1628 ctxt->sax->error(ctxt->userData,
1629 "Detected entity reference loop\n");
1630 ctxt->wellFormed = 0;
1631 ctxt->disableSAX = 1;
1632 return(NULL);
1633 }
1634
1635 /*
1636 * allocate a translation buffer.
1637 */
1638 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1639 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1640 if (buffer == NULL) {
1641 perror("xmlDecodeEntities: malloc failed");
1642 return(NULL);
1643 }
1644
1645 /*
1646 * Ok loop until we reach one of the ending char or a size limit.
1647 */
1648 c = CUR_CHAR(l);
1649 while ((nbchars < max) && (c != end) &&
1650 (c != end2) && (c != end3)) {
1651
1652 if (c == 0) break;
1653 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1654 int val = htmlParseCharRef(ctxt);
1655 COPY_BUF(0,buffer,nbchars,val);
1656 NEXTL(l);
1657 } else if ((c == '&') && (ctxt->token != '&')) {
1658 ent = htmlParseEntityRef(ctxt, &name);
1659 if (name != NULL) {
1660 if (ent != NULL) {
1661 int val = ent->value;
1662 COPY_BUF(0,buffer,nbchars,val);
1663 NEXTL(l);
1664 } else {
1665 const xmlChar *cur = name;
1666
1667 buffer[nbchars++] = '&';
1668 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1669 growBuffer(buffer);
1670 }
1671 while (*cur != 0) {
1672 buffer[nbchars++] = *cur++;
1673 }
1674 buffer[nbchars++] = ';';
1675 }
1676 }
1677 } else {
1678 COPY_BUF(l,buffer,nbchars,c);
1679 NEXTL(l);
1680 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1681 growBuffer(buffer);
1682 }
1683 }
1684 c = CUR_CHAR(l);
1685 }
1686 buffer[nbchars++] = 0;
1687 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001688#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001689}
1690
1691/************************************************************************
1692 * *
1693 * Commodity functions to handle streams *
1694 * *
1695 ************************************************************************/
1696
1697/**
Owen Taylor3473f882001-02-23 17:55:21 +00001698 * htmlNewInputStream:
1699 * @ctxt: an HTML parser context
1700 *
1701 * Create a new input stream structure
1702 * Returns the new input stream or NULL
1703 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001704static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001705htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1706 htmlParserInputPtr input;
1707
1708 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1709 if (input == NULL) {
1710 ctxt->errNo = XML_ERR_NO_MEMORY;
1711 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1712 ctxt->sax->error(ctxt->userData,
1713 "malloc: couldn't allocate a new input stream\n");
1714 return(NULL);
1715 }
1716 memset(input, 0, sizeof(htmlParserInput));
1717 input->filename = NULL;
1718 input->directory = NULL;
1719 input->base = NULL;
1720 input->cur = NULL;
1721 input->buf = NULL;
1722 input->line = 1;
1723 input->col = 1;
1724 input->buf = NULL;
1725 input->free = NULL;
1726 input->version = NULL;
1727 input->consumed = 0;
1728 input->length = 0;
1729 return(input);
1730}
1731
1732
1733/************************************************************************
1734 * *
1735 * Commodity functions, cleanup needed ? *
1736 * *
1737 ************************************************************************/
1738
1739/**
1740 * areBlanks:
1741 * @ctxt: an HTML parser context
1742 * @str: a xmlChar *
1743 * @len: the size of @str
1744 *
1745 * Is this a sequence of blank chars that one can ignore ?
1746 *
1747 * Returns 1 if ignorable 0 otherwise.
1748 */
1749
1750static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1751 int i;
1752 xmlNodePtr lastChild;
1753
1754 for (i = 0;i < len;i++)
1755 if (!(IS_BLANK(str[i]))) return(0);
1756
1757 if (CUR == 0) return(1);
1758 if (CUR != '<') return(0);
1759 if (ctxt->name == NULL)
1760 return(1);
1761 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1762 return(1);
1763 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1764 return(1);
1765 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1766 return(1);
1767 if (ctxt->node == NULL) return(0);
1768 lastChild = xmlGetLastChild(ctxt->node);
1769 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001770 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1771 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001772 } else if (xmlNodeIsText(lastChild)) {
1773 return(0);
1774 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1775 return(0);
1776 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1777 return(0);
1778 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1779 return(0);
1780 }
1781 return(1);
1782}
1783
1784/**
Owen Taylor3473f882001-02-23 17:55:21 +00001785 * htmlNewDocNoDtD:
1786 * @URI: URI for the dtd, or NULL
1787 * @ExternalID: the external ID of the DTD, or NULL
1788 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001789 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1790 * are NULL
1791 *
Owen Taylor3473f882001-02-23 17:55:21 +00001792 * Returns a new document, do not intialize the DTD if not provided
1793 */
1794htmlDocPtr
1795htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1796 xmlDocPtr cur;
1797
1798 /*
1799 * Allocate a new document and fill the fields.
1800 */
1801 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1802 if (cur == NULL) {
1803 xmlGenericError(xmlGenericErrorContext,
1804 "xmlNewDoc : malloc failed\n");
1805 return(NULL);
1806 }
1807 memset(cur, 0, sizeof(xmlDoc));
1808
1809 cur->type = XML_HTML_DOCUMENT_NODE;
1810 cur->version = NULL;
1811 cur->intSubset = NULL;
1812 if ((ExternalID != NULL) ||
1813 (URI != NULL))
1814 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1815 cur->doc = cur;
1816 cur->name = NULL;
1817 cur->children = NULL;
1818 cur->extSubset = NULL;
1819 cur->oldNs = NULL;
1820 cur->encoding = NULL;
1821 cur->standalone = 1;
1822 cur->compression = 0;
1823 cur->ids = NULL;
1824 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001825 cur->_private = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001826 return(cur);
1827}
1828
1829/**
1830 * htmlNewDoc:
1831 * @URI: URI for the dtd, or NULL
1832 * @ExternalID: the external ID of the DTD, or NULL
1833 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001834 * Creates a new HTML document
1835 *
Owen Taylor3473f882001-02-23 17:55:21 +00001836 * Returns a new document
1837 */
1838htmlDocPtr
1839htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1840 if ((URI == NULL) && (ExternalID == NULL))
1841 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001842 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1843 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001844
1845 return(htmlNewDocNoDtD(URI, ExternalID));
1846}
1847
1848
1849/************************************************************************
1850 * *
1851 * The parser itself *
1852 * Relates to http://www.w3.org/TR/html40 *
1853 * *
1854 ************************************************************************/
1855
1856/************************************************************************
1857 * *
1858 * The parser itself *
1859 * *
1860 ************************************************************************/
1861
1862/**
1863 * htmlParseHTMLName:
1864 * @ctxt: an HTML parser context
1865 *
1866 * parse an HTML tag or attribute name, note that we convert it to lowercase
1867 * since HTML names are not case-sensitive.
1868 *
1869 * Returns the Tag Name parsed or NULL
1870 */
1871
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001872static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001873htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1874 xmlChar *ret = NULL;
1875 int i = 0;
1876 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1877
1878 if (!IS_LETTER(CUR) && (CUR != '_') &&
1879 (CUR != ':')) return(NULL);
1880
1881 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1882 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1883 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1884 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1885 else loc[i] = CUR;
1886 i++;
1887
1888 NEXT;
1889 }
1890
1891 ret = xmlStrndup(loc, i);
1892
1893 return(ret);
1894}
1895
1896/**
1897 * htmlParseName:
1898 * @ctxt: an HTML parser context
1899 *
1900 * parse an HTML name, this routine is case sensistive.
1901 *
1902 * Returns the Name parsed or NULL
1903 */
1904
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001905static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001906htmlParseName(htmlParserCtxtPtr ctxt) {
1907 xmlChar buf[HTML_MAX_NAMELEN];
1908 int len = 0;
1909
1910 GROW;
1911 if (!IS_LETTER(CUR) && (CUR != '_')) {
1912 return(NULL);
1913 }
1914
1915 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1916 (CUR == '.') || (CUR == '-') ||
1917 (CUR == '_') || (CUR == ':') ||
1918 (IS_COMBINING(CUR)) ||
1919 (IS_EXTENDER(CUR))) {
1920 buf[len++] = CUR;
1921 NEXT;
1922 if (len >= HTML_MAX_NAMELEN) {
1923 xmlGenericError(xmlGenericErrorContext,
1924 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1925 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1926 (CUR == '.') || (CUR == '-') ||
1927 (CUR == '_') || (CUR == ':') ||
1928 (IS_COMBINING(CUR)) ||
1929 (IS_EXTENDER(CUR)))
1930 NEXT;
1931 break;
1932 }
1933 }
1934 return(xmlStrndup(buf, len));
1935}
1936
1937/**
1938 * htmlParseHTMLAttribute:
1939 * @ctxt: an HTML parser context
1940 * @stop: a char stop value
1941 *
1942 * parse an HTML attribute value till the stop (quote), if
1943 * stop is 0 then it stops at the first space
1944 *
1945 * Returns the attribute parsed or NULL
1946 */
1947
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001948static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001949htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1950 xmlChar *buffer = NULL;
1951 int buffer_size = 0;
1952 xmlChar *out = NULL;
1953 xmlChar *name = NULL;
1954
1955 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001956 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001957
1958 /*
1959 * allocate a translation buffer.
1960 */
1961 buffer_size = HTML_PARSER_BUFFER_SIZE;
1962 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1963 if (buffer == NULL) {
1964 perror("htmlParseHTMLAttribute: malloc failed");
1965 return(NULL);
1966 }
1967 out = buffer;
1968
1969 /*
1970 * Ok loop until we reach one of the ending chars
1971 */
1972 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1973 if ((stop == 0) && (IS_BLANK(CUR))) break;
1974 if (CUR == '&') {
1975 if (NXT(1) == '#') {
1976 unsigned int c;
1977 int bits;
1978
1979 c = htmlParseCharRef(ctxt);
1980 if (c < 0x80)
1981 { *out++ = c; bits= -6; }
1982 else if (c < 0x800)
1983 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1984 else if (c < 0x10000)
1985 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1986 else
1987 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1988
1989 for ( ; bits >= 0; bits-= 6) {
1990 *out++ = ((c >> bits) & 0x3F) | 0x80;
1991 }
1992 } else {
1993 ent = htmlParseEntityRef(ctxt, &name);
1994 if (name == NULL) {
1995 *out++ = '&';
1996 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001997 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001998
1999 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002000 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002001 }
2002 } else if (ent == NULL) {
2003 *out++ = '&';
2004 cur = name;
2005 while (*cur != 0) {
2006 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002007 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002008
2009 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002010 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002011 }
2012 *out++ = *cur++;
2013 }
2014 xmlFree(name);
2015 } else {
2016 unsigned int c;
2017 int bits;
2018
2019 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002020 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002021
2022 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002023 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002024 }
2025 c = (xmlChar)ent->value;
2026 if (c < 0x80)
2027 { *out++ = c; bits= -6; }
2028 else if (c < 0x800)
2029 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2030 else if (c < 0x10000)
2031 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2032 else
2033 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2034
2035 for ( ; bits >= 0; bits-= 6) {
2036 *out++ = ((c >> bits) & 0x3F) | 0x80;
2037 }
2038 xmlFree(name);
2039 }
2040 }
2041 } else {
2042 unsigned int c;
2043 int bits, l;
2044
2045 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002046 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002047
2048 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002049 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002050 }
2051 c = CUR_CHAR(l);
2052 if (c < 0x80)
2053 { *out++ = c; bits= -6; }
2054 else if (c < 0x800)
2055 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2056 else if (c < 0x10000)
2057 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2058 else
2059 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2060
2061 for ( ; bits >= 0; bits-= 6) {
2062 *out++ = ((c >> bits) & 0x3F) | 0x80;
2063 }
2064 NEXT;
2065 }
2066 }
2067 *out++ = 0;
2068 return(buffer);
2069}
2070
2071/**
Owen Taylor3473f882001-02-23 17:55:21 +00002072 * htmlParseEntityRef:
2073 * @ctxt: an HTML parser context
2074 * @str: location to store the entity name
2075 *
2076 * parse an HTML ENTITY references
2077 *
2078 * [68] EntityRef ::= '&' Name ';'
2079 *
2080 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2081 * if non-NULL *str will have to be freed by the caller.
2082 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002083const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002084htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2085 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002086 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002087 *str = NULL;
2088
2089 if (CUR == '&') {
2090 NEXT;
2091 name = htmlParseName(ctxt);
2092 if (name == NULL) {
2093 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2094 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2095 ctxt->wellFormed = 0;
2096 } else {
2097 GROW;
2098 if (CUR == ';') {
2099 *str = name;
2100
2101 /*
2102 * Lookup the entity in the table.
2103 */
2104 ent = htmlEntityLookup(name);
2105 if (ent != NULL) /* OK that's ugly !!! */
2106 NEXT;
2107 } else {
2108 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2109 ctxt->sax->error(ctxt->userData,
2110 "htmlParseEntityRef: expecting ';'\n");
2111 *str = name;
2112 }
2113 }
2114 }
2115 return(ent);
2116}
2117
2118/**
2119 * htmlParseAttValue:
2120 * @ctxt: an HTML parser context
2121 *
2122 * parse a value for an attribute
2123 * Note: the parser won't do substitution of entities here, this
2124 * will be handled later in xmlStringGetNodeList, unless it was
2125 * asked for ctxt->replaceEntities != 0
2126 *
2127 * Returns the AttValue parsed or NULL.
2128 */
2129
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002130static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002131htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2132 xmlChar *ret = NULL;
2133
2134 if (CUR == '"') {
2135 NEXT;
2136 ret = htmlParseHTMLAttribute(ctxt, '"');
2137 if (CUR != '"') {
2138 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2139 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2140 ctxt->wellFormed = 0;
2141 } else
2142 NEXT;
2143 } else if (CUR == '\'') {
2144 NEXT;
2145 ret = htmlParseHTMLAttribute(ctxt, '\'');
2146 if (CUR != '\'') {
2147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2148 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2149 ctxt->wellFormed = 0;
2150 } else
2151 NEXT;
2152 } else {
2153 /*
2154 * That's an HTMLism, the attribute value may not be quoted
2155 */
2156 ret = htmlParseHTMLAttribute(ctxt, 0);
2157 if (ret == NULL) {
2158 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2159 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2160 ctxt->wellFormed = 0;
2161 }
2162 }
2163 return(ret);
2164}
2165
2166/**
2167 * htmlParseSystemLiteral:
2168 * @ctxt: an HTML parser context
2169 *
2170 * parse an HTML Literal
2171 *
2172 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2173 *
2174 * Returns the SystemLiteral parsed or NULL
2175 */
2176
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002177static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002178htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2179 const xmlChar *q;
2180 xmlChar *ret = NULL;
2181
2182 if (CUR == '"') {
2183 NEXT;
2184 q = CUR_PTR;
2185 while ((IS_CHAR(CUR)) && (CUR != '"'))
2186 NEXT;
2187 if (!IS_CHAR(CUR)) {
2188 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2189 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2190 ctxt->wellFormed = 0;
2191 } else {
2192 ret = xmlStrndup(q, CUR_PTR - q);
2193 NEXT;
2194 }
2195 } else if (CUR == '\'') {
2196 NEXT;
2197 q = CUR_PTR;
2198 while ((IS_CHAR(CUR)) && (CUR != '\''))
2199 NEXT;
2200 if (!IS_CHAR(CUR)) {
2201 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2202 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2203 ctxt->wellFormed = 0;
2204 } else {
2205 ret = xmlStrndup(q, CUR_PTR - q);
2206 NEXT;
2207 }
2208 } else {
2209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2210 ctxt->sax->error(ctxt->userData,
2211 "SystemLiteral \" or ' expected\n");
2212 ctxt->wellFormed = 0;
2213 }
2214
2215 return(ret);
2216}
2217
2218/**
2219 * htmlParsePubidLiteral:
2220 * @ctxt: an HTML parser context
2221 *
2222 * parse an HTML public literal
2223 *
2224 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2225 *
2226 * Returns the PubidLiteral parsed or NULL.
2227 */
2228
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002229static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002230htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2231 const xmlChar *q;
2232 xmlChar *ret = NULL;
2233 /*
2234 * Name ::= (Letter | '_') (NameChar)*
2235 */
2236 if (CUR == '"') {
2237 NEXT;
2238 q = CUR_PTR;
2239 while (IS_PUBIDCHAR(CUR)) NEXT;
2240 if (CUR != '"') {
2241 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2242 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2243 ctxt->wellFormed = 0;
2244 } else {
2245 ret = xmlStrndup(q, CUR_PTR - q);
2246 NEXT;
2247 }
2248 } else if (CUR == '\'') {
2249 NEXT;
2250 q = CUR_PTR;
2251 while ((IS_LETTER(CUR)) && (CUR != '\''))
2252 NEXT;
2253 if (!IS_LETTER(CUR)) {
2254 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2255 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2256 ctxt->wellFormed = 0;
2257 } else {
2258 ret = xmlStrndup(q, CUR_PTR - q);
2259 NEXT;
2260 }
2261 } else {
2262 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2263 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2264 ctxt->wellFormed = 0;
2265 }
2266
2267 return(ret);
2268}
2269
2270/**
2271 * htmlParseScript:
2272 * @ctxt: an HTML parser context
2273 *
2274 * parse the content of an HTML SCRIPT or STYLE element
2275 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2276 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2277 * http://www.w3.org/TR/html4/types.html#type-script
2278 * http://www.w3.org/TR/html4/types.html#h-6.15
2279 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2280 *
2281 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2282 * element and the value of intrinsic event attributes. User agents must
2283 * not evaluate script data as HTML markup but instead must pass it on as
2284 * data to a script engine.
2285 * NOTES:
2286 * - The content is passed like CDATA
2287 * - the attributes for style and scripting "onXXX" are also described
2288 * as CDATA but SGML allows entities references in attributes so their
2289 * processing is identical as other attributes
2290 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002291static void
Owen Taylor3473f882001-02-23 17:55:21 +00002292htmlParseScript(htmlParserCtxtPtr ctxt) {
2293 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2294 int nbchar = 0;
2295 xmlChar cur;
2296
2297 SHRINK;
2298 cur = CUR;
2299 while (IS_CHAR(cur)) {
2300 if ((cur == '<') && (NXT(1) == '/')) {
2301 /*
2302 * One should break here, the specification is clear:
2303 * Authors should therefore escape "</" within the content.
2304 * Escape mechanisms are specific to each scripting or
2305 * style sheet language.
2306 */
2307 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2308 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2309 break; /* while */
2310 }
2311 buf[nbchar++] = cur;
2312 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2313 if (ctxt->sax->cdataBlock!= NULL) {
2314 /*
2315 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2316 */
2317 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2318 }
2319 nbchar = 0;
2320 }
2321 NEXT;
2322 cur = CUR;
2323 }
2324 if (!(IS_CHAR(cur))) {
2325 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2326 ctxt->sax->error(ctxt->userData,
2327 "Invalid char in CDATA 0x%X\n", cur);
2328 ctxt->wellFormed = 0;
2329 NEXT;
2330 }
2331
2332 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2333 if (ctxt->sax->cdataBlock!= NULL) {
2334 /*
2335 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2336 */
2337 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2338 }
2339 }
2340}
2341
2342
2343/**
2344 * htmlParseCharData:
2345 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002346 *
2347 * parse a CharData section.
2348 * if we are within a CDATA section ']]>' marks an end of section.
2349 *
2350 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2351 */
2352
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002353static void
2354htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002355 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2356 int nbchar = 0;
2357 int cur, l;
2358
2359 SHRINK;
2360 cur = CUR_CHAR(l);
2361 while (((cur != '<') || (ctxt->token == '<')) &&
2362 ((cur != '&') || (ctxt->token == '&')) &&
2363 (IS_CHAR(cur))) {
2364 COPY_BUF(l,buf,nbchar,cur);
2365 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2366 /*
2367 * Ok the segment is to be consumed as chars.
2368 */
2369 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2370 if (areBlanks(ctxt, buf, nbchar)) {
2371 if (ctxt->sax->ignorableWhitespace != NULL)
2372 ctxt->sax->ignorableWhitespace(ctxt->userData,
2373 buf, nbchar);
2374 } else {
2375 htmlCheckParagraph(ctxt);
2376 if (ctxt->sax->characters != NULL)
2377 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2378 }
2379 }
2380 nbchar = 0;
2381 }
2382 NEXTL(l);
2383 cur = CUR_CHAR(l);
2384 }
2385 if (nbchar != 0) {
2386 /*
2387 * Ok the segment is to be consumed as chars.
2388 */
2389 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2390 if (areBlanks(ctxt, buf, nbchar)) {
2391 if (ctxt->sax->ignorableWhitespace != NULL)
2392 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2393 } else {
2394 htmlCheckParagraph(ctxt);
2395 if (ctxt->sax->characters != NULL)
2396 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2397 }
2398 }
2399 }
2400}
2401
2402/**
2403 * htmlParseExternalID:
2404 * @ctxt: an HTML parser context
2405 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002406 *
2407 * Parse an External ID or a Public ID
2408 *
Owen Taylor3473f882001-02-23 17:55:21 +00002409 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2410 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2411 *
2412 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2413 *
2414 * Returns the function returns SystemLiteral and in the second
2415 * case publicID receives PubidLiteral, is strict is off
2416 * it is possible to return NULL and have publicID set.
2417 */
2418
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002419static xmlChar *
2420htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002421 xmlChar *URI = NULL;
2422
2423 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2424 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2425 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2426 SKIP(6);
2427 if (!IS_BLANK(CUR)) {
2428 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2429 ctxt->sax->error(ctxt->userData,
2430 "Space required after 'SYSTEM'\n");
2431 ctxt->wellFormed = 0;
2432 }
2433 SKIP_BLANKS;
2434 URI = htmlParseSystemLiteral(ctxt);
2435 if (URI == NULL) {
2436 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2437 ctxt->sax->error(ctxt->userData,
2438 "htmlParseExternalID: SYSTEM, no URI\n");
2439 ctxt->wellFormed = 0;
2440 }
2441 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2442 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2443 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2444 SKIP(6);
2445 if (!IS_BLANK(CUR)) {
2446 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2447 ctxt->sax->error(ctxt->userData,
2448 "Space required after 'PUBLIC'\n");
2449 ctxt->wellFormed = 0;
2450 }
2451 SKIP_BLANKS;
2452 *publicID = htmlParsePubidLiteral(ctxt);
2453 if (*publicID == NULL) {
2454 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2455 ctxt->sax->error(ctxt->userData,
2456 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2457 ctxt->wellFormed = 0;
2458 }
2459 SKIP_BLANKS;
2460 if ((CUR == '"') || (CUR == '\'')) {
2461 URI = htmlParseSystemLiteral(ctxt);
2462 }
2463 }
2464 return(URI);
2465}
2466
2467/**
2468 * htmlParseComment:
2469 * @ctxt: an HTML parser context
2470 *
2471 * Parse an XML (SGML) comment <!-- .... -->
2472 *
2473 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2474 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002475static void
Owen Taylor3473f882001-02-23 17:55:21 +00002476htmlParseComment(htmlParserCtxtPtr ctxt) {
2477 xmlChar *buf = NULL;
2478 int len;
2479 int size = HTML_PARSER_BUFFER_SIZE;
2480 int q, ql;
2481 int r, rl;
2482 int cur, l;
2483 xmlParserInputState state;
2484
2485 /*
2486 * Check that there is a comment right here.
2487 */
2488 if ((RAW != '<') || (NXT(1) != '!') ||
2489 (NXT(2) != '-') || (NXT(3) != '-')) return;
2490
2491 state = ctxt->instate;
2492 ctxt->instate = XML_PARSER_COMMENT;
2493 SHRINK;
2494 SKIP(4);
2495 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2496 if (buf == NULL) {
2497 xmlGenericError(xmlGenericErrorContext,
2498 "malloc of %d byte failed\n", size);
2499 ctxt->instate = state;
2500 return;
2501 }
2502 q = CUR_CHAR(ql);
2503 NEXTL(ql);
2504 r = CUR_CHAR(rl);
2505 NEXTL(rl);
2506 cur = CUR_CHAR(l);
2507 len = 0;
2508 while (IS_CHAR(cur) &&
2509 ((cur != '>') ||
2510 (r != '-') || (q != '-'))) {
2511 if (len + 5 >= size) {
2512 size *= 2;
2513 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2514 if (buf == NULL) {
2515 xmlGenericError(xmlGenericErrorContext,
2516 "realloc of %d byte failed\n", size);
2517 ctxt->instate = state;
2518 return;
2519 }
2520 }
2521 COPY_BUF(ql,buf,len,q);
2522 q = r;
2523 ql = rl;
2524 r = cur;
2525 rl = l;
2526 NEXTL(l);
2527 cur = CUR_CHAR(l);
2528 if (cur == 0) {
2529 SHRINK;
2530 GROW;
2531 cur = CUR_CHAR(l);
2532 }
2533 }
2534 buf[len] = 0;
2535 if (!IS_CHAR(cur)) {
2536 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2537 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2538 ctxt->sax->error(ctxt->userData,
2539 "Comment not terminated \n<!--%.50s\n", buf);
2540 ctxt->wellFormed = 0;
2541 xmlFree(buf);
2542 } else {
2543 NEXT;
2544 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2545 (!ctxt->disableSAX))
2546 ctxt->sax->comment(ctxt->userData, buf);
2547 xmlFree(buf);
2548 }
2549 ctxt->instate = state;
2550}
2551
2552/**
2553 * htmlParseCharRef:
2554 * @ctxt: an HTML parser context
2555 *
2556 * parse Reference declarations
2557 *
2558 * [66] CharRef ::= '&#' [0-9]+ ';' |
2559 * '&#x' [0-9a-fA-F]+ ';'
2560 *
2561 * Returns the value parsed (as an int)
2562 */
2563int
2564htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2565 int val = 0;
2566
2567 if ((CUR == '&') && (NXT(1) == '#') &&
2568 (NXT(2) == 'x')) {
2569 SKIP(3);
2570 while (CUR != ';') {
2571 if ((CUR >= '0') && (CUR <= '9'))
2572 val = val * 16 + (CUR - '0');
2573 else if ((CUR >= 'a') && (CUR <= 'f'))
2574 val = val * 16 + (CUR - 'a') + 10;
2575 else if ((CUR >= 'A') && (CUR <= 'F'))
2576 val = val * 16 + (CUR - 'A') + 10;
2577 else {
2578 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2579 ctxt->sax->error(ctxt->userData,
2580 "htmlParseCharRef: invalid hexadecimal value\n");
2581 ctxt->wellFormed = 0;
2582 return(0);
2583 }
2584 NEXT;
2585 }
2586 if (CUR == ';')
2587 NEXT;
2588 } else if ((CUR == '&') && (NXT(1) == '#')) {
2589 SKIP(2);
2590 while (CUR != ';') {
2591 if ((CUR >= '0') && (CUR <= '9'))
2592 val = val * 10 + (CUR - '0');
2593 else {
2594 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2595 ctxt->sax->error(ctxt->userData,
2596 "htmlParseCharRef: invalid decimal value\n");
2597 ctxt->wellFormed = 0;
2598 return(0);
2599 }
2600 NEXT;
2601 }
2602 if (CUR == ';')
2603 NEXT;
2604 } else {
2605 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2606 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2607 ctxt->wellFormed = 0;
2608 }
2609 /*
2610 * Check the value IS_CHAR ...
2611 */
2612 if (IS_CHAR(val)) {
2613 return(val);
2614 } else {
2615 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2616 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2617 val);
2618 ctxt->wellFormed = 0;
2619 }
2620 return(0);
2621}
2622
2623
2624/**
2625 * htmlParseDocTypeDecl :
2626 * @ctxt: an HTML parser context
2627 *
2628 * parse a DOCTYPE declaration
2629 *
2630 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2631 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2632 */
2633
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002634static void
Owen Taylor3473f882001-02-23 17:55:21 +00002635htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2636 xmlChar *name;
2637 xmlChar *ExternalID = NULL;
2638 xmlChar *URI = NULL;
2639
2640 /*
2641 * We know that '<!DOCTYPE' has been detected.
2642 */
2643 SKIP(9);
2644
2645 SKIP_BLANKS;
2646
2647 /*
2648 * Parse the DOCTYPE name.
2649 */
2650 name = htmlParseName(ctxt);
2651 if (name == NULL) {
2652 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2653 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2654 ctxt->wellFormed = 0;
2655 }
2656 /*
2657 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2658 */
2659
2660 SKIP_BLANKS;
2661
2662 /*
2663 * Check for SystemID and ExternalID
2664 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002665 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002666 SKIP_BLANKS;
2667
2668 /*
2669 * We should be at the end of the DOCTYPE declaration.
2670 */
2671 if (CUR != '>') {
2672 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002673 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002674 ctxt->wellFormed = 0;
2675 /* We shouldn't try to resynchronize ... */
2676 }
2677 NEXT;
2678
2679 /*
2680 * Create or update the document accordingly to the DOCTYPE
2681 */
2682 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2683 (!ctxt->disableSAX))
2684 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2685
2686 /*
2687 * Cleanup, since we don't use all those identifiers
2688 */
2689 if (URI != NULL) xmlFree(URI);
2690 if (ExternalID != NULL) xmlFree(ExternalID);
2691 if (name != NULL) xmlFree(name);
2692}
2693
2694/**
2695 * htmlParseAttribute:
2696 * @ctxt: an HTML parser context
2697 * @value: a xmlChar ** used to store the value of the attribute
2698 *
2699 * parse an attribute
2700 *
2701 * [41] Attribute ::= Name Eq AttValue
2702 *
2703 * [25] Eq ::= S? '=' S?
2704 *
2705 * With namespace:
2706 *
2707 * [NS 11] Attribute ::= QName Eq AttValue
2708 *
2709 * Also the case QName == xmlns:??? is handled independently as a namespace
2710 * definition.
2711 *
2712 * Returns the attribute name, and the value in *value.
2713 */
2714
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002715static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002716htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2717 xmlChar *name, *val = NULL;
2718
2719 *value = NULL;
2720 name = htmlParseHTMLName(ctxt);
2721 if (name == NULL) {
2722 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2723 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2724 ctxt->wellFormed = 0;
2725 return(NULL);
2726 }
2727
2728 /*
2729 * read the value
2730 */
2731 SKIP_BLANKS;
2732 if (CUR == '=') {
2733 NEXT;
2734 SKIP_BLANKS;
2735 val = htmlParseAttValue(ctxt);
2736 /******
2737 } else {
2738 * TODO : some attribute must have values, some may not
2739 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2740 ctxt->sax->warning(ctxt->userData,
2741 "No value for attribute %s\n", name); */
2742 }
2743
2744 *value = val;
2745 return(name);
2746}
2747
2748/**
2749 * htmlCheckEncoding:
2750 * @ctxt: an HTML parser context
2751 * @attvalue: the attribute value
2752 *
2753 * Checks an http-equiv attribute from a Meta tag to detect
2754 * the encoding
2755 * If a new encoding is detected the parser is switched to decode
2756 * it and pass UTF8
2757 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002758static void
Owen Taylor3473f882001-02-23 17:55:21 +00002759htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2760 const xmlChar *encoding;
2761
2762 if ((ctxt == NULL) || (attvalue == NULL))
2763 return;
2764
2765 /* do not change encoding */
2766 if (ctxt->input->encoding != NULL)
2767 return;
2768
2769 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2770 if (encoding != NULL) {
2771 encoding += 8;
2772 } else {
2773 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2774 if (encoding != NULL)
2775 encoding += 9;
2776 }
2777 if (encoding != NULL) {
2778 xmlCharEncoding enc;
2779 xmlCharEncodingHandlerPtr handler;
2780
2781 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2782
2783 if (ctxt->input->encoding != NULL)
2784 xmlFree((xmlChar *) ctxt->input->encoding);
2785 ctxt->input->encoding = xmlStrdup(encoding);
2786
2787 enc = xmlParseCharEncoding((const char *) encoding);
2788 /*
2789 * registered set of known encodings
2790 */
2791 if (enc != XML_CHAR_ENCODING_ERROR) {
2792 xmlSwitchEncoding(ctxt, enc);
2793 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2794 } else {
2795 /*
2796 * fallback for unknown encodings
2797 */
2798 handler = xmlFindCharEncodingHandler((const char *) encoding);
2799 if (handler != NULL) {
2800 xmlSwitchToEncoding(ctxt, handler);
2801 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2802 } else {
2803 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2804 }
2805 }
2806
2807 if ((ctxt->input->buf != NULL) &&
2808 (ctxt->input->buf->encoder != NULL) &&
2809 (ctxt->input->buf->raw != NULL) &&
2810 (ctxt->input->buf->buffer != NULL)) {
2811 int nbchars;
2812 int processed;
2813
2814 /*
2815 * convert as much as possible to the parser reading buffer.
2816 */
2817 processed = ctxt->input->cur - ctxt->input->base;
2818 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2819 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2820 ctxt->input->buf->buffer,
2821 ctxt->input->buf->raw);
2822 if (nbchars < 0) {
2823 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2824 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2825 ctxt->sax->error(ctxt->userData,
2826 "htmlCheckEncoding: encoder error\n");
2827 }
2828 ctxt->input->base =
2829 ctxt->input->cur = ctxt->input->buf->buffer->content;
2830 }
2831 }
2832}
2833
2834/**
2835 * htmlCheckMeta:
2836 * @ctxt: an HTML parser context
2837 * @atts: the attributes values
2838 *
2839 * Checks an attributes from a Meta tag
2840 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002841static void
Owen Taylor3473f882001-02-23 17:55:21 +00002842htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2843 int i;
2844 const xmlChar *att, *value;
2845 int http = 0;
2846 const xmlChar *content = NULL;
2847
2848 if ((ctxt == NULL) || (atts == NULL))
2849 return;
2850
2851 i = 0;
2852 att = atts[i++];
2853 while (att != NULL) {
2854 value = atts[i++];
2855 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2856 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2857 http = 1;
2858 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2859 content = value;
2860 att = atts[i++];
2861 }
2862 if ((http) && (content != NULL))
2863 htmlCheckEncoding(ctxt, content);
2864
2865}
2866
2867/**
2868 * htmlParseStartTag:
2869 * @ctxt: an HTML parser context
2870 *
2871 * parse a start of tag either for rule element or
2872 * EmptyElement. In both case we don't parse the tag closing chars.
2873 *
2874 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2875 *
2876 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2877 *
2878 * With namespace:
2879 *
2880 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2881 *
2882 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2883 *
2884 */
2885
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002886static void
Owen Taylor3473f882001-02-23 17:55:21 +00002887htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2888 xmlChar *name;
2889 xmlChar *attname;
2890 xmlChar *attvalue;
2891 const xmlChar **atts = NULL;
2892 int nbatts = 0;
2893 int maxatts = 0;
2894 int meta = 0;
2895 int i;
2896
2897 if (CUR != '<') return;
2898 NEXT;
2899
2900 GROW;
2901 name = htmlParseHTMLName(ctxt);
2902 if (name == NULL) {
2903 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2904 ctxt->sax->error(ctxt->userData,
2905 "htmlParseStartTag: invalid element name\n");
2906 ctxt->wellFormed = 0;
2907 /* Dump the bogus tag like browsers do */
2908 while ((IS_CHAR(CUR)) && (CUR != '>'))
2909 NEXT;
2910 return;
2911 }
2912 if (xmlStrEqual(name, BAD_CAST"meta"))
2913 meta = 1;
2914
2915 /*
2916 * Check for auto-closure of HTML elements.
2917 */
2918 htmlAutoClose(ctxt, name);
2919
2920 /*
2921 * Check for implied HTML elements.
2922 */
2923 htmlCheckImplied(ctxt, name);
2924
2925 /*
2926 * Avoid html at any level > 0, head at any level != 1
2927 * or any attempt to recurse body
2928 */
2929 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2930 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2931 ctxt->sax->error(ctxt->userData,
2932 "htmlParseStartTag: misplaced <html> tag\n");
2933 ctxt->wellFormed = 0;
2934 xmlFree(name);
2935 return;
2936 }
2937 if ((ctxt->nameNr != 1) &&
2938 (xmlStrEqual(name, BAD_CAST"head"))) {
2939 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2940 ctxt->sax->error(ctxt->userData,
2941 "htmlParseStartTag: misplaced <head> tag\n");
2942 ctxt->wellFormed = 0;
2943 xmlFree(name);
2944 return;
2945 }
2946 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002947 int indx;
2948 for (indx = 0;indx < ctxt->nameNr;indx++) {
2949 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002950 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2951 ctxt->sax->error(ctxt->userData,
2952 "htmlParseStartTag: misplaced <body> tag\n");
2953 ctxt->wellFormed = 0;
2954 xmlFree(name);
2955 return;
2956 }
2957 }
2958 }
2959
2960 /*
2961 * Now parse the attributes, it ends up with the ending
2962 *
2963 * (S Attribute)* S?
2964 */
2965 SKIP_BLANKS;
2966 while ((IS_CHAR(CUR)) &&
2967 (CUR != '>') &&
2968 ((CUR != '/') || (NXT(1) != '>'))) {
2969 long cons = ctxt->nbChars;
2970
2971 GROW;
2972 attname = htmlParseAttribute(ctxt, &attvalue);
2973 if (attname != NULL) {
2974
2975 /*
2976 * Well formedness requires at most one declaration of an attribute
2977 */
2978 for (i = 0; i < nbatts;i += 2) {
2979 if (xmlStrEqual(atts[i], attname)) {
2980 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2981 ctxt->sax->error(ctxt->userData,
2982 "Attribute %s redefined\n",
2983 attname);
2984 ctxt->wellFormed = 0;
2985 xmlFree(attname);
2986 if (attvalue != NULL)
2987 xmlFree(attvalue);
2988 goto failed;
2989 }
2990 }
2991
2992 /*
2993 * Add the pair to atts
2994 */
2995 if (atts == NULL) {
2996 maxatts = 10;
2997 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2998 if (atts == NULL) {
2999 xmlGenericError(xmlGenericErrorContext,
3000 "malloc of %ld byte failed\n",
3001 maxatts * (long)sizeof(xmlChar *));
3002 if (name != NULL) xmlFree(name);
3003 return;
3004 }
3005 } else if (nbatts + 4 > maxatts) {
3006 maxatts *= 2;
3007 atts = (const xmlChar **) xmlRealloc((void *) atts,
3008 maxatts * sizeof(xmlChar *));
3009 if (atts == NULL) {
3010 xmlGenericError(xmlGenericErrorContext,
3011 "realloc of %ld byte failed\n",
3012 maxatts * (long)sizeof(xmlChar *));
3013 if (name != NULL) xmlFree(name);
3014 return;
3015 }
3016 }
3017 atts[nbatts++] = attname;
3018 atts[nbatts++] = attvalue;
3019 atts[nbatts] = NULL;
3020 atts[nbatts + 1] = NULL;
3021 }
3022 else {
3023 /* Dump the bogus attribute string up to the next blank or
3024 * the end of the tag. */
3025 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3026 && ((CUR != '/') || (NXT(1) != '>')))
3027 NEXT;
3028 }
3029
3030failed:
3031 SKIP_BLANKS;
3032 if (cons == ctxt->nbChars) {
3033 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3034 ctxt->sax->error(ctxt->userData,
3035 "htmlParseStartTag: problem parsing attributes\n");
3036 ctxt->wellFormed = 0;
3037 break;
3038 }
3039 }
3040
3041 /*
3042 * Handle specific association to the META tag
3043 */
3044 if (meta)
3045 htmlCheckMeta(ctxt, atts);
3046
3047 /*
3048 * SAX: Start of Element !
3049 */
3050 htmlnamePush(ctxt, xmlStrdup(name));
3051#ifdef DEBUG
3052 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3053#endif
3054 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3055 ctxt->sax->startElement(ctxt->userData, name, atts);
3056
3057 if (atts != NULL) {
3058 for (i = 0;i < nbatts;i++) {
3059 if (atts[i] != NULL)
3060 xmlFree((xmlChar *) atts[i]);
3061 }
3062 xmlFree((void *) atts);
3063 }
3064 if (name != NULL) xmlFree(name);
3065}
3066
3067/**
3068 * htmlParseEndTag:
3069 * @ctxt: an HTML parser context
3070 *
3071 * parse an end of tag
3072 *
3073 * [42] ETag ::= '</' Name S? '>'
3074 *
3075 * With namespace
3076 *
3077 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003078 *
3079 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003080 */
3081
Daniel Veillardf420ac52001-07-04 16:04:09 +00003082static int
Owen Taylor3473f882001-02-23 17:55:21 +00003083htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3084 xmlChar *name;
3085 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003086 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003087
3088 if ((CUR != '<') || (NXT(1) != '/')) {
3089 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3090 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3091 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003092 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003093 }
3094 SKIP(2);
3095
3096 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003097 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003098
3099 /*
3100 * We should definitely be at the ending "S? '>'" part
3101 */
3102 SKIP_BLANKS;
3103 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3104 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3105 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3106 ctxt->wellFormed = 0;
3107 } else
3108 NEXT;
3109
3110 /*
3111 * If the name read is not one of the element in the parsing stack
3112 * then return, it's just an error.
3113 */
3114 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3115 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3116 }
3117 if (i < 0) {
3118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3119 ctxt->sax->error(ctxt->userData,
3120 "Unexpected end tag : %s\n", name);
3121 xmlFree(name);
3122 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003123 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003124 }
3125
3126
3127 /*
3128 * Check for auto-closure of HTML elements.
3129 */
3130
3131 htmlAutoCloseOnClose(ctxt, name);
3132
3133 /*
3134 * Well formedness constraints, opening and closing must match.
3135 * With the exception that the autoclose may have popped stuff out
3136 * of the stack.
3137 */
3138 if (!xmlStrEqual(name, ctxt->name)) {
3139#ifdef DEBUG
3140 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3141#endif
3142 if ((ctxt->name != NULL) &&
3143 (!xmlStrEqual(ctxt->name, name))) {
3144 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3145 ctxt->sax->error(ctxt->userData,
3146 "Opening and ending tag mismatch: %s and %s\n",
3147 name, ctxt->name);
3148 ctxt->wellFormed = 0;
3149 }
3150 }
3151
3152 /*
3153 * SAX: End of Tag
3154 */
3155 oldname = ctxt->name;
3156 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3157 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3158 ctxt->sax->endElement(ctxt->userData, name);
3159 oldname = htmlnamePop(ctxt);
3160 if (oldname != NULL) {
3161#ifdef DEBUG
3162 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3163#endif
3164 xmlFree(oldname);
3165#ifdef DEBUG
3166 } else {
3167 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3168#endif
3169 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003170 ret = 1;
3171 } else {
3172 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003173 }
3174
3175 if (name != NULL)
3176 xmlFree(name);
3177
Daniel Veillardf420ac52001-07-04 16:04:09 +00003178 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003179}
3180
3181
3182/**
3183 * htmlParseReference:
3184 * @ctxt: an HTML parser context
3185 *
3186 * parse and handle entity references in content,
3187 * this will end-up in a call to character() since this is either a
3188 * CharRef, or a predefined entity.
3189 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003190static void
Owen Taylor3473f882001-02-23 17:55:21 +00003191htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003192 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003193 xmlChar out[6];
3194 xmlChar *name;
3195 if (CUR != '&') return;
3196
3197 if (NXT(1) == '#') {
3198 unsigned int c;
3199 int bits, i = 0;
3200
3201 c = htmlParseCharRef(ctxt);
3202 if (c == 0)
3203 return;
3204
3205 if (c < 0x80) { out[i++]= c; bits= -6; }
3206 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3207 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3208 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3209
3210 for ( ; bits >= 0; bits-= 6) {
3211 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3212 }
3213 out[i] = 0;
3214
3215 htmlCheckParagraph(ctxt);
3216 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3217 ctxt->sax->characters(ctxt->userData, out, i);
3218 } else {
3219 ent = htmlParseEntityRef(ctxt, &name);
3220 if (name == NULL) {
3221 htmlCheckParagraph(ctxt);
3222 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3223 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3224 return;
3225 }
3226 if ((ent == NULL) || (ent->value <= 0)) {
3227 htmlCheckParagraph(ctxt);
3228 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3229 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3230 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3231 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3232 }
3233 } else {
3234 unsigned int c;
3235 int bits, i = 0;
3236
3237 c = ent->value;
3238 if (c < 0x80)
3239 { out[i++]= c; bits= -6; }
3240 else if (c < 0x800)
3241 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3242 else if (c < 0x10000)
3243 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3244 else
3245 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3246
3247 for ( ; bits >= 0; bits-= 6) {
3248 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3249 }
3250 out[i] = 0;
3251
3252 htmlCheckParagraph(ctxt);
3253 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3254 ctxt->sax->characters(ctxt->userData, out, i);
3255 }
3256 xmlFree(name);
3257 }
3258}
3259
3260/**
3261 * htmlParseContent:
3262 * @ctxt: an HTML parser context
3263 * @name: the node name
3264 *
3265 * Parse a content: comment, sub-element, reference or text.
3266 *
3267 */
3268
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003269static void
Owen Taylor3473f882001-02-23 17:55:21 +00003270htmlParseContent(htmlParserCtxtPtr ctxt) {
3271 xmlChar *currentNode;
3272 int depth;
3273
3274 currentNode = xmlStrdup(ctxt->name);
3275 depth = ctxt->nameNr;
3276 while (1) {
3277 long cons = ctxt->nbChars;
3278
3279 GROW;
3280 /*
3281 * Our tag or one of it's parent or children is ending.
3282 */
3283 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003284 if (htmlParseEndTag(ctxt) &&
3285 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3286 if (currentNode != NULL)
3287 xmlFree(currentNode);
3288 return;
3289 }
3290 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003291 }
3292
3293 /*
3294 * Has this node been popped out during parsing of
3295 * the next element
3296 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003297 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3298 (!xmlStrEqual(currentNode, ctxt->name)))
3299 {
Owen Taylor3473f882001-02-23 17:55:21 +00003300 if (currentNode != NULL) xmlFree(currentNode);
3301 return;
3302 }
3303
Daniel Veillardf9533d12001-03-03 10:04:57 +00003304 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3305 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003306 /*
3307 * Handle SCRIPT/STYLE separately
3308 */
3309 htmlParseScript(ctxt);
3310 } else {
3311 /*
3312 * Sometimes DOCTYPE arrives in the middle of the document
3313 */
3314 if ((CUR == '<') && (NXT(1) == '!') &&
3315 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3316 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3317 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3318 (UPP(8) == 'E')) {
3319 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3320 ctxt->sax->error(ctxt->userData,
3321 "Misplaced DOCTYPE declaration\n");
3322 ctxt->wellFormed = 0;
3323 htmlParseDocTypeDecl(ctxt);
3324 }
3325
3326 /*
3327 * First case : a comment
3328 */
3329 if ((CUR == '<') && (NXT(1) == '!') &&
3330 (NXT(2) == '-') && (NXT(3) == '-')) {
3331 htmlParseComment(ctxt);
3332 }
3333
3334 /*
3335 * Second case : a sub-element.
3336 */
3337 else if (CUR == '<') {
3338 htmlParseElement(ctxt);
3339 }
3340
3341 /*
3342 * Third case : a reference. If if has not been resolved,
3343 * parsing returns it's Name, create the node
3344 */
3345 else if (CUR == '&') {
3346 htmlParseReference(ctxt);
3347 }
3348
3349 /*
3350 * Fourth : end of the resource
3351 */
3352 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003353 htmlAutoCloseOnEnd(ctxt);
3354 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003355 }
3356
3357 /*
3358 * Last case, text. Note that References are handled directly.
3359 */
3360 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003361 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003362 }
3363
3364 if (cons == ctxt->nbChars) {
3365 if (ctxt->node != NULL) {
3366 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3367 ctxt->sax->error(ctxt->userData,
3368 "detected an error in element content\n");
3369 ctxt->wellFormed = 0;
3370 }
3371 break;
3372 }
3373 }
3374 GROW;
3375 }
3376 if (currentNode != NULL) xmlFree(currentNode);
3377}
3378
3379/**
3380 * htmlParseElement:
3381 * @ctxt: an HTML parser context
3382 *
3383 * parse an HTML element, this is highly recursive
3384 *
3385 * [39] element ::= EmptyElemTag | STag content ETag
3386 *
3387 * [41] Attribute ::= Name Eq AttValue
3388 */
3389
3390void
3391htmlParseElement(htmlParserCtxtPtr ctxt) {
3392 xmlChar *name;
3393 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003394 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003395 htmlParserNodeInfo node_info;
3396 xmlChar *oldname;
3397 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003398 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003399
3400 /* Capture start position */
3401 if (ctxt->record_info) {
3402 node_info.begin_pos = ctxt->input->consumed +
3403 (CUR_PTR - ctxt->input->base);
3404 node_info.begin_line = ctxt->input->line;
3405 }
3406
3407 oldname = xmlStrdup(ctxt->name);
3408 htmlParseStartTag(ctxt);
3409 name = ctxt->name;
3410#ifdef DEBUG
3411 if (oldname == NULL)
3412 xmlGenericError(xmlGenericErrorContext,
3413 "Start of element %s\n", name);
3414 else if (name == NULL)
3415 xmlGenericError(xmlGenericErrorContext,
3416 "Start of element failed, was %s\n", oldname);
3417 else
3418 xmlGenericError(xmlGenericErrorContext,
3419 "Start of element %s, was %s\n", name, oldname);
3420#endif
3421 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3422 (name == NULL)) {
3423 if (CUR == '>')
3424 NEXT;
3425 if (oldname != NULL)
3426 xmlFree(oldname);
3427 return;
3428 }
3429 if (oldname != NULL)
3430 xmlFree(oldname);
3431
3432 /*
3433 * Lookup the info for that element.
3434 */
3435 info = htmlTagLookup(name);
3436 if (info == NULL) {
3437 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3438 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3439 name);
3440 ctxt->wellFormed = 0;
3441 } else if (info->depr) {
3442/***************************
3443 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3444 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3445 name);
3446 ***************************/
3447 }
3448
3449 /*
3450 * Check for an Empty Element labelled the XML/SGML way
3451 */
3452 if ((CUR == '/') && (NXT(1) == '>')) {
3453 SKIP(2);
3454 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3455 ctxt->sax->endElement(ctxt->userData, name);
3456 oldname = htmlnamePop(ctxt);
3457#ifdef DEBUG
3458 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3459#endif
3460 if (oldname != NULL)
3461 xmlFree(oldname);
3462 return;
3463 }
3464
3465 if (CUR == '>') {
3466 NEXT;
3467 } else {
3468 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3469 ctxt->sax->error(ctxt->userData,
3470 "Couldn't find end of Start Tag %s\n",
3471 name);
3472 ctxt->wellFormed = 0;
3473
3474 /*
3475 * end of parsing of this node.
3476 */
3477 if (xmlStrEqual(name, ctxt->name)) {
3478 nodePop(ctxt);
3479 oldname = htmlnamePop(ctxt);
3480#ifdef DEBUG
3481 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3482#endif
3483 if (oldname != NULL)
3484 xmlFree(oldname);
3485 }
3486
3487 /*
3488 * Capture end position and add node
3489 */
3490 if ( currentNode != NULL && ctxt->record_info ) {
3491 node_info.end_pos = ctxt->input->consumed +
3492 (CUR_PTR - ctxt->input->base);
3493 node_info.end_line = ctxt->input->line;
3494 node_info.node = ctxt->node;
3495 xmlParserAddNodeInfo(ctxt, &node_info);
3496 }
3497 return;
3498 }
3499
3500 /*
3501 * Check for an Empty Element from DTD definition
3502 */
3503 if ((info != NULL) && (info->empty)) {
3504 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3505 ctxt->sax->endElement(ctxt->userData, name);
3506 oldname = htmlnamePop(ctxt);
3507#ifdef DEBUG
3508 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3509#endif
3510 if (oldname != NULL)
3511 xmlFree(oldname);
3512 return;
3513 }
3514
3515 /*
3516 * Parse the content of the element:
3517 */
3518 currentNode = xmlStrdup(ctxt->name);
3519 depth = ctxt->nameNr;
3520 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003521 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003522 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003523 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003524 if (ctxt->nameNr < depth) break;
3525 }
3526
Owen Taylor3473f882001-02-23 17:55:21 +00003527 /*
3528 * Capture end position and add node
3529 */
3530 if ( currentNode != NULL && ctxt->record_info ) {
3531 node_info.end_pos = ctxt->input->consumed +
3532 (CUR_PTR - ctxt->input->base);
3533 node_info.end_line = ctxt->input->line;
3534 node_info.node = ctxt->node;
3535 xmlParserAddNodeInfo(ctxt, &node_info);
3536 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003537 if (!IS_CHAR(CUR)) {
3538 htmlAutoCloseOnEnd(ctxt);
3539 }
3540
Owen Taylor3473f882001-02-23 17:55:21 +00003541 if (currentNode != NULL)
3542 xmlFree(currentNode);
3543}
3544
3545/**
3546 * htmlParseDocument :
3547 * @ctxt: an HTML parser context
3548 *
3549 * parse an HTML document (and build a tree if using the standard SAX
3550 * interface).
3551 *
3552 * Returns 0, -1 in case of error. the parser context is augmented
3553 * as a result of the parsing.
3554 */
3555
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003556static int
Owen Taylor3473f882001-02-23 17:55:21 +00003557htmlParseDocument(htmlParserCtxtPtr ctxt) {
3558 xmlDtdPtr dtd;
3559
3560 htmlDefaultSAXHandlerInit();
3561 ctxt->html = 1;
3562
3563 GROW;
3564 /*
3565 * SAX: beginning of the document processing.
3566 */
3567 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3568 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3569
3570 /*
3571 * Wipe out everything which is before the first '<'
3572 */
3573 SKIP_BLANKS;
3574 if (CUR == 0) {
3575 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3576 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3577 ctxt->wellFormed = 0;
3578 }
3579
3580 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3581 ctxt->sax->startDocument(ctxt->userData);
3582
3583
3584 /*
3585 * Parse possible comments before any content
3586 */
3587 while ((CUR == '<') && (NXT(1) == '!') &&
3588 (NXT(2) == '-') && (NXT(3) == '-')) {
3589 htmlParseComment(ctxt);
3590 SKIP_BLANKS;
3591 }
3592
3593
3594 /*
3595 * Then possibly doc type declaration(s) and more Misc
3596 * (doctypedecl Misc*)?
3597 */
3598 if ((CUR == '<') && (NXT(1) == '!') &&
3599 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3600 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3601 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3602 (UPP(8) == 'E')) {
3603 htmlParseDocTypeDecl(ctxt);
3604 }
3605 SKIP_BLANKS;
3606
3607 /*
3608 * Parse possible comments before any content
3609 */
3610 while ((CUR == '<') && (NXT(1) == '!') &&
3611 (NXT(2) == '-') && (NXT(3) == '-')) {
3612 htmlParseComment(ctxt);
3613 SKIP_BLANKS;
3614 }
3615
3616 /*
3617 * Time to start parsing the tree itself
3618 */
3619 htmlParseContent(ctxt);
3620
3621 /*
3622 * autoclose
3623 */
3624 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003625 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003626
3627
3628 /*
3629 * SAX: end of the document processing.
3630 */
3631 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3632 ctxt->sax->endDocument(ctxt->userData);
3633
3634 if (ctxt->myDoc != NULL) {
3635 dtd = xmlGetIntSubset(ctxt->myDoc);
3636 if (dtd == NULL)
3637 ctxt->myDoc->intSubset =
3638 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3639 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3640 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3641 }
3642 if (! ctxt->wellFormed) return(-1);
3643 return(0);
3644}
3645
3646
3647/************************************************************************
3648 * *
3649 * Parser contexts handling *
3650 * *
3651 ************************************************************************/
3652
3653/**
3654 * xmlInitParserCtxt:
3655 * @ctxt: an HTML parser context
3656 *
3657 * Initialize a parser context
3658 */
3659
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003660static void
Owen Taylor3473f882001-02-23 17:55:21 +00003661htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3662{
3663 htmlSAXHandler *sax;
3664
3665 if (ctxt == NULL) return;
3666 memset(ctxt, 0, sizeof(htmlParserCtxt));
3667
3668 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3669 if (sax == NULL) {
3670 xmlGenericError(xmlGenericErrorContext,
3671 "htmlInitParserCtxt: out of memory\n");
3672 }
3673 else
3674 memset(sax, 0, sizeof(htmlSAXHandler));
3675
3676 /* Allocate the Input stack */
3677 ctxt->inputTab = (htmlParserInputPtr *)
3678 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3679 if (ctxt->inputTab == NULL) {
3680 xmlGenericError(xmlGenericErrorContext,
3681 "htmlInitParserCtxt: out of memory\n");
3682 ctxt->inputNr = 0;
3683 ctxt->inputMax = 0;
3684 ctxt->input = NULL;
3685 return;
3686 }
3687 ctxt->inputNr = 0;
3688 ctxt->inputMax = 5;
3689 ctxt->input = NULL;
3690 ctxt->version = NULL;
3691 ctxt->encoding = NULL;
3692 ctxt->standalone = -1;
3693 ctxt->instate = XML_PARSER_START;
3694
3695 /* Allocate the Node stack */
3696 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3697 if (ctxt->nodeTab == NULL) {
3698 xmlGenericError(xmlGenericErrorContext,
3699 "htmlInitParserCtxt: out of memory\n");
3700 ctxt->nodeNr = 0;
3701 ctxt->nodeMax = 0;
3702 ctxt->node = NULL;
3703 ctxt->inputNr = 0;
3704 ctxt->inputMax = 0;
3705 ctxt->input = NULL;
3706 return;
3707 }
3708 ctxt->nodeNr = 0;
3709 ctxt->nodeMax = 10;
3710 ctxt->node = NULL;
3711
3712 /* Allocate the Name stack */
3713 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3714 if (ctxt->nameTab == NULL) {
3715 xmlGenericError(xmlGenericErrorContext,
3716 "htmlInitParserCtxt: out of memory\n");
3717 ctxt->nameNr = 0;
3718 ctxt->nameMax = 10;
3719 ctxt->name = NULL;
3720 ctxt->nodeNr = 0;
3721 ctxt->nodeMax = 0;
3722 ctxt->node = NULL;
3723 ctxt->inputNr = 0;
3724 ctxt->inputMax = 0;
3725 ctxt->input = NULL;
3726 return;
3727 }
3728 ctxt->nameNr = 0;
3729 ctxt->nameMax = 10;
3730 ctxt->name = NULL;
3731
3732 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3733 else {
3734 ctxt->sax = sax;
3735 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3736 }
3737 ctxt->userData = ctxt;
3738 ctxt->myDoc = NULL;
3739 ctxt->wellFormed = 1;
3740 ctxt->replaceEntities = 0;
3741 ctxt->html = 1;
3742 ctxt->record_info = 0;
3743 ctxt->validate = 0;
3744 ctxt->nbChars = 0;
3745 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003746 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003747 xmlInitNodeInfoSeq(&ctxt->node_seq);
3748}
3749
3750/**
3751 * htmlFreeParserCtxt:
3752 * @ctxt: an HTML parser context
3753 *
3754 * Free all the memory used by a parser context. However the parsed
3755 * document in ctxt->myDoc is not freed.
3756 */
3757
3758void
3759htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3760{
3761 xmlFreeParserCtxt(ctxt);
3762}
3763
3764/**
3765 * htmlCreateDocParserCtxt :
3766 * @cur: a pointer to an array of xmlChar
3767 * @encoding: a free form C string describing the HTML document encoding, or NULL
3768 *
3769 * Create a parser context for an HTML document.
3770 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003771 * TODO: check the need to add encoding handling there
3772 *
Owen Taylor3473f882001-02-23 17:55:21 +00003773 * Returns the new parser context or NULL
3774 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003775static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003776htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003777 htmlParserCtxtPtr ctxt;
3778 htmlParserInputPtr input;
3779 /* htmlCharEncoding enc; */
3780
3781 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3782 if (ctxt == NULL) {
3783 perror("malloc");
3784 return(NULL);
3785 }
3786 htmlInitParserCtxt(ctxt);
3787 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3788 if (input == NULL) {
3789 perror("malloc");
3790 xmlFree(ctxt);
3791 return(NULL);
3792 }
3793 memset(input, 0, sizeof(htmlParserInput));
3794
3795 input->line = 1;
3796 input->col = 1;
3797 input->base = cur;
3798 input->cur = cur;
3799
3800 inputPush(ctxt, input);
3801 return(ctxt);
3802}
3803
3804/************************************************************************
3805 * *
3806 * Progressive parsing interfaces *
3807 * *
3808 ************************************************************************/
3809
3810/**
3811 * htmlParseLookupSequence:
3812 * @ctxt: an HTML parser context
3813 * @first: the first char to lookup
3814 * @next: the next char to lookup or zero
3815 * @third: the next char to lookup or zero
3816 *
3817 * Try to find if a sequence (first, next, third) or just (first next) or
3818 * (first) is available in the input stream.
3819 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3820 * to avoid rescanning sequences of bytes, it DOES change the state of the
3821 * parser, do not use liberally.
3822 * This is basically similar to xmlParseLookupSequence()
3823 *
3824 * Returns the index to the current parsing point if the full sequence
3825 * is available, -1 otherwise.
3826 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003827static int
Owen Taylor3473f882001-02-23 17:55:21 +00003828htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3829 xmlChar next, xmlChar third) {
3830 int base, len;
3831 htmlParserInputPtr in;
3832 const xmlChar *buf;
3833
3834 in = ctxt->input;
3835 if (in == NULL) return(-1);
3836 base = in->cur - in->base;
3837 if (base < 0) return(-1);
3838 if (ctxt->checkIndex > base)
3839 base = ctxt->checkIndex;
3840 if (in->buf == NULL) {
3841 buf = in->base;
3842 len = in->length;
3843 } else {
3844 buf = in->buf->buffer->content;
3845 len = in->buf->buffer->use;
3846 }
3847 /* take into account the sequence length */
3848 if (third) len -= 2;
3849 else if (next) len --;
3850 for (;base < len;base++) {
3851 if (buf[base] == first) {
3852 if (third != 0) {
3853 if ((buf[base + 1] != next) ||
3854 (buf[base + 2] != third)) continue;
3855 } else if (next != 0) {
3856 if (buf[base + 1] != next) continue;
3857 }
3858 ctxt->checkIndex = 0;
3859#ifdef DEBUG_PUSH
3860 if (next == 0)
3861 xmlGenericError(xmlGenericErrorContext,
3862 "HPP: lookup '%c' found at %d\n",
3863 first, base);
3864 else if (third == 0)
3865 xmlGenericError(xmlGenericErrorContext,
3866 "HPP: lookup '%c%c' found at %d\n",
3867 first, next, base);
3868 else
3869 xmlGenericError(xmlGenericErrorContext,
3870 "HPP: lookup '%c%c%c' found at %d\n",
3871 first, next, third, base);
3872#endif
3873 return(base - (in->cur - in->base));
3874 }
3875 }
3876 ctxt->checkIndex = base;
3877#ifdef DEBUG_PUSH
3878 if (next == 0)
3879 xmlGenericError(xmlGenericErrorContext,
3880 "HPP: lookup '%c' failed\n", first);
3881 else if (third == 0)
3882 xmlGenericError(xmlGenericErrorContext,
3883 "HPP: lookup '%c%c' failed\n", first, next);
3884 else
3885 xmlGenericError(xmlGenericErrorContext,
3886 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3887#endif
3888 return(-1);
3889}
3890
3891/**
3892 * htmlParseTryOrFinish:
3893 * @ctxt: an HTML parser context
3894 * @terminate: last chunk indicator
3895 *
3896 * Try to progress on parsing
3897 *
3898 * Returns zero if no parsing was possible
3899 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003900static int
Owen Taylor3473f882001-02-23 17:55:21 +00003901htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3902 int ret = 0;
3903 htmlParserInputPtr in;
3904 int avail = 0;
3905 xmlChar cur, next;
3906
3907#ifdef DEBUG_PUSH
3908 switch (ctxt->instate) {
3909 case XML_PARSER_EOF:
3910 xmlGenericError(xmlGenericErrorContext,
3911 "HPP: try EOF\n"); break;
3912 case XML_PARSER_START:
3913 xmlGenericError(xmlGenericErrorContext,
3914 "HPP: try START\n"); break;
3915 case XML_PARSER_MISC:
3916 xmlGenericError(xmlGenericErrorContext,
3917 "HPP: try MISC\n");break;
3918 case XML_PARSER_COMMENT:
3919 xmlGenericError(xmlGenericErrorContext,
3920 "HPP: try COMMENT\n");break;
3921 case XML_PARSER_PROLOG:
3922 xmlGenericError(xmlGenericErrorContext,
3923 "HPP: try PROLOG\n");break;
3924 case XML_PARSER_START_TAG:
3925 xmlGenericError(xmlGenericErrorContext,
3926 "HPP: try START_TAG\n");break;
3927 case XML_PARSER_CONTENT:
3928 xmlGenericError(xmlGenericErrorContext,
3929 "HPP: try CONTENT\n");break;
3930 case XML_PARSER_CDATA_SECTION:
3931 xmlGenericError(xmlGenericErrorContext,
3932 "HPP: try CDATA_SECTION\n");break;
3933 case XML_PARSER_END_TAG:
3934 xmlGenericError(xmlGenericErrorContext,
3935 "HPP: try END_TAG\n");break;
3936 case XML_PARSER_ENTITY_DECL:
3937 xmlGenericError(xmlGenericErrorContext,
3938 "HPP: try ENTITY_DECL\n");break;
3939 case XML_PARSER_ENTITY_VALUE:
3940 xmlGenericError(xmlGenericErrorContext,
3941 "HPP: try ENTITY_VALUE\n");break;
3942 case XML_PARSER_ATTRIBUTE_VALUE:
3943 xmlGenericError(xmlGenericErrorContext,
3944 "HPP: try ATTRIBUTE_VALUE\n");break;
3945 case XML_PARSER_DTD:
3946 xmlGenericError(xmlGenericErrorContext,
3947 "HPP: try DTD\n");break;
3948 case XML_PARSER_EPILOG:
3949 xmlGenericError(xmlGenericErrorContext,
3950 "HPP: try EPILOG\n");break;
3951 case XML_PARSER_PI:
3952 xmlGenericError(xmlGenericErrorContext,
3953 "HPP: try PI\n");break;
3954 case XML_PARSER_SYSTEM_LITERAL:
3955 xmlGenericError(xmlGenericErrorContext,
3956 "HPP: try SYSTEM_LITERAL\n");break;
3957 }
3958#endif
3959
3960 while (1) {
3961
3962 in = ctxt->input;
3963 if (in == NULL) break;
3964 if (in->buf == NULL)
3965 avail = in->length - (in->cur - in->base);
3966 else
3967 avail = in->buf->buffer->use - (in->cur - in->base);
3968 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003969 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003970 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3971 /*
3972 * SAX: end of the document processing.
3973 */
3974 ctxt->instate = XML_PARSER_EOF;
3975 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3976 ctxt->sax->endDocument(ctxt->userData);
3977 }
3978 }
3979 if (avail < 1)
3980 goto done;
3981 switch (ctxt->instate) {
3982 case XML_PARSER_EOF:
3983 /*
3984 * Document parsing is done !
3985 */
3986 goto done;
3987 case XML_PARSER_START:
3988 /*
3989 * Very first chars read from the document flow.
3990 */
3991 cur = in->cur[0];
3992 if (IS_BLANK(cur)) {
3993 SKIP_BLANKS;
3994 if (in->buf == NULL)
3995 avail = in->length - (in->cur - in->base);
3996 else
3997 avail = in->buf->buffer->use - (in->cur - in->base);
3998 }
3999 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4000 ctxt->sax->setDocumentLocator(ctxt->userData,
4001 &xmlDefaultSAXLocator);
4002 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4003 (!ctxt->disableSAX))
4004 ctxt->sax->startDocument(ctxt->userData);
4005
4006 cur = in->cur[0];
4007 next = in->cur[1];
4008 if ((cur == '<') && (next == '!') &&
4009 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4010 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4011 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4012 (UPP(8) == 'E')) {
4013 if ((!terminate) &&
4014 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4015 goto done;
4016#ifdef DEBUG_PUSH
4017 xmlGenericError(xmlGenericErrorContext,
4018 "HPP: Parsing internal subset\n");
4019#endif
4020 htmlParseDocTypeDecl(ctxt);
4021 ctxt->instate = XML_PARSER_PROLOG;
4022#ifdef DEBUG_PUSH
4023 xmlGenericError(xmlGenericErrorContext,
4024 "HPP: entering PROLOG\n");
4025#endif
4026 } else {
4027 ctxt->instate = XML_PARSER_MISC;
4028 }
4029#ifdef DEBUG_PUSH
4030 xmlGenericError(xmlGenericErrorContext,
4031 "HPP: entering MISC\n");
4032#endif
4033 break;
4034 case XML_PARSER_MISC:
4035 SKIP_BLANKS;
4036 if (in->buf == NULL)
4037 avail = in->length - (in->cur - in->base);
4038 else
4039 avail = in->buf->buffer->use - (in->cur - in->base);
4040 if (avail < 2)
4041 goto done;
4042 cur = in->cur[0];
4043 next = in->cur[1];
4044 if ((cur == '<') && (next == '!') &&
4045 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4046 if ((!terminate) &&
4047 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4048 goto done;
4049#ifdef DEBUG_PUSH
4050 xmlGenericError(xmlGenericErrorContext,
4051 "HPP: Parsing Comment\n");
4052#endif
4053 htmlParseComment(ctxt);
4054 ctxt->instate = XML_PARSER_MISC;
4055 } else if ((cur == '<') && (next == '!') &&
4056 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4057 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4058 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4059 (UPP(8) == 'E')) {
4060 if ((!terminate) &&
4061 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4062 goto done;
4063#ifdef DEBUG_PUSH
4064 xmlGenericError(xmlGenericErrorContext,
4065 "HPP: Parsing internal subset\n");
4066#endif
4067 htmlParseDocTypeDecl(ctxt);
4068 ctxt->instate = XML_PARSER_PROLOG;
4069#ifdef DEBUG_PUSH
4070 xmlGenericError(xmlGenericErrorContext,
4071 "HPP: entering PROLOG\n");
4072#endif
4073 } else if ((cur == '<') && (next == '!') &&
4074 (avail < 9)) {
4075 goto done;
4076 } else {
4077 ctxt->instate = XML_PARSER_START_TAG;
4078#ifdef DEBUG_PUSH
4079 xmlGenericError(xmlGenericErrorContext,
4080 "HPP: entering START_TAG\n");
4081#endif
4082 }
4083 break;
4084 case XML_PARSER_PROLOG:
4085 SKIP_BLANKS;
4086 if (in->buf == NULL)
4087 avail = in->length - (in->cur - in->base);
4088 else
4089 avail = in->buf->buffer->use - (in->cur - in->base);
4090 if (avail < 2)
4091 goto done;
4092 cur = in->cur[0];
4093 next = in->cur[1];
4094 if ((cur == '<') && (next == '!') &&
4095 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4096 if ((!terminate) &&
4097 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4098 goto done;
4099#ifdef DEBUG_PUSH
4100 xmlGenericError(xmlGenericErrorContext,
4101 "HPP: Parsing Comment\n");
4102#endif
4103 htmlParseComment(ctxt);
4104 ctxt->instate = XML_PARSER_PROLOG;
4105 } else if ((cur == '<') && (next == '!') &&
4106 (avail < 4)) {
4107 goto done;
4108 } else {
4109 ctxt->instate = XML_PARSER_START_TAG;
4110#ifdef DEBUG_PUSH
4111 xmlGenericError(xmlGenericErrorContext,
4112 "HPP: entering START_TAG\n");
4113#endif
4114 }
4115 break;
4116 case XML_PARSER_EPILOG:
4117 if (in->buf == NULL)
4118 avail = in->length - (in->cur - in->base);
4119 else
4120 avail = in->buf->buffer->use - (in->cur - in->base);
4121 if (avail < 1)
4122 goto done;
4123 cur = in->cur[0];
4124 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004125 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004126 goto done;
4127 }
4128 if (avail < 2)
4129 goto done;
4130 next = in->cur[1];
4131 if ((cur == '<') && (next == '!') &&
4132 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4133 if ((!terminate) &&
4134 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4135 goto done;
4136#ifdef DEBUG_PUSH
4137 xmlGenericError(xmlGenericErrorContext,
4138 "HPP: Parsing Comment\n");
4139#endif
4140 htmlParseComment(ctxt);
4141 ctxt->instate = XML_PARSER_EPILOG;
4142 } else if ((cur == '<') && (next == '!') &&
4143 (avail < 4)) {
4144 goto done;
4145 } else {
4146 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004147 ctxt->wellFormed = 0;
4148 ctxt->instate = XML_PARSER_EOF;
4149#ifdef DEBUG_PUSH
4150 xmlGenericError(xmlGenericErrorContext,
4151 "HPP: entering EOF\n");
4152#endif
4153 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4154 ctxt->sax->endDocument(ctxt->userData);
4155 goto done;
4156 }
4157 break;
4158 case XML_PARSER_START_TAG: {
4159 xmlChar *name, *oldname;
4160 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004161 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004162
4163 if (avail < 2)
4164 goto done;
4165 cur = in->cur[0];
4166 if (cur != '<') {
4167 ctxt->instate = XML_PARSER_CONTENT;
4168#ifdef DEBUG_PUSH
4169 xmlGenericError(xmlGenericErrorContext,
4170 "HPP: entering CONTENT\n");
4171#endif
4172 break;
4173 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004174 if (in->cur[1] == '/') {
4175 ctxt->instate = XML_PARSER_END_TAG;
4176 ctxt->checkIndex = 0;
4177#ifdef DEBUG_PUSH
4178 xmlGenericError(xmlGenericErrorContext,
4179 "HPP: entering END_TAG\n");
4180#endif
4181 break;
4182 }
Owen Taylor3473f882001-02-23 17:55:21 +00004183 if ((!terminate) &&
4184 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4185 goto done;
4186
4187 oldname = xmlStrdup(ctxt->name);
4188 htmlParseStartTag(ctxt);
4189 name = ctxt->name;
4190#ifdef DEBUG
4191 if (oldname == NULL)
4192 xmlGenericError(xmlGenericErrorContext,
4193 "Start of element %s\n", name);
4194 else if (name == NULL)
4195 xmlGenericError(xmlGenericErrorContext,
4196 "Start of element failed, was %s\n",
4197 oldname);
4198 else
4199 xmlGenericError(xmlGenericErrorContext,
4200 "Start of element %s, was %s\n",
4201 name, oldname);
4202#endif
4203 if (((depth == ctxt->nameNr) &&
4204 (xmlStrEqual(oldname, ctxt->name))) ||
4205 (name == NULL)) {
4206 if (CUR == '>')
4207 NEXT;
4208 if (oldname != NULL)
4209 xmlFree(oldname);
4210 break;
4211 }
4212 if (oldname != NULL)
4213 xmlFree(oldname);
4214
4215 /*
4216 * Lookup the info for that element.
4217 */
4218 info = htmlTagLookup(name);
4219 if (info == NULL) {
4220 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4221 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4222 name);
4223 ctxt->wellFormed = 0;
4224 } else if (info->depr) {
4225 /***************************
4226 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4227 ctxt->sax->warning(ctxt->userData,
4228 "Tag %s is deprecated\n",
4229 name);
4230 ***************************/
4231 }
4232
4233 /*
4234 * Check for an Empty Element labelled the XML/SGML way
4235 */
4236 if ((CUR == '/') && (NXT(1) == '>')) {
4237 SKIP(2);
4238 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4239 ctxt->sax->endElement(ctxt->userData, name);
4240 oldname = htmlnamePop(ctxt);
4241#ifdef DEBUG
4242 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4243 oldname);
4244#endif
4245 if (oldname != NULL)
4246 xmlFree(oldname);
4247 ctxt->instate = XML_PARSER_CONTENT;
4248#ifdef DEBUG_PUSH
4249 xmlGenericError(xmlGenericErrorContext,
4250 "HPP: entering CONTENT\n");
4251#endif
4252 break;
4253 }
4254
4255 if (CUR == '>') {
4256 NEXT;
4257 } else {
4258 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4259 ctxt->sax->error(ctxt->userData,
4260 "Couldn't find end of Start Tag %s\n",
4261 name);
4262 ctxt->wellFormed = 0;
4263
4264 /*
4265 * end of parsing of this node.
4266 */
4267 if (xmlStrEqual(name, ctxt->name)) {
4268 nodePop(ctxt);
4269 oldname = htmlnamePop(ctxt);
4270#ifdef DEBUG
4271 xmlGenericError(xmlGenericErrorContext,
4272 "End of start tag problem: popping out %s\n", oldname);
4273#endif
4274 if (oldname != NULL)
4275 xmlFree(oldname);
4276 }
4277
4278 ctxt->instate = XML_PARSER_CONTENT;
4279#ifdef DEBUG_PUSH
4280 xmlGenericError(xmlGenericErrorContext,
4281 "HPP: entering CONTENT\n");
4282#endif
4283 break;
4284 }
4285
4286 /*
4287 * Check for an Empty Element from DTD definition
4288 */
4289 if ((info != NULL) && (info->empty)) {
4290 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4291 ctxt->sax->endElement(ctxt->userData, name);
4292 oldname = htmlnamePop(ctxt);
4293#ifdef DEBUG
4294 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4295#endif
4296 if (oldname != NULL)
4297 xmlFree(oldname);
4298 }
4299 ctxt->instate = XML_PARSER_CONTENT;
4300#ifdef DEBUG_PUSH
4301 xmlGenericError(xmlGenericErrorContext,
4302 "HPP: entering CONTENT\n");
4303#endif
4304 break;
4305 }
4306 case XML_PARSER_CONTENT: {
4307 long cons;
4308 /*
4309 * Handle preparsed entities and charRef
4310 */
4311 if (ctxt->token != 0) {
4312 xmlChar chr[2] = { 0 , 0 } ;
4313
4314 chr[0] = (xmlChar) ctxt->token;
4315 htmlCheckParagraph(ctxt);
4316 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4317 ctxt->sax->characters(ctxt->userData, chr, 1);
4318 ctxt->token = 0;
4319 ctxt->checkIndex = 0;
4320 }
4321 if ((avail == 1) && (terminate)) {
4322 cur = in->cur[0];
4323 if ((cur != '<') && (cur != '&')) {
4324 if (ctxt->sax != NULL) {
4325 if (IS_BLANK(cur)) {
4326 if (ctxt->sax->ignorableWhitespace != NULL)
4327 ctxt->sax->ignorableWhitespace(
4328 ctxt->userData, &cur, 1);
4329 } else {
4330 htmlCheckParagraph(ctxt);
4331 if (ctxt->sax->characters != NULL)
4332 ctxt->sax->characters(
4333 ctxt->userData, &cur, 1);
4334 }
4335 }
4336 ctxt->token = 0;
4337 ctxt->checkIndex = 0;
4338 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004339 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004340 }
Owen Taylor3473f882001-02-23 17:55:21 +00004341 }
4342 if (avail < 2)
4343 goto done;
4344 cur = in->cur[0];
4345 next = in->cur[1];
4346 cons = ctxt->nbChars;
4347 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4348 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4349 /*
4350 * Handle SCRIPT/STYLE separately
4351 */
4352 if ((!terminate) &&
4353 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4354 goto done;
4355 htmlParseScript(ctxt);
4356 if ((cur == '<') && (next == '/')) {
4357 ctxt->instate = XML_PARSER_END_TAG;
4358 ctxt->checkIndex = 0;
4359#ifdef DEBUG_PUSH
4360 xmlGenericError(xmlGenericErrorContext,
4361 "HPP: entering END_TAG\n");
4362#endif
4363 break;
4364 }
4365 } else {
4366 /*
4367 * Sometimes DOCTYPE arrives in the middle of the document
4368 */
4369 if ((cur == '<') && (next == '!') &&
4370 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4371 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4372 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4373 (UPP(8) == 'E')) {
4374 if ((!terminate) &&
4375 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4376 goto done;
4377 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4378 ctxt->sax->error(ctxt->userData,
4379 "Misplaced DOCTYPE declaration\n");
4380 ctxt->wellFormed = 0;
4381 htmlParseDocTypeDecl(ctxt);
4382 } else if ((cur == '<') && (next == '!') &&
4383 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4384 if ((!terminate) &&
4385 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4386 goto done;
4387#ifdef DEBUG_PUSH
4388 xmlGenericError(xmlGenericErrorContext,
4389 "HPP: Parsing Comment\n");
4390#endif
4391 htmlParseComment(ctxt);
4392 ctxt->instate = XML_PARSER_CONTENT;
4393 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4394 goto done;
4395 } else if ((cur == '<') && (next == '/')) {
4396 ctxt->instate = XML_PARSER_END_TAG;
4397 ctxt->checkIndex = 0;
4398#ifdef DEBUG_PUSH
4399 xmlGenericError(xmlGenericErrorContext,
4400 "HPP: entering END_TAG\n");
4401#endif
4402 break;
4403 } else if (cur == '<') {
4404 ctxt->instate = XML_PARSER_START_TAG;
4405 ctxt->checkIndex = 0;
4406#ifdef DEBUG_PUSH
4407 xmlGenericError(xmlGenericErrorContext,
4408 "HPP: entering START_TAG\n");
4409#endif
4410 break;
4411 } else if (cur == '&') {
4412 if ((!terminate) &&
4413 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4414 goto done;
4415#ifdef DEBUG_PUSH
4416 xmlGenericError(xmlGenericErrorContext,
4417 "HPP: Parsing Reference\n");
4418#endif
4419 /* TODO: check generation of subtrees if noent !!! */
4420 htmlParseReference(ctxt);
4421 } else {
4422 /* TODO Avoid the extra copy, handle directly !!!!!! */
4423 /*
4424 * Goal of the following test is :
4425 * - minimize calls to the SAX 'character' callback
4426 * when they are mergeable
4427 */
4428 if ((ctxt->inputNr == 1) &&
4429 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4430 if ((!terminate) &&
4431 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4432 goto done;
4433 }
4434 ctxt->checkIndex = 0;
4435#ifdef DEBUG_PUSH
4436 xmlGenericError(xmlGenericErrorContext,
4437 "HPP: Parsing char data\n");
4438#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004439 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004440 }
4441 }
4442 if (cons == ctxt->nbChars) {
4443 if (ctxt->node != NULL) {
4444 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4445 ctxt->sax->error(ctxt->userData,
4446 "detected an error in element content\n");
4447 ctxt->wellFormed = 0;
4448 }
4449 NEXT;
4450 break;
4451 }
4452
4453 break;
4454 }
4455 case XML_PARSER_END_TAG:
4456 if (avail < 2)
4457 goto done;
4458 if ((!terminate) &&
4459 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4460 goto done;
4461 htmlParseEndTag(ctxt);
4462 if (ctxt->nameNr == 0) {
4463 ctxt->instate = XML_PARSER_EPILOG;
4464 } else {
4465 ctxt->instate = XML_PARSER_CONTENT;
4466 }
4467 ctxt->checkIndex = 0;
4468#ifdef DEBUG_PUSH
4469 xmlGenericError(xmlGenericErrorContext,
4470 "HPP: entering CONTENT\n");
4471#endif
4472 break;
4473 case XML_PARSER_CDATA_SECTION:
4474 xmlGenericError(xmlGenericErrorContext,
4475 "HPP: internal error, state == CDATA\n");
4476 ctxt->instate = XML_PARSER_CONTENT;
4477 ctxt->checkIndex = 0;
4478#ifdef DEBUG_PUSH
4479 xmlGenericError(xmlGenericErrorContext,
4480 "HPP: entering CONTENT\n");
4481#endif
4482 break;
4483 case XML_PARSER_DTD:
4484 xmlGenericError(xmlGenericErrorContext,
4485 "HPP: internal error, state == DTD\n");
4486 ctxt->instate = XML_PARSER_CONTENT;
4487 ctxt->checkIndex = 0;
4488#ifdef DEBUG_PUSH
4489 xmlGenericError(xmlGenericErrorContext,
4490 "HPP: entering CONTENT\n");
4491#endif
4492 break;
4493 case XML_PARSER_COMMENT:
4494 xmlGenericError(xmlGenericErrorContext,
4495 "HPP: internal error, state == COMMENT\n");
4496 ctxt->instate = XML_PARSER_CONTENT;
4497 ctxt->checkIndex = 0;
4498#ifdef DEBUG_PUSH
4499 xmlGenericError(xmlGenericErrorContext,
4500 "HPP: entering CONTENT\n");
4501#endif
4502 break;
4503 case XML_PARSER_PI:
4504 xmlGenericError(xmlGenericErrorContext,
4505 "HPP: internal error, state == PI\n");
4506 ctxt->instate = XML_PARSER_CONTENT;
4507 ctxt->checkIndex = 0;
4508#ifdef DEBUG_PUSH
4509 xmlGenericError(xmlGenericErrorContext,
4510 "HPP: entering CONTENT\n");
4511#endif
4512 break;
4513 case XML_PARSER_ENTITY_DECL:
4514 xmlGenericError(xmlGenericErrorContext,
4515 "HPP: internal error, state == ENTITY_DECL\n");
4516 ctxt->instate = XML_PARSER_CONTENT;
4517 ctxt->checkIndex = 0;
4518#ifdef DEBUG_PUSH
4519 xmlGenericError(xmlGenericErrorContext,
4520 "HPP: entering CONTENT\n");
4521#endif
4522 break;
4523 case XML_PARSER_ENTITY_VALUE:
4524 xmlGenericError(xmlGenericErrorContext,
4525 "HPP: internal error, state == ENTITY_VALUE\n");
4526 ctxt->instate = XML_PARSER_CONTENT;
4527 ctxt->checkIndex = 0;
4528#ifdef DEBUG_PUSH
4529 xmlGenericError(xmlGenericErrorContext,
4530 "HPP: entering DTD\n");
4531#endif
4532 break;
4533 case XML_PARSER_ATTRIBUTE_VALUE:
4534 xmlGenericError(xmlGenericErrorContext,
4535 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4536 ctxt->instate = XML_PARSER_START_TAG;
4537 ctxt->checkIndex = 0;
4538#ifdef DEBUG_PUSH
4539 xmlGenericError(xmlGenericErrorContext,
4540 "HPP: entering START_TAG\n");
4541#endif
4542 break;
4543 case XML_PARSER_SYSTEM_LITERAL:
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4546 ctxt->instate = XML_PARSER_CONTENT;
4547 ctxt->checkIndex = 0;
4548#ifdef DEBUG_PUSH
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: entering CONTENT\n");
4551#endif
4552 break;
4553 case XML_PARSER_IGNORE:
4554 xmlGenericError(xmlGenericErrorContext,
4555 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4556 ctxt->instate = XML_PARSER_CONTENT;
4557 ctxt->checkIndex = 0;
4558#ifdef DEBUG_PUSH
4559 xmlGenericError(xmlGenericErrorContext,
4560 "HPP: entering CONTENT\n");
4561#endif
4562 break;
4563 }
4564 }
4565done:
4566 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004567 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004568 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4569 /*
4570 * SAX: end of the document processing.
4571 */
4572 ctxt->instate = XML_PARSER_EOF;
4573 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4574 ctxt->sax->endDocument(ctxt->userData);
4575 }
4576 }
4577 if ((ctxt->myDoc != NULL) &&
4578 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4579 (ctxt->instate == XML_PARSER_EPILOG))) {
4580 xmlDtdPtr dtd;
4581 dtd = xmlGetIntSubset(ctxt->myDoc);
4582 if (dtd == NULL)
4583 ctxt->myDoc->intSubset =
4584 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4585 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4586 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4587 }
4588#ifdef DEBUG_PUSH
4589 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4590#endif
4591 return(ret);
4592}
4593
4594/**
Owen Taylor3473f882001-02-23 17:55:21 +00004595 * htmlParseChunk:
4596 * @ctxt: an XML parser context
4597 * @chunk: an char array
4598 * @size: the size in byte of the chunk
4599 * @terminate: last chunk indicator
4600 *
4601 * Parse a Chunk of memory
4602 *
4603 * Returns zero if no error, the xmlParserErrors otherwise.
4604 */
4605int
4606htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4607 int terminate) {
4608 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4609 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4610 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4611 int cur = ctxt->input->cur - ctxt->input->base;
4612
4613 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4614 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4615 ctxt->input->cur = ctxt->input->base + cur;
4616#ifdef DEBUG_PUSH
4617 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4618#endif
4619
4620 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4621 htmlParseTryOrFinish(ctxt, terminate);
4622 } else if (ctxt->instate != XML_PARSER_EOF) {
4623 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4624 htmlParseTryOrFinish(ctxt, terminate);
4625 }
4626 if (terminate) {
4627 if ((ctxt->instate != XML_PARSER_EOF) &&
4628 (ctxt->instate != XML_PARSER_EPILOG) &&
4629 (ctxt->instate != XML_PARSER_MISC)) {
4630 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004631 ctxt->wellFormed = 0;
4632 }
4633 if (ctxt->instate != XML_PARSER_EOF) {
4634 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4635 ctxt->sax->endDocument(ctxt->userData);
4636 }
4637 ctxt->instate = XML_PARSER_EOF;
4638 }
4639 return((xmlParserErrors) ctxt->errNo);
4640}
4641
4642/************************************************************************
4643 * *
4644 * User entry points *
4645 * *
4646 ************************************************************************/
4647
4648/**
4649 * htmlCreatePushParserCtxt :
4650 * @sax: a SAX handler
4651 * @user_data: The user data returned on SAX callbacks
4652 * @chunk: a pointer to an array of chars
4653 * @size: number of chars in the array
4654 * @filename: an optional file name or URI
4655 * @enc: an optional encoding
4656 *
4657 * Create a parser context for using the HTML parser in push mode
4658 * To allow content encoding detection, @size should be >= 4
4659 * The value of @filename is used for fetching external entities
4660 * and error/warning reports.
4661 *
4662 * Returns the new parser context or NULL
4663 */
4664htmlParserCtxtPtr
4665htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4666 const char *chunk, int size, const char *filename,
4667 xmlCharEncoding enc) {
4668 htmlParserCtxtPtr ctxt;
4669 htmlParserInputPtr inputStream;
4670 xmlParserInputBufferPtr buf;
4671
4672 buf = xmlAllocParserInputBuffer(enc);
4673 if (buf == NULL) return(NULL);
4674
4675 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4676 if (ctxt == NULL) {
4677 xmlFree(buf);
4678 return(NULL);
4679 }
4680 memset(ctxt, 0, sizeof(htmlParserCtxt));
4681 htmlInitParserCtxt(ctxt);
4682 if (sax != NULL) {
4683 if (ctxt->sax != &htmlDefaultSAXHandler)
4684 xmlFree(ctxt->sax);
4685 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4686 if (ctxt->sax == NULL) {
4687 xmlFree(buf);
4688 xmlFree(ctxt);
4689 return(NULL);
4690 }
4691 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4692 if (user_data != NULL)
4693 ctxt->userData = user_data;
4694 }
4695 if (filename == NULL) {
4696 ctxt->directory = NULL;
4697 } else {
4698 ctxt->directory = xmlParserGetDirectory(filename);
4699 }
4700
4701 inputStream = htmlNewInputStream(ctxt);
4702 if (inputStream == NULL) {
4703 xmlFreeParserCtxt(ctxt);
4704 return(NULL);
4705 }
4706
4707 if (filename == NULL)
4708 inputStream->filename = NULL;
4709 else
4710 inputStream->filename = xmlMemStrdup(filename);
4711 inputStream->buf = buf;
4712 inputStream->base = inputStream->buf->buffer->content;
4713 inputStream->cur = inputStream->buf->buffer->content;
4714
4715 inputPush(ctxt, inputStream);
4716
4717 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4718 (ctxt->input->buf != NULL)) {
4719 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4720#ifdef DEBUG_PUSH
4721 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4722#endif
4723 }
4724
4725 return(ctxt);
4726}
4727
4728/**
4729 * htmlSAXParseDoc :
4730 * @cur: a pointer to an array of xmlChar
4731 * @encoding: a free form C string describing the HTML document encoding, or NULL
4732 * @sax: the SAX handler block
4733 * @userData: if using SAX, this pointer will be provided on callbacks.
4734 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004735 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4736 * to handle parse events. If sax is NULL, fallback to the default DOM
4737 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004738 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004739 * Returns the resulting document tree unless SAX is NULL or the document is
4740 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004741 */
4742
4743htmlDocPtr
4744htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4745 htmlDocPtr ret;
4746 htmlParserCtxtPtr ctxt;
4747
4748 if (cur == NULL) return(NULL);
4749
4750
4751 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4752 if (ctxt == NULL) return(NULL);
4753 if (sax != NULL) {
4754 ctxt->sax = sax;
4755 ctxt->userData = userData;
4756 }
4757
4758 htmlParseDocument(ctxt);
4759 ret = ctxt->myDoc;
4760 if (sax != NULL) {
4761 ctxt->sax = NULL;
4762 ctxt->userData = NULL;
4763 }
4764 htmlFreeParserCtxt(ctxt);
4765
4766 return(ret);
4767}
4768
4769/**
4770 * htmlParseDoc :
4771 * @cur: a pointer to an array of xmlChar
4772 * @encoding: a free form C string describing the HTML document encoding, or NULL
4773 *
4774 * parse an HTML in-memory document and build a tree.
4775 *
4776 * Returns the resulting document tree
4777 */
4778
4779htmlDocPtr
4780htmlParseDoc(xmlChar *cur, const char *encoding) {
4781 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4782}
4783
4784
4785/**
4786 * htmlCreateFileParserCtxt :
4787 * @filename: the filename
4788 * @encoding: a free form C string describing the HTML document encoding, or NULL
4789 *
4790 * Create a parser context for a file content.
4791 * Automatic support for ZLIB/Compress compressed document is provided
4792 * by default if found at compile-time.
4793 *
4794 * Returns the new parser context or NULL
4795 */
4796htmlParserCtxtPtr
4797htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4798{
4799 htmlParserCtxtPtr ctxt;
4800 htmlParserInputPtr inputStream;
4801 xmlParserInputBufferPtr buf;
4802 /* htmlCharEncoding enc; */
4803 xmlChar *content, *content_line = (xmlChar *) "charset=";
4804
4805 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4806 if (buf == NULL) return(NULL);
4807
4808 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4809 if (ctxt == NULL) {
4810 perror("malloc");
4811 return(NULL);
4812 }
4813 memset(ctxt, 0, sizeof(htmlParserCtxt));
4814 htmlInitParserCtxt(ctxt);
4815 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4816 if (inputStream == NULL) {
4817 perror("malloc");
4818 xmlFree(ctxt);
4819 return(NULL);
4820 }
4821 memset(inputStream, 0, sizeof(htmlParserInput));
4822
4823 inputStream->filename = xmlMemStrdup(filename);
4824 inputStream->line = 1;
4825 inputStream->col = 1;
4826 inputStream->buf = buf;
4827 inputStream->directory = NULL;
4828
4829 inputStream->base = inputStream->buf->buffer->content;
4830 inputStream->cur = inputStream->buf->buffer->content;
4831 inputStream->free = NULL;
4832
4833 inputPush(ctxt, inputStream);
4834
4835 /* set encoding */
4836 if (encoding) {
4837 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4838 if (content) {
4839 strcpy ((char *)content, (char *)content_line);
4840 strcat ((char *)content, (char *)encoding);
4841 htmlCheckEncoding (ctxt, content);
4842 xmlFree (content);
4843 }
4844 }
4845
4846 return(ctxt);
4847}
4848
4849/**
4850 * htmlSAXParseFile :
4851 * @filename: the filename
4852 * @encoding: a free form C string describing the HTML document encoding, or NULL
4853 * @sax: the SAX handler block
4854 * @userData: if using SAX, this pointer will be provided on callbacks.
4855 *
4856 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4857 * compressed document is provided by default if found at compile-time.
4858 * It use the given SAX function block to handle the parsing callback.
4859 * If sax is NULL, fallback to the default DOM tree building routines.
4860 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004861 * Returns the resulting document tree unless SAX is NULL or the document is
4862 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004863 */
4864
4865htmlDocPtr
4866htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4867 void *userData) {
4868 htmlDocPtr ret;
4869 htmlParserCtxtPtr ctxt;
4870 htmlSAXHandlerPtr oldsax = NULL;
4871
4872 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4873 if (ctxt == NULL) return(NULL);
4874 if (sax != NULL) {
4875 oldsax = ctxt->sax;
4876 ctxt->sax = sax;
4877 ctxt->userData = userData;
4878 }
4879
4880 htmlParseDocument(ctxt);
4881
4882 ret = ctxt->myDoc;
4883 if (sax != NULL) {
4884 ctxt->sax = oldsax;
4885 ctxt->userData = NULL;
4886 }
4887 htmlFreeParserCtxt(ctxt);
4888
4889 return(ret);
4890}
4891
4892/**
4893 * htmlParseFile :
4894 * @filename: the filename
4895 * @encoding: a free form C string describing the HTML document encoding, or NULL
4896 *
4897 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4898 * compressed document is provided by default if found at compile-time.
4899 *
4900 * Returns the resulting document tree
4901 */
4902
4903htmlDocPtr
4904htmlParseFile(const char *filename, const char *encoding) {
4905 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4906}
4907
4908/**
4909 * htmlHandleOmittedElem:
4910 * @val: int 0 or 1
4911 *
4912 * Set and return the previous value for handling HTML omitted tags.
4913 *
4914 * Returns the last value for 0 for no handling, 1 for auto insertion.
4915 */
4916
4917int
4918htmlHandleOmittedElem(int val) {
4919 int old = htmlOmittedDefaultValue;
4920
4921 htmlOmittedDefaultValue = val;
4922 return(old);
4923}
4924
4925#endif /* LIBXML_HTML_ENABLED */