blob: da4666c0d20adfa9a277c629c0a74fa6814075f8 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000043#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000044
45#define HTML_MAX_NAMELEN 1000
46#define HTML_PARSER_BIG_BUFFER_SIZE 1000
47#define HTML_PARSER_BUFFER_SIZE 100
48
49/* #define DEBUG */
50/* #define DEBUG_PUSH */
51
Daniel Veillard22090732001-07-16 00:06:07 +000052static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000053
Daniel Veillard56a4cb82001-03-24 17:00:36 +000054xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
55 xmlChar end, xmlChar end2, xmlChar end3);
56
57/************************************************************************
58 * *
Owen Taylor3473f882001-02-23 17:55:21 +000059 * Parser stacks related functions and macros *
60 * *
61 ************************************************************************/
62
63/*
64 * Generic function for accessing stacks in the Parser Context
65 */
66
67#define PUSH_AND_POP(scope, type, name) \
68scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
69 if (ctxt->name##Nr >= ctxt->name##Max) { \
70 ctxt->name##Max *= 2; \
71 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
72 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
73 if (ctxt->name##Tab == NULL) { \
74 xmlGenericError(xmlGenericErrorContext, \
75 "realloc failed !\n"); \
76 return(0); \
77 } \
78 } \
79 ctxt->name##Tab[ctxt->name##Nr] = value; \
80 ctxt->name = value; \
81 return(ctxt->name##Nr++); \
82} \
83scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
84 type ret; \
85 if (ctxt->name##Nr < 0) return(0); \
86 ctxt->name##Nr--; \
87 if (ctxt->name##Nr < 0) return(0); \
88 if (ctxt->name##Nr > 0) \
89 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
90 else \
91 ctxt->name = NULL; \
92 ret = ctxt->name##Tab[ctxt->name##Nr]; \
93 ctxt->name##Tab[ctxt->name##Nr] = 0; \
94 return(ret); \
95} \
96
Daniel Veillard56a4cb82001-03-24 17:00:36 +000097/* PUSH_AND_POP(static, xmlNodePtr, node) */
98PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000099
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *           (NOTE(review): no COPY macro is defined in this section).
 *
 * Macro arguments are parenthesized in the expansions so that passing an
 * expression behaves like passing a plain variable.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * NEXTL(l): advance over one character occupying l bytes, keeping the
 * line/column counters up to date and clearing any pending token.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * COPY_BUF(l,b,i,v): append the character v, encoded on l bytes, to the
 * buffer b at index i and advance i. Expands to an if/else statement, so
 * it must be used as a full statement followed by a semicolon.
 */
#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
176
177/**
178 * htmlCurrentChar:
179 * @ctxt: the HTML parser context
180 * @len: pointer to the length of the char read
181 *
182 * The current char value, if using UTF-8 this may actaully span multiple
183 * bytes in the input buffer. Implement the end of line normalization:
184 * 2.11 End-of-Line Handling
185 * If the encoding is unspecified, in the case we find an ISO-Latin-1
186 * char, then the encoding converter is plugged in automatically.
187 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000188 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000189 */
190
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000191static int
Owen Taylor3473f882001-02-23 17:55:21 +0000192htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
193 if (ctxt->instate == XML_PARSER_EOF)
194 return(0);
195
196 if (ctxt->token != 0) {
197 *len = 0;
198 return(ctxt->token);
199 }
200 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
201 /*
202 * We are supposed to handle UTF8, check it's valid
203 * From rfc2044: encoding of the Unicode values on UTF-8:
204 *
205 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
206 * 0000 0000-0000 007F 0xxxxxxx
207 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
208 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
209 *
210 * Check for the 0x110000 limit too
211 */
212 const unsigned char *cur = ctxt->input->cur;
213 unsigned char c;
214 unsigned int val;
215
216 c = *cur;
217 if (c & 0x80) {
218 if (cur[1] == 0)
219 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
220 if ((cur[1] & 0xc0) != 0x80)
221 goto encoding_error;
222 if ((c & 0xe0) == 0xe0) {
223
224 if (cur[2] == 0)
225 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
226 if ((cur[2] & 0xc0) != 0x80)
227 goto encoding_error;
228 if ((c & 0xf0) == 0xf0) {
229 if (cur[3] == 0)
230 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
231 if (((c & 0xf8) != 0xf0) ||
232 ((cur[3] & 0xc0) != 0x80))
233 goto encoding_error;
234 /* 4-byte code */
235 *len = 4;
236 val = (cur[0] & 0x7) << 18;
237 val |= (cur[1] & 0x3f) << 12;
238 val |= (cur[2] & 0x3f) << 6;
239 val |= cur[3] & 0x3f;
240 } else {
241 /* 3-byte code */
242 *len = 3;
243 val = (cur[0] & 0xf) << 12;
244 val |= (cur[1] & 0x3f) << 6;
245 val |= cur[2] & 0x3f;
246 }
247 } else {
248 /* 2-byte code */
249 *len = 2;
250 val = (cur[0] & 0x1f) << 6;
251 val |= cur[1] & 0x3f;
252 }
253 if (!IS_CHAR(val)) {
254 ctxt->errNo = XML_ERR_INVALID_ENCODING;
255 if ((ctxt->sax != NULL) &&
256 (ctxt->sax->error != NULL))
257 ctxt->sax->error(ctxt->userData,
258 "Char 0x%X out of allowed range\n", val);
259 ctxt->wellFormed = 0;
260 ctxt->disableSAX = 1;
261 }
262 return(val);
263 } else {
264 /* 1-byte code */
265 *len = 1;
266 return((int) *ctxt->input->cur);
267 }
268 }
269 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000270 * Assume it's a fixed length encoding (1) with
Owen Taylor3473f882001-02-23 17:55:21 +0000271 * a compatibke encoding for the ASCII set, since
272 * XML constructs only use < 128 chars
273 */
274 *len = 1;
275 if ((int) *ctxt->input->cur < 0x80)
276 return((int) *ctxt->input->cur);
277
278 /*
279 * Humm this is bad, do an automatic flow conversion
280 */
281 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
282 ctxt->charset = XML_CHAR_ENCODING_UTF8;
283 return(xmlCurrentChar(ctxt, len));
284
285encoding_error:
286 /*
287 * If we detect an UTF8 error that probably mean that the
288 * input encoding didn't get properly advertized in the
289 * declaration header. Report the error and switch the encoding
290 * to ISO-Latin-1 (if you don't like this policy, just declare the
291 * encoding !)
292 */
293 ctxt->errNo = XML_ERR_INVALID_ENCODING;
294 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
295 ctxt->sax->error(ctxt->userData,
296 "Input is not proper UTF-8, indicate encoding !\n");
297 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
298 ctxt->input->cur[0], ctxt->input->cur[1],
299 ctxt->input->cur[2], ctxt->input->cur[3]);
300 }
301
302 ctxt->charset = XML_CHAR_ENCODING_8859_1;
303 *len = 1;
304 return((int) *ctxt->input->cur);
305}
306
307/**
Owen Taylor3473f882001-02-23 17:55:21 +0000308 * htmlSkipBlankChars:
309 * @ctxt: the HTML parser context
310 *
311 * skip all blanks character found at that point in the input streams.
312 *
313 * Returns the number of space chars skipped
314 */
315
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000316static int
Owen Taylor3473f882001-02-23 17:55:21 +0000317htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
318 int res = 0;
319
320 while (IS_BLANK(*(ctxt->input->cur))) {
321 if ((*ctxt->input->cur == 0) &&
322 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
323 xmlPopInput(ctxt);
324 } else {
325 if (*(ctxt->input->cur) == '\n') {
326 ctxt->input->line++; ctxt->input->col = 1;
327 } else ctxt->input->col++;
328 ctxt->input->cur++;
329 ctxt->nbChars++;
330 if (*ctxt->input->cur == 0)
331 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
332 }
333 res++;
334 }
335 return(res);
336}
337
338
339
340/************************************************************************
341 * *
342 * The list of HTML elements and their properties *
343 * *
344 ************************************************************************/
345
346/*
347 * Start Tag: 1 means the start tag can be ommited
348 * End Tag: 1 means the end tag can be ommited
349 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000350 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000351 * Depr: this element is deprecated
352 * DTD: 1 means that this element is valid only in the Loose DTD
353 * 2 means that this element is valid only in the Frameset DTD
354 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000355 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000356 */
Daniel Veillard22090732001-07-16 00:06:07 +0000357static const htmlElemDesc
358html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000359{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
360{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
361{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
362{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
363{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
364{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
365{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
366{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
367{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
368{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
369{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
370{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
371{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
372{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
373{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
374{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
375{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
376{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
377{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
378{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
379{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
380{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
381{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
382{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
383{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
384{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
385{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
386{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
387{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
388{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
389{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
390{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
391{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
392{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
393{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
394{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
398{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
399{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
400{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
401{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
402{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
403{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
404{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
405{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
406{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
407{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
408{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
409{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
410{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
411{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
412{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
413{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
414{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
415{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
416{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
417{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
418{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
419{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
420{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
421{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
422{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
423{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
424{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
425{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
426{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
427{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
428{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
429{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
430{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
431{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
432{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
433{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
434{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
435{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
436{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
437{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
438{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
439{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
440{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
441{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
442{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
443{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
444{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
445{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
446{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
447{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
448{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
449{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000450};
451
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups. In each group the
 * first string is the tag being opened and the following strings are the
 * elements that it closes. An extra NULL ends the whole list.
 */
static const char *htmlStartClose[] = {
"form",
    "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
    "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head", NULL,
"head",
    "p", NULL,
"title",
    "p", NULL,
"body",
    "head", "style", "link", "title", "p", NULL,
"li",
    "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
    "listing", "xmp", "head", "li", NULL,
"hr",
    "p", "head", NULL,
"h1",
    "p", "head", NULL,
"h2",
    "p", "head", NULL,
"h3",
    "p", "head", NULL,
"h4",
    "p", "head", NULL,
"h5",
    "p", "head", NULL,
"h6",
    "p", "head", NULL,
"dir",
    "p", "head", NULL,
"address",
    "p", "head", "ul", NULL,
"pre",
    "p", "head", "ul", NULL,
"listing",
    "p", "head", NULL,
"xmp",
    "p", "head", NULL,
"blockquote",
    "p", "head", NULL,
"dl",
    "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
    "head", NULL,
"dt",
    "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
    "dd", NULL,
"dd",
    "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
    "dt", NULL,
"ul",
    "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
    "xmp", NULL,
"ol",
    "p", "head", "ul", NULL,
"menu",
    "p", "head", "ul", NULL,
"p",
    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",
    "p", "head", NULL,
"noscript",
    "p", "head", NULL,
"center",
    "font", "b", "i", "p", "head", NULL,
"a",
    "a", NULL,
"caption",
    "p", NULL,
"colgroup",
    "caption", "colgroup", "col", "p", NULL,
"col",
    "caption", "col", "p", NULL,
"table",
    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
    "xmp", "a", NULL,
"th",
    "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",
    "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",
    "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",
    "caption", "col", "colgroup", NULL,
"tfoot",
    "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
    "p", NULL,
"tbody",
    "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
    "tbody", "p", NULL,
"optgroup",
    "option", NULL,
"option",
    "option", NULL,
"fieldset",
    "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
    "listing", "xmp", "a", NULL,
NULL
};
511
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
525
/*
 * The list of HTML attributes which are of content %Script;
 * i.e. the HTML 4 intrinsic event attributes.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The list has no NULL terminator; use sizeof to get the entry count.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest" and so never matched */
    "onchange",
    "onselect"
};
551
/*
 * End tag priorities, used by the HTML parser to cope with broken pages.
 * Every element gets a priority; an extra end tag is only allowed to close
 * elements whose priority is lower than or equal to its own.
 */

/* One (element name, priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The table is scanned linearly. The final entry, whose name is NULL,
 * carries the priority used for every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000579
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000580static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000581static int htmlStartCloseIndexinitialized = 0;
582
583/************************************************************************
584 * *
585 * functions to handle HTML specific data *
586 * *
587 ************************************************************************/
588
589/**
590 * htmlInitAutoClose:
591 *
592 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
593 * This is not reentrant. Call xmlInitParser() once before processing in
594 * case of use in multithreaded programs.
595 */
596void
597htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000598 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000599
600 if (htmlStartCloseIndexinitialized) return;
601
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000602 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
603 indx = 0;
604 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
605 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000606 while (htmlStartClose[i] != NULL) i++;
607 i++;
608 }
609 htmlStartCloseIndexinitialized = 1;
610}
611
612/**
613 * htmlTagLookup:
614 * @tag: The tag name in lowercase
615 *
616 * Lookup the HTML tag in the ElementTable
617 *
618 * Returns the related htmlElemDescPtr or NULL if not found.
619 */
Daniel Veillardbb371292001-08-16 23:26:59 +0000620const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +0000621htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000622 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000623
624 for (i = 0; i < (sizeof(html40ElementTable) /
625 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000626 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard22090732001-07-16 00:06:07 +0000627 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +0000628 }
629 return(NULL);
630}
631
632/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000633 * htmlGetEndPriority:
634 * @name: The name of the element to look up the priority for.
635 *
636 * Return value: The "endtag" priority.
637 **/
638static int
639htmlGetEndPriority (const xmlChar *name) {
640 int i = 0;
641
642 while ((htmlEndPriority[i].name != NULL) &&
643 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
644 i++;
645
646 return(htmlEndPriority[i].priority);
647}
648
649/**
Owen Taylor3473f882001-02-23 17:55:21 +0000650 * htmlCheckAutoClose:
651 * @newtag: The new tag name
652 * @oldtag: The old tag name
653 *
654 * Checks wether the new tag is one of the registered valid tags for closing old.
655 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
656 *
657 * Returns 0 if no, 1 if yes.
658 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000659static int
Owen Taylor3473f882001-02-23 17:55:21 +0000660htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000661 int i, indx;
662 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000663
664 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
665
666 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000667 for (indx = 0; indx < 100;indx++) {
668 closed = htmlStartCloseIndex[indx];
669 if (closed == NULL) return(0);
670 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000671 }
672
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000673 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000674 i++;
675 while (htmlStartClose[i] != NULL) {
676 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
677 return(1);
678 }
679 i++;
680 }
681 return(0);
682}
683
684/**
685 * htmlAutoCloseOnClose:
686 * @ctxt: an HTML parser context
687 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000688 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000689 *
690 * The HTmL DtD allows an ending tag to implicitely close other tags.
691 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000692static void
Owen Taylor3473f882001-02-23 17:55:21 +0000693htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardbb371292001-08-16 23:26:59 +0000694 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +0000695 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000696 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000697
698#ifdef DEBUG
699 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
700 for (i = 0;i < ctxt->nameNr;i++)
701 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
702#endif
703
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000704 priority = htmlGetEndPriority (newtag);
705
Owen Taylor3473f882001-02-23 17:55:21 +0000706 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000707
Owen Taylor3473f882001-02-23 17:55:21 +0000708 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000709 /*
710 * A missplaced endtagad can only close elements with lower
711 * or equal priority, so if we find an element with higher
712 * priority before we find an element with
713 * matching name, we just ignore this endtag
714 */
715 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000716 }
717 if (i < 0) return;
718
719 while (!xmlStrEqual(newtag, ctxt->name)) {
720 info = htmlTagLookup(ctxt->name);
721 if ((info == NULL) || (info->endTag == 1)) {
722#ifdef DEBUG
723 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
724#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000725 } else if (info->endTag == 3) {
726#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000727 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack1633d182001-10-05 15:41:19 +0000728
Daniel Veillard56098d42001-04-24 12:51:09 +0000729#endif
730 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
731 ctxt->sax->error(ctxt->userData,
732 "Opening and ending tag mismatch: %s and %s\n",
733 newtag, ctxt->name);
734 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000735 }
736 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
737 ctxt->sax->endElement(ctxt->userData, ctxt->name);
738 oldname = htmlnamePop(ctxt);
739 if (oldname != NULL) {
740#ifdef DEBUG
741 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
742#endif
743 xmlFree(oldname);
744 }
745 }
746}
747
748/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000749 * htmlAutoCloseOnEnd:
750 * @ctxt: an HTML parser context
751 *
752 * Close all remaining tags at the end of the stream
753 */
754static void
755htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
756 xmlChar *oldname;
757 int i;
758
759 if (ctxt->nameNr == 0)
760 return;
761#ifdef DEBUG
762 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
763#endif
764
765 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
766#ifdef DEBUG
767 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
768#endif
769 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
770 ctxt->sax->endElement(ctxt->userData, ctxt->name);
771 oldname = htmlnamePop(ctxt);
772 if (oldname != NULL) {
773#ifdef DEBUG
774 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
775#endif
776 xmlFree(oldname);
777 }
778 }
779}
780
781/**
Owen Taylor3473f882001-02-23 17:55:21 +0000782 * htmlAutoClose:
783 * @ctxt: an HTML parser context
784 * @newtag: The new tag name or NULL
785 *
786 * The HTmL DtD allows a tag to implicitely close other tags.
787 * The list is kept in htmlStartClose array. This function is
788 * called when a new tag has been detected and generates the
789 * appropriates closes if possible/needed.
790 * If newtag is NULL this mean we are at the end of the resource
791 * and we should check
792 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000793static void
Owen Taylor3473f882001-02-23 17:55:21 +0000794htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
795 xmlChar *oldname;
796 while ((newtag != NULL) && (ctxt->name != NULL) &&
797 (htmlCheckAutoClose(newtag, ctxt->name))) {
798#ifdef DEBUG
799 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
800#endif
801 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
802 ctxt->sax->endElement(ctxt->userData, ctxt->name);
803 oldname = htmlnamePop(ctxt);
804 if (oldname != NULL) {
805#ifdef DEBUG
806 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
807#endif
808 xmlFree(oldname);
809 }
810 }
811 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000812 htmlAutoCloseOnEnd(ctxt);
813 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000814 }
815 while ((newtag == NULL) && (ctxt->name != NULL) &&
816 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
817 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
818 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
819#ifdef DEBUG
820 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
821#endif
822 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
823 ctxt->sax->endElement(ctxt->userData, ctxt->name);
824 oldname = htmlnamePop(ctxt);
825 if (oldname != NULL) {
826#ifdef DEBUG
827 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
828#endif
829 xmlFree(oldname);
830 }
831 }
832
833}
834
835/**
836 * htmlAutoCloseTag:
837 * @doc: the HTML document
838 * @name: The tag name
839 * @elem: the HTML element
840 *
841 * The HTmL DtD allows a tag to implicitely close other tags.
842 * The list is kept in htmlStartClose array. This function checks
843 * if the element or one of it's children would autoclose the
844 * given tag.
845 *
846 * Returns 1 if autoclose, 0 otherwise
847 */
848int
849htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
850 htmlNodePtr child;
851
852 if (elem == NULL) return(1);
853 if (xmlStrEqual(name, elem->name)) return(0);
854 if (htmlCheckAutoClose(elem->name, name)) return(1);
855 child = elem->children;
856 while (child != NULL) {
857 if (htmlAutoCloseTag(doc, name, child)) return(1);
858 child = child->next;
859 }
860 return(0);
861}
862
863/**
864 * htmlIsAutoClosed:
865 * @doc: the HTML document
866 * @elem: the HTML element
867 *
868 * The HTmL DtD allows a tag to implicitely close other tags.
869 * The list is kept in htmlStartClose array. This function checks
870 * if a tag is autoclosed by one of it's child
871 *
872 * Returns 1 if autoclosed, 0 otherwise
873 */
874int
875htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
876 htmlNodePtr child;
877
878 if (elem == NULL) return(1);
879 child = elem->children;
880 while (child != NULL) {
881 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
882 child = child->next;
883 }
884 return(0);
885}
886
887/**
888 * htmlCheckImplied:
889 * @ctxt: an HTML parser context
890 * @newtag: The new tag name
891 *
892 * The HTML DtD allows a tag to exists only implicitely
893 * called when a new tag has been detected and generates the
894 * appropriates implicit tags if missing
895 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000896static void
Owen Taylor3473f882001-02-23 17:55:21 +0000897htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
898 if (!htmlOmittedDefaultValue)
899 return;
900 if (xmlStrEqual(newtag, BAD_CAST"html"))
901 return;
902 if (ctxt->nameNr <= 0) {
903#ifdef DEBUG
904 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
905#endif
906 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
907 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
908 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
909 }
910 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
911 return;
912 if ((ctxt->nameNr <= 1) &&
913 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
914 (xmlStrEqual(newtag, BAD_CAST"style")) ||
915 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
916 (xmlStrEqual(newtag, BAD_CAST"link")) ||
917 (xmlStrEqual(newtag, BAD_CAST"title")) ||
918 (xmlStrEqual(newtag, BAD_CAST"base")))) {
919 /*
920 * dropped OBJECT ... i you put it first BODY will be
921 * assumed !
922 */
923#ifdef DEBUG
924 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
925#endif
926 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
927 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
928 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
929 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
930 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
931 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
932 int i;
933 for (i = 0;i < ctxt->nameNr;i++) {
934 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
935 return;
936 }
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
938 return;
939 }
940 }
941
942#ifdef DEBUG
943 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
944#endif
945 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
946 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
947 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
948 }
949}
950
951/**
952 * htmlCheckParagraph
953 * @ctxt: an HTML parser context
954 *
955 * Check whether a p element need to be implied before inserting
956 * characters in the current element.
957 *
958 * Returns 1 if a paragraph has been inserted, 0 if not and -1
959 * in case of error.
960 */
961
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000962static int
Owen Taylor3473f882001-02-23 17:55:21 +0000963htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
964 const xmlChar *tag;
965 int i;
966
967 if (ctxt == NULL)
968 return(-1);
969 tag = ctxt->name;
970 if (tag == NULL) {
971 htmlAutoClose(ctxt, BAD_CAST"p");
972 htmlCheckImplied(ctxt, BAD_CAST"p");
973 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
974 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
975 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
976 return(1);
977 }
978 if (!htmlOmittedDefaultValue)
979 return(0);
980 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
981 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
982#ifdef DEBUG
983 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
984#endif
985 htmlAutoClose(ctxt, BAD_CAST"p");
986 htmlCheckImplied(ctxt, BAD_CAST"p");
987 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
988 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
989 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
990 return(1);
991 }
992 }
993 return(0);
994}
995
996/**
997 * htmlIsScriptAttribute:
998 * @name: an attribute name
999 *
1000 * Check if an attribute is of content type Script
1001 *
1002 * Returns 1 is the attribute is a script 0 otherwise
1003 */
1004int
1005htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001006 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001007
1008 if (name == NULL)
1009 return(0);
1010 /*
1011 * all script attributes start with 'on'
1012 */
1013 if ((name[0] != 'o') || (name[1] != 'n'))
1014 return(0);
1015 for (i = 0;
1016 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1017 i++) {
1018 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1019 return(1);
1020 }
1021 return(0);
1022}
1023
1024/************************************************************************
1025 * *
1026 * The list of HTML predefined entities *
1027 * *
1028 ************************************************************************/
1029
1030
Daniel Veillard22090732001-07-16 00:06:07 +00001031static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001032/*
1033 * the 4 absolute ones, plus apostrophe.
1034 */
1035{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1036{ 38, "amp", "ampersand, U+0026 ISOnum" },
1037{ 39, "apos", "single quote" },
1038{ 60, "lt", "less-than sign, U+003C ISOnum" },
1039{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1040
1041/*
1042 * A bunch still in the 128-255 range
1043 * Replacing them depend really on the charset used.
1044 */
1045{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1046{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1047{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1048{ 163, "pound","pound sign, U+00A3 ISOnum" },
1049{ 164, "curren","currency sign, U+00A4 ISOnum" },
1050{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1051{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1052{ 167, "sect", "section sign, U+00A7 ISOnum" },
1053{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1054{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1055{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1056{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1057{ 172, "not", "not sign, U+00AC ISOnum" },
1058{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1059{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1060{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1061{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1062{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1063{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1064{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1065{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1066{ 181, "micro","micro sign, U+00B5 ISOnum" },
1067{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1068{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1069{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1070{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1071{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1072{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1073{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1074{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1075{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1076{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1077{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1078{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1079{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1080{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1081{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1082{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1083{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1084{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1085{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1086{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1087{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1088{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1089{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1090{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1091{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1092{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1093{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1094{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1095{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1096{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1097{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1098{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1099{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1100{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1101{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1102{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1103{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1104{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1105{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1106{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1107{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1108{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1109{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1110{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1111{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1112{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1113{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1114{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1115{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1116{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1117{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1118{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1119{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1120{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1121{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1122{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1123{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1124{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1125{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1126{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1127{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1128{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1129{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1130{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1131{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1132{ 247, "divide","division sign, U+00F7 ISOnum" },
1133{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1134{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1135{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1136{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1137{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1138{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1139{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1140{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1141
1142{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1143{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1144{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1145{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1146{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1147
1148/*
1149 * Anything below should really be kept as entities references
1150 */
1151{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1152
1153{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1154{ 732, "tilde","small tilde, U+02DC ISOdia" },
1155
1156{ 913, "Alpha","greek capital letter alpha, U+0391" },
1157{ 914, "Beta", "greek capital letter beta, U+0392" },
1158{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1159{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1160{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1161{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1162{ 919, "Eta", "greek capital letter eta, U+0397" },
1163{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1164{ 921, "Iota", "greek capital letter iota, U+0399" },
1165{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001166{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001167{ 924, "Mu", "greek capital letter mu, U+039C" },
1168{ 925, "Nu", "greek capital letter nu, U+039D" },
1169{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1170{ 927, "Omicron","greek capital letter omicron, U+039F" },
1171{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1172{ 929, "Rho", "greek capital letter rho, U+03A1" },
1173{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1174{ 932, "Tau", "greek capital letter tau, U+03A4" },
1175{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1176{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1177{ 935, "Chi", "greek capital letter chi, U+03A7" },
1178{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1179{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1180
1181{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1182{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1183{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1184{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1185{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1186{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1187{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1188{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1189{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1190{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1191{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1192{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1193{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1194{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1195{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1196{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1197{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1198{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1199{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1200{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1201{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1202{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1203{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1204{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1205{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1206{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1207{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1208{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1209
1210{ 8194, "ensp", "en space, U+2002 ISOpub" },
1211{ 8195, "emsp", "em space, U+2003 ISOpub" },
1212{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1213{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1214{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1215{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1216{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1217{ 8211, "ndash","en dash, U+2013 ISOpub" },
1218{ 8212, "mdash","em dash, U+2014 ISOpub" },
1219{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1220{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1221{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1222{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1223{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1224{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1225{ 8224, "dagger","dagger, U+2020 ISOpub" },
1226{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1227
1228{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1229{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1230
1231{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1232
1233{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1234{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1235
1236{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1237{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1238
1239{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1240{ 8260, "frasl","fraction slash, U+2044 NEW" },
1241
1242{ 8364, "euro", "euro sign, U+20AC NEW" },
1243
1244{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1245{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1246{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1247{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1248{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1249{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1250{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1251{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1252{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1253{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1254{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1255{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1256{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1257{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1258{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1259{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1260
1261{ 8704, "forall","for all, U+2200 ISOtech" },
1262{ 8706, "part", "partial differential, U+2202 ISOtech" },
1263{ 8707, "exist","there exists, U+2203 ISOtech" },
1264{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1265{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1266{ 8712, "isin", "element of, U+2208 ISOtech" },
1267{ 8713, "notin","not an element of, U+2209 ISOtech" },
1268{ 8715, "ni", "contains as member, U+220B ISOtech" },
1269{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1270{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1271{ 8722, "minus","minus sign, U+2212 ISOtech" },
1272{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1273{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1274{ 8733, "prop", "proportional to, U+221D ISOtech" },
1275{ 8734, "infin","infinity, U+221E ISOtech" },
1276{ 8736, "ang", "angle, U+2220 ISOamso" },
1277{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1278{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1279{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1280{ 8746, "cup", "union = cup, U+222A ISOtech" },
1281{ 8747, "int", "integral, U+222B ISOtech" },
1282{ 8756, "there4","therefore, U+2234 ISOtech" },
1283{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1284{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1285{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1286{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1287{ 8801, "equiv","identical to, U+2261 ISOtech" },
1288{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1289{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1290{ 8834, "sub", "subset of, U+2282 ISOtech" },
1291{ 8835, "sup", "superset of, U+2283 ISOtech" },
1292{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1293{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1294{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1295{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1296{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1297{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1298{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1299{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1300{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1301{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1302{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1303{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1304{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1305{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1306
1307{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1308{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1309{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1310{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1311
1312};
1313
1314/************************************************************************
1315 * *
1316 * Commodity functions to handle entities *
1317 * *
1318 ************************************************************************/
1319
/*
 * Macro used to grow the current buffer.
 *
 * It expects a companion size variable named <buffer>_size in scope,
 * doubles it and reallocates <buffer> accordingly.
 * On allocation failure the error is reported with perror(), the
 * previous buffer is released (so that it does not leak) and the
 * ENCLOSING FUNCTION returns NULL; the macro can therefore only be
 * used in functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                  buffer##_size * sizeof(xmlChar));	\
    if (buffer##_tmp == NULL) {						\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1331
1332/**
1333 * htmlEntityLookup:
1334 * @name: the entity name
1335 *
1336 * Lookup the given entity in EntitiesTable
1337 *
1338 * TODO: the linear scan is really ugly, an hash table is really needed.
1339 *
1340 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1341 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001342const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001343htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001344 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001345
1346 for (i = 0;i < (sizeof(html40EntitiesTable)/
1347 sizeof(html40EntitiesTable[0]));i++) {
1348 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1349#ifdef DEBUG
1350 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1351#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001352 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001353 }
1354 }
1355 return(NULL);
1356}
1357
1358/**
1359 * htmlEntityValueLookup:
1360 * @value: the entity's unicode value
1361 *
1362 * Lookup the given entity in EntitiesTable
1363 *
1364 * TODO: the linear scan is really ugly, an hash table is really needed.
1365 *
1366 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1367 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001368const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001369htmlEntityValueLookup(unsigned int value) {
1370 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001371#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001372 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001373#endif
1374
1375 for (i = 0;i < (sizeof(html40EntitiesTable)/
1376 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001377 if (html40EntitiesTable[i].value >= value) {
1378 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001379 break;
1380#ifdef DEBUG
1381 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1382#endif
Daniel Veillard22090732001-07-16 00:06:07 +00001383 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001384 }
1385#ifdef DEBUG
1386 if (lv > html40EntitiesTable[i].value) {
1387 xmlGenericError(xmlGenericErrorContext,
1388 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1389 lv, html40EntitiesTable[i].value);
1390 }
1391 lv = html40EntitiesTable[i].value;
1392#endif
1393 }
1394 return(NULL);
1395}
1396
1397/**
1398 * UTF8ToHtml:
1399 * @out: a pointer to an array of bytes to store the result
1400 * @outlen: the length of @out
1401 * @in: a pointer to an array of UTF-8 chars
1402 * @inlen: the length of @in
1403 *
1404 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1405 * plus HTML entities block of chars out.
1406 *
1407 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1408 * The value of @inlen after return is the number of octets consumed
1409 * as the return value is positive, else unpredictiable.
1410 * The value of @outlen after return is the number of octets consumed.
1411 */
1412int
1413UTF8ToHtml(unsigned char* out, int *outlen,
1414 const unsigned char* in, int *inlen) {
1415 const unsigned char* processed = in;
1416 const unsigned char* outend;
1417 const unsigned char* outstart = out;
1418 const unsigned char* instart = in;
1419 const unsigned char* inend;
1420 unsigned int c, d;
1421 int trailing;
1422
1423 if (in == NULL) {
1424 /*
1425 * initialization nothing to do
1426 */
1427 *outlen = 0;
1428 *inlen = 0;
1429 return(0);
1430 }
1431 inend = in + (*inlen);
1432 outend = out + (*outlen);
1433 while (in < inend) {
1434 d = *in++;
1435 if (d < 0x80) { c= d; trailing= 0; }
1436 else if (d < 0xC0) {
1437 /* trailing byte in leading position */
1438 *outlen = out - outstart;
1439 *inlen = processed - instart;
1440 return(-2);
1441 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1442 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1443 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1444 else {
1445 /* no chance for this in Ascii */
1446 *outlen = out - outstart;
1447 *inlen = processed - instart;
1448 return(-2);
1449 }
1450
1451 if (inend - in < trailing) {
1452 break;
1453 }
1454
1455 for ( ; trailing; trailing--) {
1456 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1457 break;
1458 c <<= 6;
1459 c |= d & 0x3F;
1460 }
1461
1462 /* assertion: c is a single UTF-4 value */
1463 if (c < 0x80) {
1464 if (out + 1 >= outend)
1465 break;
1466 *out++ = c;
1467 } else {
1468 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001469 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001470
1471 /*
1472 * Try to lookup a predefined HTML entity for it
1473 */
1474
1475 ent = htmlEntityValueLookup(c);
1476 if (ent == NULL) {
1477 /* no chance for this in Ascii */
1478 *outlen = out - outstart;
1479 *inlen = processed - instart;
1480 return(-2);
1481 }
1482 len = strlen(ent->name);
1483 if (out + 2 + len >= outend)
1484 break;
1485 *out++ = '&';
1486 memcpy(out, ent->name, len);
1487 out += len;
1488 *out++ = ';';
1489 }
1490 processed = in;
1491 }
1492 *outlen = out - outstart;
1493 *inlen = processed - instart;
1494 return(0);
1495}
1496
1497/**
1498 * htmlEncodeEntities:
1499 * @out: a pointer to an array of bytes to store the result
1500 * @outlen: the length of @out
1501 * @in: a pointer to an array of UTF-8 chars
1502 * @inlen: the length of @in
1503 * @quoteChar: the quote character to escape (' or ") or zero.
1504 *
1505 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1506 * plus HTML entities block of chars out.
1507 *
1508 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1509 * The value of @inlen after return is the number of octets consumed
1510 * as the return value is positive, else unpredictiable.
1511 * The value of @outlen after return is the number of octets consumed.
1512 */
1513int
1514htmlEncodeEntities(unsigned char* out, int *outlen,
1515 const unsigned char* in, int *inlen, int quoteChar) {
1516 const unsigned char* processed = in;
1517 const unsigned char* outend = out + (*outlen);
1518 const unsigned char* outstart = out;
1519 const unsigned char* instart = in;
1520 const unsigned char* inend = in + (*inlen);
1521 unsigned int c, d;
1522 int trailing;
1523
1524 while (in < inend) {
1525 d = *in++;
1526 if (d < 0x80) { c= d; trailing= 0; }
1527 else if (d < 0xC0) {
1528 /* trailing byte in leading position */
1529 *outlen = out - outstart;
1530 *inlen = processed - instart;
1531 return(-2);
1532 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1533 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1534 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1535 else {
1536 /* no chance for this in Ascii */
1537 *outlen = out - outstart;
1538 *inlen = processed - instart;
1539 return(-2);
1540 }
1541
1542 if (inend - in < trailing)
1543 break;
1544
1545 while (trailing--) {
1546 if (((d= *in++) & 0xC0) != 0x80) {
1547 *outlen = out - outstart;
1548 *inlen = processed - instart;
1549 return(-2);
1550 }
1551 c <<= 6;
1552 c |= d & 0x3F;
1553 }
1554
1555 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001556 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1557 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001558 if (out >= outend)
1559 break;
1560 *out++ = c;
1561 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001562 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001563 const char *cp;
1564 char nbuf[16];
1565 int len;
1566
1567 /*
1568 * Try to lookup a predefined HTML entity for it
1569 */
1570 ent = htmlEntityValueLookup(c);
1571 if (ent == NULL) {
1572 sprintf(nbuf, "#%u", c);
1573 cp = nbuf;
1574 }
1575 else
1576 cp = ent->name;
1577 len = strlen(cp);
1578 if (out + 2 + len > outend)
1579 break;
1580 *out++ = '&';
1581 memcpy(out, cp, len);
1582 out += len;
1583 *out++ = ';';
1584 }
1585 processed = in;
1586 }
1587 *outlen = out - outstart;
1588 *inlen = processed - instart;
1589 return(0);
1590}
1591
1592/**
1593 * htmlDecodeEntities:
1594 * @ctxt: the parser context
1595 * @len: the len to decode (in bytes !), -1 for no size limit
1596 * @end: an end marker xmlChar, 0 if none
1597 * @end2: an end marker xmlChar, 0 if none
1598 * @end3: an end marker xmlChar, 0 if none
1599 *
1600 * Subtitute the HTML entities by their value
1601 *
1602 * DEPRECATED !!!!
1603 *
1604 * Returns A newly allocated string with the substitution done. The caller
1605 * must deallocate it !
1606 */
1607xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001608htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1609 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001610 static int deprecated = 0;
1611 if (!deprecated) {
1612 xmlGenericError(xmlGenericErrorContext,
1613 "htmlDecodeEntities() deprecated function reached\n");
1614 deprecated = 1;
1615 }
1616 return(NULL);
1617#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001618 xmlChar *name = NULL;
1619 xmlChar *buffer = NULL;
1620 unsigned int buffer_size = 0;
1621 unsigned int nbchars = 0;
1622 htmlEntityDescPtr ent;
1623 unsigned int max = (unsigned int) len;
1624 int c,l;
1625
1626 if (ctxt->depth > 40) {
1627 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1628 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1629 ctxt->sax->error(ctxt->userData,
1630 "Detected entity reference loop\n");
1631 ctxt->wellFormed = 0;
1632 ctxt->disableSAX = 1;
1633 return(NULL);
1634 }
1635
1636 /*
1637 * allocate a translation buffer.
1638 */
1639 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1640 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1641 if (buffer == NULL) {
1642 perror("xmlDecodeEntities: malloc failed");
1643 return(NULL);
1644 }
1645
1646 /*
1647 * Ok loop until we reach one of the ending char or a size limit.
1648 */
1649 c = CUR_CHAR(l);
1650 while ((nbchars < max) && (c != end) &&
1651 (c != end2) && (c != end3)) {
1652
1653 if (c == 0) break;
1654 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1655 int val = htmlParseCharRef(ctxt);
1656 COPY_BUF(0,buffer,nbchars,val);
1657 NEXTL(l);
1658 } else if ((c == '&') && (ctxt->token != '&')) {
1659 ent = htmlParseEntityRef(ctxt, &name);
1660 if (name != NULL) {
1661 if (ent != NULL) {
1662 int val = ent->value;
1663 COPY_BUF(0,buffer,nbchars,val);
1664 NEXTL(l);
1665 } else {
1666 const xmlChar *cur = name;
1667
1668 buffer[nbchars++] = '&';
1669 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1670 growBuffer(buffer);
1671 }
1672 while (*cur != 0) {
1673 buffer[nbchars++] = *cur++;
1674 }
1675 buffer[nbchars++] = ';';
1676 }
1677 }
1678 } else {
1679 COPY_BUF(l,buffer,nbchars,c);
1680 NEXTL(l);
1681 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1682 growBuffer(buffer);
1683 }
1684 }
1685 c = CUR_CHAR(l);
1686 }
1687 buffer[nbchars++] = 0;
1688 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001689#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001690}
1691
1692/************************************************************************
1693 * *
1694 * Commodity functions to handle streams *
1695 * *
1696 ************************************************************************/
1697
1698/**
Owen Taylor3473f882001-02-23 17:55:21 +00001699 * htmlNewInputStream:
1700 * @ctxt: an HTML parser context
1701 *
1702 * Create a new input stream structure
1703 * Returns the new input stream or NULL
1704 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001705static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001706htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1707 htmlParserInputPtr input;
1708
1709 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1710 if (input == NULL) {
1711 ctxt->errNo = XML_ERR_NO_MEMORY;
1712 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1713 ctxt->sax->error(ctxt->userData,
1714 "malloc: couldn't allocate a new input stream\n");
1715 return(NULL);
1716 }
1717 memset(input, 0, sizeof(htmlParserInput));
1718 input->filename = NULL;
1719 input->directory = NULL;
1720 input->base = NULL;
1721 input->cur = NULL;
1722 input->buf = NULL;
1723 input->line = 1;
1724 input->col = 1;
1725 input->buf = NULL;
1726 input->free = NULL;
1727 input->version = NULL;
1728 input->consumed = 0;
1729 input->length = 0;
1730 return(input);
1731}
1732
1733
1734/************************************************************************
1735 * *
1736 * Commodity functions, cleanup needed ? *
1737 * *
1738 ************************************************************************/
1739
1740/**
1741 * areBlanks:
1742 * @ctxt: an HTML parser context
1743 * @str: a xmlChar *
1744 * @len: the size of @str
1745 *
1746 * Is this a sequence of blank chars that one can ignore ?
1747 *
1748 * Returns 1 if ignorable 0 otherwise.
1749 */
1750
1751static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1752 int i;
1753 xmlNodePtr lastChild;
1754
1755 for (i = 0;i < len;i++)
1756 if (!(IS_BLANK(str[i]))) return(0);
1757
1758 if (CUR == 0) return(1);
1759 if (CUR != '<') return(0);
1760 if (ctxt->name == NULL)
1761 return(1);
1762 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1763 return(1);
1764 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1765 return(1);
1766 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1767 return(1);
1768 if (ctxt->node == NULL) return(0);
1769 lastChild = xmlGetLastChild(ctxt->node);
1770 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00001771 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1772 (ctxt->node->content != NULL)) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001773 } else if (xmlNodeIsText(lastChild)) {
1774 return(0);
1775 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1776 return(0);
1777 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1778 return(0);
1779 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1780 return(0);
1781 }
1782 return(1);
1783}
1784
1785/**
Owen Taylor3473f882001-02-23 17:55:21 +00001786 * htmlNewDocNoDtD:
1787 * @URI: URI for the dtd, or NULL
1788 * @ExternalID: the external ID of the DTD, or NULL
1789 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001790 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1791 * are NULL
1792 *
Owen Taylor3473f882001-02-23 17:55:21 +00001793 * Returns a new document, do not intialize the DTD if not provided
1794 */
1795htmlDocPtr
1796htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1797 xmlDocPtr cur;
1798
1799 /*
1800 * Allocate a new document and fill the fields.
1801 */
1802 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1803 if (cur == NULL) {
1804 xmlGenericError(xmlGenericErrorContext,
1805 "xmlNewDoc : malloc failed\n");
1806 return(NULL);
1807 }
1808 memset(cur, 0, sizeof(xmlDoc));
1809
1810 cur->type = XML_HTML_DOCUMENT_NODE;
1811 cur->version = NULL;
1812 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001813 cur->doc = cur;
1814 cur->name = NULL;
1815 cur->children = NULL;
1816 cur->extSubset = NULL;
1817 cur->oldNs = NULL;
1818 cur->encoding = NULL;
1819 cur->standalone = 1;
1820 cur->compression = 0;
1821 cur->ids = NULL;
1822 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001823 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00001824 if ((ExternalID != NULL) ||
1825 (URI != NULL))
Daniel Veillard5151c062001-10-23 13:10:19 +00001826 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00001827 return(cur);
1828}
1829
1830/**
1831 * htmlNewDoc:
1832 * @URI: URI for the dtd, or NULL
1833 * @ExternalID: the external ID of the DTD, or NULL
1834 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001835 * Creates a new HTML document
1836 *
Owen Taylor3473f882001-02-23 17:55:21 +00001837 * Returns a new document
1838 */
1839htmlDocPtr
1840htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1841 if ((URI == NULL) && (ExternalID == NULL))
1842 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001843 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1844 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001845
1846 return(htmlNewDocNoDtD(URI, ExternalID));
1847}
1848
1849
1850/************************************************************************
1851 * *
1852 * The parser itself *
1853 * Relates to http://www.w3.org/TR/html40 *
1854 * *
1855 ************************************************************************/
1856
1857/************************************************************************
1858 * *
1859 * The parser itself *
1860 * *
1861 ************************************************************************/
1862
1863/**
1864 * htmlParseHTMLName:
1865 * @ctxt: an HTML parser context
1866 *
1867 * parse an HTML tag or attribute name, note that we convert it to lowercase
1868 * since HTML names are not case-sensitive.
1869 *
1870 * Returns the Tag Name parsed or NULL
1871 */
1872
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001873static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001874htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1875 xmlChar *ret = NULL;
1876 int i = 0;
1877 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1878
1879 if (!IS_LETTER(CUR) && (CUR != '_') &&
1880 (CUR != ':')) return(NULL);
1881
1882 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1883 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1884 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1885 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1886 else loc[i] = CUR;
1887 i++;
1888
1889 NEXT;
1890 }
1891
1892 ret = xmlStrndup(loc, i);
1893
1894 return(ret);
1895}
1896
1897/**
1898 * htmlParseName:
1899 * @ctxt: an HTML parser context
1900 *
1901 * parse an HTML name, this routine is case sensistive.
1902 *
1903 * Returns the Name parsed or NULL
1904 */
1905
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001906static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001907htmlParseName(htmlParserCtxtPtr ctxt) {
1908 xmlChar buf[HTML_MAX_NAMELEN];
1909 int len = 0;
1910
1911 GROW;
1912 if (!IS_LETTER(CUR) && (CUR != '_')) {
1913 return(NULL);
1914 }
1915
1916 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1917 (CUR == '.') || (CUR == '-') ||
1918 (CUR == '_') || (CUR == ':') ||
1919 (IS_COMBINING(CUR)) ||
1920 (IS_EXTENDER(CUR))) {
1921 buf[len++] = CUR;
1922 NEXT;
1923 if (len >= HTML_MAX_NAMELEN) {
1924 xmlGenericError(xmlGenericErrorContext,
1925 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1926 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1927 (CUR == '.') || (CUR == '-') ||
1928 (CUR == '_') || (CUR == ':') ||
1929 (IS_COMBINING(CUR)) ||
1930 (IS_EXTENDER(CUR)))
1931 NEXT;
1932 break;
1933 }
1934 }
1935 return(xmlStrndup(buf, len));
1936}
1937
1938/**
1939 * htmlParseHTMLAttribute:
1940 * @ctxt: an HTML parser context
1941 * @stop: a char stop value
1942 *
1943 * parse an HTML attribute value till the stop (quote), if
1944 * stop is 0 then it stops at the first space
1945 *
1946 * Returns the attribute parsed or NULL
1947 */
1948
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001949static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001950htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1951 xmlChar *buffer = NULL;
1952 int buffer_size = 0;
1953 xmlChar *out = NULL;
1954 xmlChar *name = NULL;
1955
1956 xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00001957 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001958
1959 /*
1960 * allocate a translation buffer.
1961 */
1962 buffer_size = HTML_PARSER_BUFFER_SIZE;
1963 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1964 if (buffer == NULL) {
1965 perror("htmlParseHTMLAttribute: malloc failed");
1966 return(NULL);
1967 }
1968 out = buffer;
1969
1970 /*
1971 * Ok loop until we reach one of the ending chars
1972 */
1973 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1974 if ((stop == 0) && (IS_BLANK(CUR))) break;
1975 if (CUR == '&') {
1976 if (NXT(1) == '#') {
1977 unsigned int c;
1978 int bits;
1979
1980 c = htmlParseCharRef(ctxt);
1981 if (c < 0x80)
1982 { *out++ = c; bits= -6; }
1983 else if (c < 0x800)
1984 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1985 else if (c < 0x10000)
1986 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1987 else
1988 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1989
1990 for ( ; bits >= 0; bits-= 6) {
1991 *out++ = ((c >> bits) & 0x3F) | 0x80;
1992 }
1993 } else {
1994 ent = htmlParseEntityRef(ctxt, &name);
1995 if (name == NULL) {
1996 *out++ = '&';
1997 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001998 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001999
2000 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002001 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002002 }
2003 } else if (ent == NULL) {
2004 *out++ = '&';
2005 cur = name;
2006 while (*cur != 0) {
2007 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002008 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002009
2010 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002011 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002012 }
2013 *out++ = *cur++;
2014 }
2015 xmlFree(name);
2016 } else {
2017 unsigned int c;
2018 int bits;
2019
2020 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002021 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002022
2023 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002024 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002025 }
2026 c = (xmlChar)ent->value;
2027 if (c < 0x80)
2028 { *out++ = c; bits= -6; }
2029 else if (c < 0x800)
2030 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2031 else if (c < 0x10000)
2032 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2033 else
2034 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2035
2036 for ( ; bits >= 0; bits-= 6) {
2037 *out++ = ((c >> bits) & 0x3F) | 0x80;
2038 }
2039 xmlFree(name);
2040 }
2041 }
2042 } else {
2043 unsigned int c;
2044 int bits, l;
2045
2046 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002047 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002048
2049 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002050 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002051 }
2052 c = CUR_CHAR(l);
2053 if (c < 0x80)
2054 { *out++ = c; bits= -6; }
2055 else if (c < 0x800)
2056 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2057 else if (c < 0x10000)
2058 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2059 else
2060 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2061
2062 for ( ; bits >= 0; bits-= 6) {
2063 *out++ = ((c >> bits) & 0x3F) | 0x80;
2064 }
2065 NEXT;
2066 }
2067 }
2068 *out++ = 0;
2069 return(buffer);
2070}
2071
2072/**
Owen Taylor3473f882001-02-23 17:55:21 +00002073 * htmlParseEntityRef:
2074 * @ctxt: an HTML parser context
2075 * @str: location to store the entity name
2076 *
2077 * parse an HTML ENTITY references
2078 *
2079 * [68] EntityRef ::= '&' Name ';'
2080 *
2081 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2082 * if non-NULL *str will have to be freed by the caller.
2083 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002084const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00002085htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2086 xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002087 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002088 *str = NULL;
2089
2090 if (CUR == '&') {
2091 NEXT;
2092 name = htmlParseName(ctxt);
2093 if (name == NULL) {
2094 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2095 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2096 ctxt->wellFormed = 0;
2097 } else {
2098 GROW;
2099 if (CUR == ';') {
2100 *str = name;
2101
2102 /*
2103 * Lookup the entity in the table.
2104 */
2105 ent = htmlEntityLookup(name);
2106 if (ent != NULL) /* OK that's ugly !!! */
2107 NEXT;
2108 } else {
2109 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2110 ctxt->sax->error(ctxt->userData,
2111 "htmlParseEntityRef: expecting ';'\n");
2112 *str = name;
2113 }
2114 }
2115 }
2116 return(ent);
2117}
2118
2119/**
2120 * htmlParseAttValue:
2121 * @ctxt: an HTML parser context
2122 *
2123 * parse a value for an attribute
2124 * Note: the parser won't do substitution of entities here, this
2125 * will be handled later in xmlStringGetNodeList, unless it was
2126 * asked for ctxt->replaceEntities != 0
2127 *
2128 * Returns the AttValue parsed or NULL.
2129 */
2130
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002131static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002132htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2133 xmlChar *ret = NULL;
2134
2135 if (CUR == '"') {
2136 NEXT;
2137 ret = htmlParseHTMLAttribute(ctxt, '"');
2138 if (CUR != '"') {
2139 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2140 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2141 ctxt->wellFormed = 0;
2142 } else
2143 NEXT;
2144 } else if (CUR == '\'') {
2145 NEXT;
2146 ret = htmlParseHTMLAttribute(ctxt, '\'');
2147 if (CUR != '\'') {
2148 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2149 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2150 ctxt->wellFormed = 0;
2151 } else
2152 NEXT;
2153 } else {
2154 /*
2155 * That's an HTMLism, the attribute value may not be quoted
2156 */
2157 ret = htmlParseHTMLAttribute(ctxt, 0);
2158 if (ret == NULL) {
2159 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2160 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2161 ctxt->wellFormed = 0;
2162 }
2163 }
2164 return(ret);
2165}
2166
2167/**
2168 * htmlParseSystemLiteral:
2169 * @ctxt: an HTML parser context
2170 *
2171 * parse an HTML Literal
2172 *
2173 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2174 *
2175 * Returns the SystemLiteral parsed or NULL
2176 */
2177
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002178static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002179htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2180 const xmlChar *q;
2181 xmlChar *ret = NULL;
2182
2183 if (CUR == '"') {
2184 NEXT;
2185 q = CUR_PTR;
2186 while ((IS_CHAR(CUR)) && (CUR != '"'))
2187 NEXT;
2188 if (!IS_CHAR(CUR)) {
2189 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2190 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2191 ctxt->wellFormed = 0;
2192 } else {
2193 ret = xmlStrndup(q, CUR_PTR - q);
2194 NEXT;
2195 }
2196 } else if (CUR == '\'') {
2197 NEXT;
2198 q = CUR_PTR;
2199 while ((IS_CHAR(CUR)) && (CUR != '\''))
2200 NEXT;
2201 if (!IS_CHAR(CUR)) {
2202 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2203 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2204 ctxt->wellFormed = 0;
2205 } else {
2206 ret = xmlStrndup(q, CUR_PTR - q);
2207 NEXT;
2208 }
2209 } else {
2210 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2211 ctxt->sax->error(ctxt->userData,
2212 "SystemLiteral \" or ' expected\n");
2213 ctxt->wellFormed = 0;
2214 }
2215
2216 return(ret);
2217}
2218
2219/**
2220 * htmlParsePubidLiteral:
2221 * @ctxt: an HTML parser context
2222 *
2223 * parse an HTML public literal
2224 *
2225 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2226 *
2227 * Returns the PubidLiteral parsed or NULL.
2228 */
2229
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002230static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002231htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2232 const xmlChar *q;
2233 xmlChar *ret = NULL;
2234 /*
2235 * Name ::= (Letter | '_') (NameChar)*
2236 */
2237 if (CUR == '"') {
2238 NEXT;
2239 q = CUR_PTR;
2240 while (IS_PUBIDCHAR(CUR)) NEXT;
2241 if (CUR != '"') {
2242 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2243 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2244 ctxt->wellFormed = 0;
2245 } else {
2246 ret = xmlStrndup(q, CUR_PTR - q);
2247 NEXT;
2248 }
2249 } else if (CUR == '\'') {
2250 NEXT;
2251 q = CUR_PTR;
2252 while ((IS_LETTER(CUR)) && (CUR != '\''))
2253 NEXT;
2254 if (!IS_LETTER(CUR)) {
2255 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2256 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2257 ctxt->wellFormed = 0;
2258 } else {
2259 ret = xmlStrndup(q, CUR_PTR - q);
2260 NEXT;
2261 }
2262 } else {
2263 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2264 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2265 ctxt->wellFormed = 0;
2266 }
2267
2268 return(ret);
2269}
2270
2271/**
2272 * htmlParseScript:
2273 * @ctxt: an HTML parser context
2274 *
2275 * parse the content of an HTML SCRIPT or STYLE element
2276 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2277 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2278 * http://www.w3.org/TR/html4/types.html#type-script
2279 * http://www.w3.org/TR/html4/types.html#h-6.15
2280 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2281 *
2282 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2283 * element and the value of intrinsic event attributes. User agents must
2284 * not evaluate script data as HTML markup but instead must pass it on as
2285 * data to a script engine.
2286 * NOTES:
2287 * - The content is passed like CDATA
2288 * - the attributes for style and scripting "onXXX" are also described
2289 * as CDATA but SGML allows entities references in attributes so their
2290 * processing is identical as other attributes
2291 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002292static void
Owen Taylor3473f882001-02-23 17:55:21 +00002293htmlParseScript(htmlParserCtxtPtr ctxt) {
2294 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2295 int nbchar = 0;
2296 xmlChar cur;
2297
2298 SHRINK;
2299 cur = CUR;
2300 while (IS_CHAR(cur)) {
2301 if ((cur == '<') && (NXT(1) == '/')) {
2302 /*
2303 * One should break here, the specification is clear:
2304 * Authors should therefore escape "</" within the content.
2305 * Escape mechanisms are specific to each scripting or
2306 * style sheet language.
2307 */
2308 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2309 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2310 break; /* while */
2311 }
2312 buf[nbchar++] = cur;
2313 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2314 if (ctxt->sax->cdataBlock!= NULL) {
2315 /*
2316 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2317 */
2318 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2319 }
2320 nbchar = 0;
2321 }
2322 NEXT;
2323 cur = CUR;
2324 }
2325 if (!(IS_CHAR(cur))) {
2326 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2327 ctxt->sax->error(ctxt->userData,
2328 "Invalid char in CDATA 0x%X\n", cur);
2329 ctxt->wellFormed = 0;
2330 NEXT;
2331 }
2332
2333 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2334 if (ctxt->sax->cdataBlock!= NULL) {
2335 /*
2336 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2337 */
2338 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2339 }
2340 }
2341}
2342
2343
2344/**
2345 * htmlParseCharData:
2346 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002347 *
2348 * parse a CharData section.
2349 * if we are within a CDATA section ']]>' marks an end of section.
2350 *
2351 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2352 */
2353
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002354static void
2355htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002356 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2357 int nbchar = 0;
2358 int cur, l;
2359
2360 SHRINK;
2361 cur = CUR_CHAR(l);
2362 while (((cur != '<') || (ctxt->token == '<')) &&
2363 ((cur != '&') || (ctxt->token == '&')) &&
2364 (IS_CHAR(cur))) {
2365 COPY_BUF(l,buf,nbchar,cur);
2366 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2367 /*
2368 * Ok the segment is to be consumed as chars.
2369 */
2370 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2371 if (areBlanks(ctxt, buf, nbchar)) {
2372 if (ctxt->sax->ignorableWhitespace != NULL)
2373 ctxt->sax->ignorableWhitespace(ctxt->userData,
2374 buf, nbchar);
2375 } else {
2376 htmlCheckParagraph(ctxt);
2377 if (ctxt->sax->characters != NULL)
2378 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2379 }
2380 }
2381 nbchar = 0;
2382 }
2383 NEXTL(l);
2384 cur = CUR_CHAR(l);
2385 }
2386 if (nbchar != 0) {
2387 /*
2388 * Ok the segment is to be consumed as chars.
2389 */
2390 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2391 if (areBlanks(ctxt, buf, nbchar)) {
2392 if (ctxt->sax->ignorableWhitespace != NULL)
2393 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2394 } else {
2395 htmlCheckParagraph(ctxt);
2396 if (ctxt->sax->characters != NULL)
2397 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2398 }
2399 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002400 } else {
2401 /*
2402 * Loop detection
2403 */
2404 if (cur == 0)
2405 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002406 }
2407}
2408
2409/**
2410 * htmlParseExternalID:
2411 * @ctxt: an HTML parser context
2412 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002413 *
2414 * Parse an External ID or a Public ID
2415 *
Owen Taylor3473f882001-02-23 17:55:21 +00002416 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2417 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2418 *
2419 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2420 *
2421 * Returns the function returns SystemLiteral and in the second
2422 * case publicID receives PubidLiteral, is strict is off
2423 * it is possible to return NULL and have publicID set.
2424 */
2425
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002426static xmlChar *
2427htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002428 xmlChar *URI = NULL;
2429
2430 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2431 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2432 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2433 SKIP(6);
2434 if (!IS_BLANK(CUR)) {
2435 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2436 ctxt->sax->error(ctxt->userData,
2437 "Space required after 'SYSTEM'\n");
2438 ctxt->wellFormed = 0;
2439 }
2440 SKIP_BLANKS;
2441 URI = htmlParseSystemLiteral(ctxt);
2442 if (URI == NULL) {
2443 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2444 ctxt->sax->error(ctxt->userData,
2445 "htmlParseExternalID: SYSTEM, no URI\n");
2446 ctxt->wellFormed = 0;
2447 }
2448 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2449 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2450 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2451 SKIP(6);
2452 if (!IS_BLANK(CUR)) {
2453 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2454 ctxt->sax->error(ctxt->userData,
2455 "Space required after 'PUBLIC'\n");
2456 ctxt->wellFormed = 0;
2457 }
2458 SKIP_BLANKS;
2459 *publicID = htmlParsePubidLiteral(ctxt);
2460 if (*publicID == NULL) {
2461 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2462 ctxt->sax->error(ctxt->userData,
2463 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2464 ctxt->wellFormed = 0;
2465 }
2466 SKIP_BLANKS;
2467 if ((CUR == '"') || (CUR == '\'')) {
2468 URI = htmlParseSystemLiteral(ctxt);
2469 }
2470 }
2471 return(URI);
2472}
2473
2474/**
2475 * htmlParseComment:
2476 * @ctxt: an HTML parser context
2477 *
2478 * Parse an XML (SGML) comment <!-- .... -->
2479 *
2480 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2481 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002482static void
Owen Taylor3473f882001-02-23 17:55:21 +00002483htmlParseComment(htmlParserCtxtPtr ctxt) {
2484 xmlChar *buf = NULL;
2485 int len;
2486 int size = HTML_PARSER_BUFFER_SIZE;
2487 int q, ql;
2488 int r, rl;
2489 int cur, l;
2490 xmlParserInputState state;
2491
2492 /*
2493 * Check that there is a comment right here.
2494 */
2495 if ((RAW != '<') || (NXT(1) != '!') ||
2496 (NXT(2) != '-') || (NXT(3) != '-')) return;
2497
2498 state = ctxt->instate;
2499 ctxt->instate = XML_PARSER_COMMENT;
2500 SHRINK;
2501 SKIP(4);
2502 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2503 if (buf == NULL) {
2504 xmlGenericError(xmlGenericErrorContext,
2505 "malloc of %d byte failed\n", size);
2506 ctxt->instate = state;
2507 return;
2508 }
2509 q = CUR_CHAR(ql);
2510 NEXTL(ql);
2511 r = CUR_CHAR(rl);
2512 NEXTL(rl);
2513 cur = CUR_CHAR(l);
2514 len = 0;
2515 while (IS_CHAR(cur) &&
2516 ((cur != '>') ||
2517 (r != '-') || (q != '-'))) {
2518 if (len + 5 >= size) {
2519 size *= 2;
2520 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2521 if (buf == NULL) {
2522 xmlGenericError(xmlGenericErrorContext,
2523 "realloc of %d byte failed\n", size);
2524 ctxt->instate = state;
2525 return;
2526 }
2527 }
2528 COPY_BUF(ql,buf,len,q);
2529 q = r;
2530 ql = rl;
2531 r = cur;
2532 rl = l;
2533 NEXTL(l);
2534 cur = CUR_CHAR(l);
2535 if (cur == 0) {
2536 SHRINK;
2537 GROW;
2538 cur = CUR_CHAR(l);
2539 }
2540 }
2541 buf[len] = 0;
2542 if (!IS_CHAR(cur)) {
2543 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2544 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2545 ctxt->sax->error(ctxt->userData,
2546 "Comment not terminated \n<!--%.50s\n", buf);
2547 ctxt->wellFormed = 0;
2548 xmlFree(buf);
2549 } else {
2550 NEXT;
2551 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2552 (!ctxt->disableSAX))
2553 ctxt->sax->comment(ctxt->userData, buf);
2554 xmlFree(buf);
2555 }
2556 ctxt->instate = state;
2557}
2558
2559/**
2560 * htmlParseCharRef:
2561 * @ctxt: an HTML parser context
2562 *
2563 * parse Reference declarations
2564 *
2565 * [66] CharRef ::= '&#' [0-9]+ ';' |
2566 * '&#x' [0-9a-fA-F]+ ';'
2567 *
2568 * Returns the value parsed (as an int)
2569 */
2570int
2571htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2572 int val = 0;
2573
2574 if ((CUR == '&') && (NXT(1) == '#') &&
2575 (NXT(2) == 'x')) {
2576 SKIP(3);
2577 while (CUR != ';') {
2578 if ((CUR >= '0') && (CUR <= '9'))
2579 val = val * 16 + (CUR - '0');
2580 else if ((CUR >= 'a') && (CUR <= 'f'))
2581 val = val * 16 + (CUR - 'a') + 10;
2582 else if ((CUR >= 'A') && (CUR <= 'F'))
2583 val = val * 16 + (CUR - 'A') + 10;
2584 else {
2585 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2586 ctxt->sax->error(ctxt->userData,
2587 "htmlParseCharRef: invalid hexadecimal value\n");
2588 ctxt->wellFormed = 0;
2589 return(0);
2590 }
2591 NEXT;
2592 }
2593 if (CUR == ';')
2594 NEXT;
2595 } else if ((CUR == '&') && (NXT(1) == '#')) {
2596 SKIP(2);
2597 while (CUR != ';') {
2598 if ((CUR >= '0') && (CUR <= '9'))
2599 val = val * 10 + (CUR - '0');
2600 else {
2601 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2602 ctxt->sax->error(ctxt->userData,
2603 "htmlParseCharRef: invalid decimal value\n");
2604 ctxt->wellFormed = 0;
2605 return(0);
2606 }
2607 NEXT;
2608 }
2609 if (CUR == ';')
2610 NEXT;
2611 } else {
2612 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2613 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2614 ctxt->wellFormed = 0;
2615 }
2616 /*
2617 * Check the value IS_CHAR ...
2618 */
2619 if (IS_CHAR(val)) {
2620 return(val);
2621 } else {
2622 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2623 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2624 val);
2625 ctxt->wellFormed = 0;
2626 }
2627 return(0);
2628}
2629
2630
2631/**
2632 * htmlParseDocTypeDecl :
2633 * @ctxt: an HTML parser context
2634 *
2635 * parse a DOCTYPE declaration
2636 *
2637 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2638 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2639 */
2640
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002641static void
Owen Taylor3473f882001-02-23 17:55:21 +00002642htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2643 xmlChar *name;
2644 xmlChar *ExternalID = NULL;
2645 xmlChar *URI = NULL;
2646
2647 /*
2648 * We know that '<!DOCTYPE' has been detected.
2649 */
2650 SKIP(9);
2651
2652 SKIP_BLANKS;
2653
2654 /*
2655 * Parse the DOCTYPE name.
2656 */
2657 name = htmlParseName(ctxt);
2658 if (name == NULL) {
2659 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2660 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2661 ctxt->wellFormed = 0;
2662 }
2663 /*
2664 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2665 */
2666
2667 SKIP_BLANKS;
2668
2669 /*
2670 * Check for SystemID and ExternalID
2671 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002672 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002673 SKIP_BLANKS;
2674
2675 /*
2676 * We should be at the end of the DOCTYPE declaration.
2677 */
2678 if (CUR != '>') {
2679 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardf6ed8bc2001-10-02 09:22:47 +00002680 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002681 ctxt->wellFormed = 0;
2682 /* We shouldn't try to resynchronize ... */
2683 }
2684 NEXT;
2685
2686 /*
2687 * Create or update the document accordingly to the DOCTYPE
2688 */
2689 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2690 (!ctxt->disableSAX))
2691 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2692
2693 /*
2694 * Cleanup, since we don't use all those identifiers
2695 */
2696 if (URI != NULL) xmlFree(URI);
2697 if (ExternalID != NULL) xmlFree(ExternalID);
2698 if (name != NULL) xmlFree(name);
2699}
2700
2701/**
2702 * htmlParseAttribute:
2703 * @ctxt: an HTML parser context
2704 * @value: a xmlChar ** used to store the value of the attribute
2705 *
2706 * parse an attribute
2707 *
2708 * [41] Attribute ::= Name Eq AttValue
2709 *
2710 * [25] Eq ::= S? '=' S?
2711 *
2712 * With namespace:
2713 *
2714 * [NS 11] Attribute ::= QName Eq AttValue
2715 *
2716 * Also the case QName == xmlns:??? is handled independently as a namespace
2717 * definition.
2718 *
2719 * Returns the attribute name, and the value in *value.
2720 */
2721
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002722static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002723htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2724 xmlChar *name, *val = NULL;
2725
2726 *value = NULL;
2727 name = htmlParseHTMLName(ctxt);
2728 if (name == NULL) {
2729 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2730 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2731 ctxt->wellFormed = 0;
2732 return(NULL);
2733 }
2734
2735 /*
2736 * read the value
2737 */
2738 SKIP_BLANKS;
2739 if (CUR == '=') {
2740 NEXT;
2741 SKIP_BLANKS;
2742 val = htmlParseAttValue(ctxt);
2743 /******
2744 } else {
2745 * TODO : some attribute must have values, some may not
2746 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2747 ctxt->sax->warning(ctxt->userData,
2748 "No value for attribute %s\n", name); */
2749 }
2750
2751 *value = val;
2752 return(name);
2753}
2754
2755/**
2756 * htmlCheckEncoding:
2757 * @ctxt: an HTML parser context
2758 * @attvalue: the attribute value
2759 *
2760 * Checks an http-equiv attribute from a Meta tag to detect
2761 * the encoding
2762 * If a new encoding is detected the parser is switched to decode
2763 * it and pass UTF8
2764 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002765static void
Owen Taylor3473f882001-02-23 17:55:21 +00002766htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2767 const xmlChar *encoding;
2768
2769 if ((ctxt == NULL) || (attvalue == NULL))
2770 return;
2771
2772 /* do not change encoding */
2773 if (ctxt->input->encoding != NULL)
2774 return;
2775
2776 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2777 if (encoding != NULL) {
2778 encoding += 8;
2779 } else {
2780 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2781 if (encoding != NULL)
2782 encoding += 9;
2783 }
2784 if (encoding != NULL) {
2785 xmlCharEncoding enc;
2786 xmlCharEncodingHandlerPtr handler;
2787
2788 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2789
2790 if (ctxt->input->encoding != NULL)
2791 xmlFree((xmlChar *) ctxt->input->encoding);
2792 ctxt->input->encoding = xmlStrdup(encoding);
2793
2794 enc = xmlParseCharEncoding((const char *) encoding);
2795 /*
2796 * registered set of known encodings
2797 */
2798 if (enc != XML_CHAR_ENCODING_ERROR) {
2799 xmlSwitchEncoding(ctxt, enc);
2800 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2801 } else {
2802 /*
2803 * fallback for unknown encodings
2804 */
2805 handler = xmlFindCharEncodingHandler((const char *) encoding);
2806 if (handler != NULL) {
2807 xmlSwitchToEncoding(ctxt, handler);
2808 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2809 } else {
2810 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2811 }
2812 }
2813
2814 if ((ctxt->input->buf != NULL) &&
2815 (ctxt->input->buf->encoder != NULL) &&
2816 (ctxt->input->buf->raw != NULL) &&
2817 (ctxt->input->buf->buffer != NULL)) {
2818 int nbchars;
2819 int processed;
2820
2821 /*
2822 * convert as much as possible to the parser reading buffer.
2823 */
2824 processed = ctxt->input->cur - ctxt->input->base;
2825 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2826 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2827 ctxt->input->buf->buffer,
2828 ctxt->input->buf->raw);
2829 if (nbchars < 0) {
2830 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2831 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2832 ctxt->sax->error(ctxt->userData,
2833 "htmlCheckEncoding: encoder error\n");
2834 }
2835 ctxt->input->base =
2836 ctxt->input->cur = ctxt->input->buf->buffer->content;
2837 }
2838 }
2839}
2840
2841/**
2842 * htmlCheckMeta:
2843 * @ctxt: an HTML parser context
2844 * @atts: the attributes values
2845 *
2846 * Checks an attributes from a Meta tag
2847 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002848static void
Owen Taylor3473f882001-02-23 17:55:21 +00002849htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2850 int i;
2851 const xmlChar *att, *value;
2852 int http = 0;
2853 const xmlChar *content = NULL;
2854
2855 if ((ctxt == NULL) || (atts == NULL))
2856 return;
2857
2858 i = 0;
2859 att = atts[i++];
2860 while (att != NULL) {
2861 value = atts[i++];
2862 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2863 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2864 http = 1;
2865 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2866 content = value;
2867 att = atts[i++];
2868 }
2869 if ((http) && (content != NULL))
2870 htmlCheckEncoding(ctxt, content);
2871
2872}
2873
2874/**
2875 * htmlParseStartTag:
2876 * @ctxt: an HTML parser context
2877 *
2878 * parse a start of tag either for rule element or
2879 * EmptyElement. In both case we don't parse the tag closing chars.
2880 *
2881 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2882 *
2883 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2884 *
2885 * With namespace:
2886 *
2887 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2888 *
2889 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2890 *
2891 */
2892
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002893static void
Owen Taylor3473f882001-02-23 17:55:21 +00002894htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2895 xmlChar *name;
2896 xmlChar *attname;
2897 xmlChar *attvalue;
2898 const xmlChar **atts = NULL;
2899 int nbatts = 0;
2900 int maxatts = 0;
2901 int meta = 0;
2902 int i;
2903
2904 if (CUR != '<') return;
2905 NEXT;
2906
2907 GROW;
2908 name = htmlParseHTMLName(ctxt);
2909 if (name == NULL) {
2910 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2911 ctxt->sax->error(ctxt->userData,
2912 "htmlParseStartTag: invalid element name\n");
2913 ctxt->wellFormed = 0;
2914 /* Dump the bogus tag like browsers do */
2915 while ((IS_CHAR(CUR)) && (CUR != '>'))
2916 NEXT;
2917 return;
2918 }
2919 if (xmlStrEqual(name, BAD_CAST"meta"))
2920 meta = 1;
2921
2922 /*
2923 * Check for auto-closure of HTML elements.
2924 */
2925 htmlAutoClose(ctxt, name);
2926
2927 /*
2928 * Check for implied HTML elements.
2929 */
2930 htmlCheckImplied(ctxt, name);
2931
2932 /*
2933 * Avoid html at any level > 0, head at any level != 1
2934 * or any attempt to recurse body
2935 */
2936 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2937 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2938 ctxt->sax->error(ctxt->userData,
2939 "htmlParseStartTag: misplaced <html> tag\n");
2940 ctxt->wellFormed = 0;
2941 xmlFree(name);
2942 return;
2943 }
2944 if ((ctxt->nameNr != 1) &&
2945 (xmlStrEqual(name, BAD_CAST"head"))) {
2946 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2947 ctxt->sax->error(ctxt->userData,
2948 "htmlParseStartTag: misplaced <head> tag\n");
2949 ctxt->wellFormed = 0;
2950 xmlFree(name);
2951 return;
2952 }
2953 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002954 int indx;
2955 for (indx = 0;indx < ctxt->nameNr;indx++) {
2956 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002957 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2958 ctxt->sax->error(ctxt->userData,
2959 "htmlParseStartTag: misplaced <body> tag\n");
2960 ctxt->wellFormed = 0;
2961 xmlFree(name);
2962 return;
2963 }
2964 }
2965 }
2966
2967 /*
2968 * Now parse the attributes, it ends up with the ending
2969 *
2970 * (S Attribute)* S?
2971 */
2972 SKIP_BLANKS;
2973 while ((IS_CHAR(CUR)) &&
2974 (CUR != '>') &&
2975 ((CUR != '/') || (NXT(1) != '>'))) {
2976 long cons = ctxt->nbChars;
2977
2978 GROW;
2979 attname = htmlParseAttribute(ctxt, &attvalue);
2980 if (attname != NULL) {
2981
2982 /*
2983 * Well formedness requires at most one declaration of an attribute
2984 */
2985 for (i = 0; i < nbatts;i += 2) {
2986 if (xmlStrEqual(atts[i], attname)) {
2987 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2988 ctxt->sax->error(ctxt->userData,
2989 "Attribute %s redefined\n",
2990 attname);
2991 ctxt->wellFormed = 0;
2992 xmlFree(attname);
2993 if (attvalue != NULL)
2994 xmlFree(attvalue);
2995 goto failed;
2996 }
2997 }
2998
2999 /*
3000 * Add the pair to atts
3001 */
3002 if (atts == NULL) {
3003 maxatts = 10;
3004 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3005 if (atts == NULL) {
3006 xmlGenericError(xmlGenericErrorContext,
3007 "malloc of %ld byte failed\n",
3008 maxatts * (long)sizeof(xmlChar *));
3009 if (name != NULL) xmlFree(name);
3010 return;
3011 }
3012 } else if (nbatts + 4 > maxatts) {
3013 maxatts *= 2;
3014 atts = (const xmlChar **) xmlRealloc((void *) atts,
3015 maxatts * sizeof(xmlChar *));
3016 if (atts == NULL) {
3017 xmlGenericError(xmlGenericErrorContext,
3018 "realloc of %ld byte failed\n",
3019 maxatts * (long)sizeof(xmlChar *));
3020 if (name != NULL) xmlFree(name);
3021 return;
3022 }
3023 }
3024 atts[nbatts++] = attname;
3025 atts[nbatts++] = attvalue;
3026 atts[nbatts] = NULL;
3027 atts[nbatts + 1] = NULL;
3028 }
3029 else {
3030 /* Dump the bogus attribute string up to the next blank or
3031 * the end of the tag. */
3032 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3033 && ((CUR != '/') || (NXT(1) != '>')))
3034 NEXT;
3035 }
3036
3037failed:
3038 SKIP_BLANKS;
3039 if (cons == ctxt->nbChars) {
3040 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3041 ctxt->sax->error(ctxt->userData,
3042 "htmlParseStartTag: problem parsing attributes\n");
3043 ctxt->wellFormed = 0;
3044 break;
3045 }
3046 }
3047
3048 /*
3049 * Handle specific association to the META tag
3050 */
3051 if (meta)
3052 htmlCheckMeta(ctxt, atts);
3053
3054 /*
3055 * SAX: Start of Element !
3056 */
3057 htmlnamePush(ctxt, xmlStrdup(name));
3058#ifdef DEBUG
3059 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3060#endif
3061 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3062 ctxt->sax->startElement(ctxt->userData, name, atts);
3063
3064 if (atts != NULL) {
3065 for (i = 0;i < nbatts;i++) {
3066 if (atts[i] != NULL)
3067 xmlFree((xmlChar *) atts[i]);
3068 }
3069 xmlFree((void *) atts);
3070 }
3071 if (name != NULL) xmlFree(name);
3072}
3073
3074/**
3075 * htmlParseEndTag:
3076 * @ctxt: an HTML parser context
3077 *
3078 * parse an end of tag
3079 *
3080 * [42] ETag ::= '</' Name S? '>'
3081 *
3082 * With namespace
3083 *
3084 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003085 *
3086 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003087 */
3088
Daniel Veillardf420ac52001-07-04 16:04:09 +00003089static int
Owen Taylor3473f882001-02-23 17:55:21 +00003090htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3091 xmlChar *name;
3092 xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003093 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003094
3095 if ((CUR != '<') || (NXT(1) != '/')) {
3096 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3097 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3098 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003099 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003100 }
3101 SKIP(2);
3102
3103 name = htmlParseHTMLName(ctxt);
Daniel Veillardf420ac52001-07-04 16:04:09 +00003104 if (name == NULL) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003105
3106 /*
3107 * We should definitely be at the ending "S? '>'" part
3108 */
3109 SKIP_BLANKS;
3110 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3111 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3112 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3113 ctxt->wellFormed = 0;
3114 } else
3115 NEXT;
3116
3117 /*
3118 * If the name read is not one of the element in the parsing stack
3119 * then return, it's just an error.
3120 */
3121 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3122 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3123 }
3124 if (i < 0) {
3125 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3126 ctxt->sax->error(ctxt->userData,
3127 "Unexpected end tag : %s\n", name);
3128 xmlFree(name);
3129 ctxt->wellFormed = 0;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003130 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003131 }
3132
3133
3134 /*
3135 * Check for auto-closure of HTML elements.
3136 */
3137
3138 htmlAutoCloseOnClose(ctxt, name);
3139
3140 /*
3141 * Well formedness constraints, opening and closing must match.
3142 * With the exception that the autoclose may have popped stuff out
3143 * of the stack.
3144 */
3145 if (!xmlStrEqual(name, ctxt->name)) {
3146#ifdef DEBUG
3147 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3148#endif
3149 if ((ctxt->name != NULL) &&
3150 (!xmlStrEqual(ctxt->name, name))) {
3151 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3152 ctxt->sax->error(ctxt->userData,
3153 "Opening and ending tag mismatch: %s and %s\n",
3154 name, ctxt->name);
3155 ctxt->wellFormed = 0;
3156 }
3157 }
3158
3159 /*
3160 * SAX: End of Tag
3161 */
3162 oldname = ctxt->name;
3163 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3164 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3165 ctxt->sax->endElement(ctxt->userData, name);
3166 oldname = htmlnamePop(ctxt);
3167 if (oldname != NULL) {
3168#ifdef DEBUG
3169 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3170#endif
3171 xmlFree(oldname);
3172#ifdef DEBUG
3173 } else {
3174 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3175#endif
3176 }
Daniel Veillardf420ac52001-07-04 16:04:09 +00003177 ret = 1;
3178 } else {
3179 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003180 }
3181
3182 if (name != NULL)
3183 xmlFree(name);
3184
Daniel Veillardf420ac52001-07-04 16:04:09 +00003185 return(ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003186}
3187
3188
3189/**
3190 * htmlParseReference:
3191 * @ctxt: an HTML parser context
3192 *
3193 * parse and handle entity references in content,
3194 * this will end-up in a call to character() since this is either a
3195 * CharRef, or a predefined entity.
3196 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003197static void
Owen Taylor3473f882001-02-23 17:55:21 +00003198htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003199 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003200 xmlChar out[6];
3201 xmlChar *name;
3202 if (CUR != '&') return;
3203
3204 if (NXT(1) == '#') {
3205 unsigned int c;
3206 int bits, i = 0;
3207
3208 c = htmlParseCharRef(ctxt);
3209 if (c == 0)
3210 return;
3211
3212 if (c < 0x80) { out[i++]= c; bits= -6; }
3213 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3214 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3215 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3216
3217 for ( ; bits >= 0; bits-= 6) {
3218 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3219 }
3220 out[i] = 0;
3221
3222 htmlCheckParagraph(ctxt);
3223 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3224 ctxt->sax->characters(ctxt->userData, out, i);
3225 } else {
3226 ent = htmlParseEntityRef(ctxt, &name);
3227 if (name == NULL) {
3228 htmlCheckParagraph(ctxt);
3229 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3230 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3231 return;
3232 }
3233 if ((ent == NULL) || (ent->value <= 0)) {
3234 htmlCheckParagraph(ctxt);
3235 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3236 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3237 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3238 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3239 }
3240 } else {
3241 unsigned int c;
3242 int bits, i = 0;
3243
3244 c = ent->value;
3245 if (c < 0x80)
3246 { out[i++]= c; bits= -6; }
3247 else if (c < 0x800)
3248 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3249 else if (c < 0x10000)
3250 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3251 else
3252 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3253
3254 for ( ; bits >= 0; bits-= 6) {
3255 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3256 }
3257 out[i] = 0;
3258
3259 htmlCheckParagraph(ctxt);
3260 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3261 ctxt->sax->characters(ctxt->userData, out, i);
3262 }
3263 xmlFree(name);
3264 }
3265}
3266
3267/**
3268 * htmlParseContent:
3269 * @ctxt: an HTML parser context
3270 * @name: the node name
3271 *
3272 * Parse a content: comment, sub-element, reference or text.
3273 *
3274 */
3275
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003276static void
Owen Taylor3473f882001-02-23 17:55:21 +00003277htmlParseContent(htmlParserCtxtPtr ctxt) {
3278 xmlChar *currentNode;
3279 int depth;
3280
3281 currentNode = xmlStrdup(ctxt->name);
3282 depth = ctxt->nameNr;
3283 while (1) {
3284 long cons = ctxt->nbChars;
3285
3286 GROW;
3287 /*
3288 * Our tag or one of it's parent or children is ending.
3289 */
3290 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003291 if (htmlParseEndTag(ctxt) &&
3292 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3293 if (currentNode != NULL)
3294 xmlFree(currentNode);
3295 return;
3296 }
3297 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003298 }
3299
3300 /*
3301 * Has this node been popped out during parsing of
3302 * the next element
3303 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003304 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3305 (!xmlStrEqual(currentNode, ctxt->name)))
3306 {
Owen Taylor3473f882001-02-23 17:55:21 +00003307 if (currentNode != NULL) xmlFree(currentNode);
3308 return;
3309 }
3310
Daniel Veillardf9533d12001-03-03 10:04:57 +00003311 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3312 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003313 /*
3314 * Handle SCRIPT/STYLE separately
3315 */
3316 htmlParseScript(ctxt);
3317 } else {
3318 /*
3319 * Sometimes DOCTYPE arrives in the middle of the document
3320 */
3321 if ((CUR == '<') && (NXT(1) == '!') &&
3322 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3323 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3324 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3325 (UPP(8) == 'E')) {
3326 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3327 ctxt->sax->error(ctxt->userData,
3328 "Misplaced DOCTYPE declaration\n");
3329 ctxt->wellFormed = 0;
3330 htmlParseDocTypeDecl(ctxt);
3331 }
3332
3333 /*
3334 * First case : a comment
3335 */
3336 if ((CUR == '<') && (NXT(1) == '!') &&
3337 (NXT(2) == '-') && (NXT(3) == '-')) {
3338 htmlParseComment(ctxt);
3339 }
3340
3341 /*
3342 * Second case : a sub-element.
3343 */
3344 else if (CUR == '<') {
3345 htmlParseElement(ctxt);
3346 }
3347
3348 /*
3349 * Third case : a reference. If if has not been resolved,
3350 * parsing returns it's Name, create the node
3351 */
3352 else if (CUR == '&') {
3353 htmlParseReference(ctxt);
3354 }
3355
3356 /*
3357 * Fourth : end of the resource
3358 */
3359 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003360 htmlAutoCloseOnEnd(ctxt);
3361 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003362 }
3363
3364 /*
3365 * Last case, text. Note that References are handled directly.
3366 */
3367 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003368 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003369 }
3370
3371 if (cons == ctxt->nbChars) {
3372 if (ctxt->node != NULL) {
3373 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3374 ctxt->sax->error(ctxt->userData,
3375 "detected an error in element content\n");
3376 ctxt->wellFormed = 0;
3377 }
3378 break;
3379 }
3380 }
3381 GROW;
3382 }
3383 if (currentNode != NULL) xmlFree(currentNode);
3384}
3385
3386/**
3387 * htmlParseElement:
3388 * @ctxt: an HTML parser context
3389 *
3390 * parse an HTML element, this is highly recursive
3391 *
3392 * [39] element ::= EmptyElemTag | STag content ETag
3393 *
3394 * [41] Attribute ::= Name Eq AttValue
3395 */
3396
3397void
3398htmlParseElement(htmlParserCtxtPtr ctxt) {
3399 xmlChar *name;
3400 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003401 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003402 htmlParserNodeInfo node_info;
3403 xmlChar *oldname;
3404 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003405 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003406
3407 /* Capture start position */
3408 if (ctxt->record_info) {
3409 node_info.begin_pos = ctxt->input->consumed +
3410 (CUR_PTR - ctxt->input->base);
3411 node_info.begin_line = ctxt->input->line;
3412 }
3413
3414 oldname = xmlStrdup(ctxt->name);
3415 htmlParseStartTag(ctxt);
3416 name = ctxt->name;
3417#ifdef DEBUG
3418 if (oldname == NULL)
3419 xmlGenericError(xmlGenericErrorContext,
3420 "Start of element %s\n", name);
3421 else if (name == NULL)
3422 xmlGenericError(xmlGenericErrorContext,
3423 "Start of element failed, was %s\n", oldname);
3424 else
3425 xmlGenericError(xmlGenericErrorContext,
3426 "Start of element %s, was %s\n", name, oldname);
3427#endif
3428 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3429 (name == NULL)) {
3430 if (CUR == '>')
3431 NEXT;
3432 if (oldname != NULL)
3433 xmlFree(oldname);
3434 return;
3435 }
3436 if (oldname != NULL)
3437 xmlFree(oldname);
3438
3439 /*
3440 * Lookup the info for that element.
3441 */
3442 info = htmlTagLookup(name);
3443 if (info == NULL) {
3444 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3445 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3446 name);
3447 ctxt->wellFormed = 0;
3448 } else if (info->depr) {
3449/***************************
3450 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3451 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3452 name);
3453 ***************************/
3454 }
3455
3456 /*
3457 * Check for an Empty Element labelled the XML/SGML way
3458 */
3459 if ((CUR == '/') && (NXT(1) == '>')) {
3460 SKIP(2);
3461 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3462 ctxt->sax->endElement(ctxt->userData, name);
3463 oldname = htmlnamePop(ctxt);
3464#ifdef DEBUG
3465 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3466#endif
3467 if (oldname != NULL)
3468 xmlFree(oldname);
3469 return;
3470 }
3471
3472 if (CUR == '>') {
3473 NEXT;
3474 } else {
3475 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3476 ctxt->sax->error(ctxt->userData,
3477 "Couldn't find end of Start Tag %s\n",
3478 name);
3479 ctxt->wellFormed = 0;
3480
3481 /*
3482 * end of parsing of this node.
3483 */
3484 if (xmlStrEqual(name, ctxt->name)) {
3485 nodePop(ctxt);
3486 oldname = htmlnamePop(ctxt);
3487#ifdef DEBUG
3488 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3489#endif
3490 if (oldname != NULL)
3491 xmlFree(oldname);
3492 }
3493
3494 /*
3495 * Capture end position and add node
3496 */
3497 if ( currentNode != NULL && ctxt->record_info ) {
3498 node_info.end_pos = ctxt->input->consumed +
3499 (CUR_PTR - ctxt->input->base);
3500 node_info.end_line = ctxt->input->line;
3501 node_info.node = ctxt->node;
3502 xmlParserAddNodeInfo(ctxt, &node_info);
3503 }
3504 return;
3505 }
3506
3507 /*
3508 * Check for an Empty Element from DTD definition
3509 */
3510 if ((info != NULL) && (info->empty)) {
3511 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3512 ctxt->sax->endElement(ctxt->userData, name);
3513 oldname = htmlnamePop(ctxt);
3514#ifdef DEBUG
3515 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3516#endif
3517 if (oldname != NULL)
3518 xmlFree(oldname);
3519 return;
3520 }
3521
3522 /*
3523 * Parse the content of the element:
3524 */
3525 currentNode = xmlStrdup(ctxt->name);
3526 depth = ctxt->nameNr;
3527 while (IS_CHAR(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003528 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003529 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003530 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003531 if (ctxt->nameNr < depth) break;
3532 }
3533
Owen Taylor3473f882001-02-23 17:55:21 +00003534 /*
3535 * Capture end position and add node
3536 */
3537 if ( currentNode != NULL && ctxt->record_info ) {
3538 node_info.end_pos = ctxt->input->consumed +
3539 (CUR_PTR - ctxt->input->base);
3540 node_info.end_line = ctxt->input->line;
3541 node_info.node = ctxt->node;
3542 xmlParserAddNodeInfo(ctxt, &node_info);
3543 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003544 if (!IS_CHAR(CUR)) {
3545 htmlAutoCloseOnEnd(ctxt);
3546 }
3547
Owen Taylor3473f882001-02-23 17:55:21 +00003548 if (currentNode != NULL)
3549 xmlFree(currentNode);
3550}
3551
3552/**
3553 * htmlParseDocument :
3554 * @ctxt: an HTML parser context
3555 *
3556 * parse an HTML document (and build a tree if using the standard SAX
3557 * interface).
3558 *
3559 * Returns 0, -1 in case of error. the parser context is augmented
3560 * as a result of the parsing.
3561 */
3562
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003563static int
Owen Taylor3473f882001-02-23 17:55:21 +00003564htmlParseDocument(htmlParserCtxtPtr ctxt) {
3565 xmlDtdPtr dtd;
3566
Daniel Veillardd0463562001-10-13 09:15:48 +00003567 xmlInitParser();
3568
Owen Taylor3473f882001-02-23 17:55:21 +00003569 htmlDefaultSAXHandlerInit();
3570 ctxt->html = 1;
3571
3572 GROW;
3573 /*
3574 * SAX: beginning of the document processing.
3575 */
3576 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3577 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3578
3579 /*
3580 * Wipe out everything which is before the first '<'
3581 */
3582 SKIP_BLANKS;
3583 if (CUR == 0) {
3584 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3585 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3586 ctxt->wellFormed = 0;
3587 }
3588
3589 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3590 ctxt->sax->startDocument(ctxt->userData);
3591
3592
3593 /*
3594 * Parse possible comments before any content
3595 */
3596 while ((CUR == '<') && (NXT(1) == '!') &&
3597 (NXT(2) == '-') && (NXT(3) == '-')) {
3598 htmlParseComment(ctxt);
3599 SKIP_BLANKS;
3600 }
3601
3602
3603 /*
3604 * Then possibly doc type declaration(s) and more Misc
3605 * (doctypedecl Misc*)?
3606 */
3607 if ((CUR == '<') && (NXT(1) == '!') &&
3608 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3609 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3610 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3611 (UPP(8) == 'E')) {
3612 htmlParseDocTypeDecl(ctxt);
3613 }
3614 SKIP_BLANKS;
3615
3616 /*
3617 * Parse possible comments before any content
3618 */
3619 while ((CUR == '<') && (NXT(1) == '!') &&
3620 (NXT(2) == '-') && (NXT(3) == '-')) {
3621 htmlParseComment(ctxt);
3622 SKIP_BLANKS;
3623 }
3624
3625 /*
3626 * Time to start parsing the tree itself
3627 */
3628 htmlParseContent(ctxt);
3629
3630 /*
3631 * autoclose
3632 */
3633 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003634 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003635
3636
3637 /*
3638 * SAX: end of the document processing.
3639 */
3640 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3641 ctxt->sax->endDocument(ctxt->userData);
3642
3643 if (ctxt->myDoc != NULL) {
3644 dtd = xmlGetIntSubset(ctxt->myDoc);
3645 if (dtd == NULL)
3646 ctxt->myDoc->intSubset =
3647 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3648 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3649 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3650 }
3651 if (! ctxt->wellFormed) return(-1);
3652 return(0);
3653}
3654
3655
3656/************************************************************************
3657 * *
3658 * Parser contexts handling *
3659 * *
3660 ************************************************************************/
3661
3662/**
3663 * xmlInitParserCtxt:
3664 * @ctxt: an HTML parser context
3665 *
3666 * Initialize a parser context
3667 */
3668
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003669static void
Owen Taylor3473f882001-02-23 17:55:21 +00003670htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3671{
3672 htmlSAXHandler *sax;
3673
3674 if (ctxt == NULL) return;
3675 memset(ctxt, 0, sizeof(htmlParserCtxt));
3676
3677 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3678 if (sax == NULL) {
3679 xmlGenericError(xmlGenericErrorContext,
3680 "htmlInitParserCtxt: out of memory\n");
3681 }
3682 else
3683 memset(sax, 0, sizeof(htmlSAXHandler));
3684
3685 /* Allocate the Input stack */
3686 ctxt->inputTab = (htmlParserInputPtr *)
3687 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3688 if (ctxt->inputTab == NULL) {
3689 xmlGenericError(xmlGenericErrorContext,
3690 "htmlInitParserCtxt: out of memory\n");
3691 ctxt->inputNr = 0;
3692 ctxt->inputMax = 0;
3693 ctxt->input = NULL;
3694 return;
3695 }
3696 ctxt->inputNr = 0;
3697 ctxt->inputMax = 5;
3698 ctxt->input = NULL;
3699 ctxt->version = NULL;
3700 ctxt->encoding = NULL;
3701 ctxt->standalone = -1;
3702 ctxt->instate = XML_PARSER_START;
3703
3704 /* Allocate the Node stack */
3705 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3706 if (ctxt->nodeTab == NULL) {
3707 xmlGenericError(xmlGenericErrorContext,
3708 "htmlInitParserCtxt: out of memory\n");
3709 ctxt->nodeNr = 0;
3710 ctxt->nodeMax = 0;
3711 ctxt->node = NULL;
3712 ctxt->inputNr = 0;
3713 ctxt->inputMax = 0;
3714 ctxt->input = NULL;
3715 return;
3716 }
3717 ctxt->nodeNr = 0;
3718 ctxt->nodeMax = 10;
3719 ctxt->node = NULL;
3720
3721 /* Allocate the Name stack */
3722 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3723 if (ctxt->nameTab == NULL) {
3724 xmlGenericError(xmlGenericErrorContext,
3725 "htmlInitParserCtxt: out of memory\n");
3726 ctxt->nameNr = 0;
3727 ctxt->nameMax = 10;
3728 ctxt->name = NULL;
3729 ctxt->nodeNr = 0;
3730 ctxt->nodeMax = 0;
3731 ctxt->node = NULL;
3732 ctxt->inputNr = 0;
3733 ctxt->inputMax = 0;
3734 ctxt->input = NULL;
3735 return;
3736 }
3737 ctxt->nameNr = 0;
3738 ctxt->nameMax = 10;
3739 ctxt->name = NULL;
3740
3741 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3742 else {
3743 ctxt->sax = sax;
3744 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3745 }
3746 ctxt->userData = ctxt;
3747 ctxt->myDoc = NULL;
3748 ctxt->wellFormed = 1;
3749 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003750 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003751 ctxt->html = 1;
3752 ctxt->record_info = 0;
3753 ctxt->validate = 0;
3754 ctxt->nbChars = 0;
3755 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003756 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003757 xmlInitNodeInfoSeq(&ctxt->node_seq);
3758}
3759
3760/**
3761 * htmlFreeParserCtxt:
3762 * @ctxt: an HTML parser context
3763 *
3764 * Free all the memory used by a parser context. However the parsed
3765 * document in ctxt->myDoc is not freed.
3766 */
3767
3768void
3769htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3770{
3771 xmlFreeParserCtxt(ctxt);
3772}
3773
3774/**
3775 * htmlCreateDocParserCtxt :
3776 * @cur: a pointer to an array of xmlChar
3777 * @encoding: a free form C string describing the HTML document encoding, or NULL
3778 *
3779 * Create a parser context for an HTML document.
3780 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003781 * TODO: check the need to add encoding handling there
3782 *
Owen Taylor3473f882001-02-23 17:55:21 +00003783 * Returns the new parser context or NULL
3784 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003785static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003786htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003787 htmlParserCtxtPtr ctxt;
3788 htmlParserInputPtr input;
3789 /* htmlCharEncoding enc; */
3790
3791 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3792 if (ctxt == NULL) {
3793 perror("malloc");
3794 return(NULL);
3795 }
3796 htmlInitParserCtxt(ctxt);
3797 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3798 if (input == NULL) {
3799 perror("malloc");
3800 xmlFree(ctxt);
3801 return(NULL);
3802 }
3803 memset(input, 0, sizeof(htmlParserInput));
3804
3805 input->line = 1;
3806 input->col = 1;
3807 input->base = cur;
3808 input->cur = cur;
3809
3810 inputPush(ctxt, input);
3811 return(ctxt);
3812}
3813
3814/************************************************************************
3815 * *
3816 * Progressive parsing interfaces *
3817 * *
3818 ************************************************************************/
3819
3820/**
3821 * htmlParseLookupSequence:
3822 * @ctxt: an HTML parser context
3823 * @first: the first char to lookup
3824 * @next: the next char to lookup or zero
3825 * @third: the next char to lookup or zero
3826 *
3827 * Try to find if a sequence (first, next, third) or just (first next) or
3828 * (first) is available in the input stream.
3829 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3830 * to avoid rescanning sequences of bytes, it DOES change the state of the
3831 * parser, do not use liberally.
3832 * This is basically similar to xmlParseLookupSequence()
3833 *
3834 * Returns the index to the current parsing point if the full sequence
3835 * is available, -1 otherwise.
3836 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003837static int
Owen Taylor3473f882001-02-23 17:55:21 +00003838htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3839 xmlChar next, xmlChar third) {
3840 int base, len;
3841 htmlParserInputPtr in;
3842 const xmlChar *buf;
3843
3844 in = ctxt->input;
3845 if (in == NULL) return(-1);
3846 base = in->cur - in->base;
3847 if (base < 0) return(-1);
3848 if (ctxt->checkIndex > base)
3849 base = ctxt->checkIndex;
3850 if (in->buf == NULL) {
3851 buf = in->base;
3852 len = in->length;
3853 } else {
3854 buf = in->buf->buffer->content;
3855 len = in->buf->buffer->use;
3856 }
3857 /* take into account the sequence length */
3858 if (third) len -= 2;
3859 else if (next) len --;
3860 for (;base < len;base++) {
3861 if (buf[base] == first) {
3862 if (third != 0) {
3863 if ((buf[base + 1] != next) ||
3864 (buf[base + 2] != third)) continue;
3865 } else if (next != 0) {
3866 if (buf[base + 1] != next) continue;
3867 }
3868 ctxt->checkIndex = 0;
3869#ifdef DEBUG_PUSH
3870 if (next == 0)
3871 xmlGenericError(xmlGenericErrorContext,
3872 "HPP: lookup '%c' found at %d\n",
3873 first, base);
3874 else if (third == 0)
3875 xmlGenericError(xmlGenericErrorContext,
3876 "HPP: lookup '%c%c' found at %d\n",
3877 first, next, base);
3878 else
3879 xmlGenericError(xmlGenericErrorContext,
3880 "HPP: lookup '%c%c%c' found at %d\n",
3881 first, next, third, base);
3882#endif
3883 return(base - (in->cur - in->base));
3884 }
3885 }
3886 ctxt->checkIndex = base;
3887#ifdef DEBUG_PUSH
3888 if (next == 0)
3889 xmlGenericError(xmlGenericErrorContext,
3890 "HPP: lookup '%c' failed\n", first);
3891 else if (third == 0)
3892 xmlGenericError(xmlGenericErrorContext,
3893 "HPP: lookup '%c%c' failed\n", first, next);
3894 else
3895 xmlGenericError(xmlGenericErrorContext,
3896 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3897#endif
3898 return(-1);
3899}
3900
3901/**
3902 * htmlParseTryOrFinish:
3903 * @ctxt: an HTML parser context
3904 * @terminate: last chunk indicator
3905 *
3906 * Try to progress on parsing
 *
 * This is the state machine of the push parser. Each iteration of the main
 * loop recomputes the number of bytes available on the current input and
 * dispatches on ctxt->instate. The loop is left through the "done" label
 * as soon as a state lacks the data it needs, or when the EOF state is
 * reached. Unless @terminate is set, a construct (DOCTYPE, comment, tag,
 * reference, script content) is only parsed once
 * htmlParseLookupSequence() has found its closing delimiter in the data.
 *
 * NOTE(review): the local "ret" is initialized to 0 and never assigned
 * again in this function, so 0 is always returned.
3907 *
3908 * Returns zero if no parsing was possible
3909 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003910static int
Owen Taylor3473f882001-02-23 17:55:21 +00003911htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3912 int ret = 0;
3913 htmlParserInputPtr in;
3914 int avail = 0;
3915 xmlChar cur, next;
3916
/* Trace the state the parser is in on entry (DEBUG_PUSH builds only). */
3917#ifdef DEBUG_PUSH
3918 switch (ctxt->instate) {
3919 case XML_PARSER_EOF:
3920 xmlGenericError(xmlGenericErrorContext,
3921 "HPP: try EOF\n"); break;
3922 case XML_PARSER_START:
3923 xmlGenericError(xmlGenericErrorContext,
3924 "HPP: try START\n"); break;
3925 case XML_PARSER_MISC:
3926 xmlGenericError(xmlGenericErrorContext,
3927 "HPP: try MISC\n");break;
3928 case XML_PARSER_COMMENT:
3929 xmlGenericError(xmlGenericErrorContext,
3930 "HPP: try COMMENT\n");break;
3931 case XML_PARSER_PROLOG:
3932 xmlGenericError(xmlGenericErrorContext,
3933 "HPP: try PROLOG\n");break;
3934 case XML_PARSER_START_TAG:
3935 xmlGenericError(xmlGenericErrorContext,
3936 "HPP: try START_TAG\n");break;
3937 case XML_PARSER_CONTENT:
3938 xmlGenericError(xmlGenericErrorContext,
3939 "HPP: try CONTENT\n");break;
3940 case XML_PARSER_CDATA_SECTION:
3941 xmlGenericError(xmlGenericErrorContext,
3942 "HPP: try CDATA_SECTION\n");break;
3943 case XML_PARSER_END_TAG:
3944 xmlGenericError(xmlGenericErrorContext,
3945 "HPP: try END_TAG\n");break;
3946 case XML_PARSER_ENTITY_DECL:
3947 xmlGenericError(xmlGenericErrorContext,
3948 "HPP: try ENTITY_DECL\n");break;
3949 case XML_PARSER_ENTITY_VALUE:
3950 xmlGenericError(xmlGenericErrorContext,
3951 "HPP: try ENTITY_VALUE\n");break;
3952 case XML_PARSER_ATTRIBUTE_VALUE:
3953 xmlGenericError(xmlGenericErrorContext,
3954 "HPP: try ATTRIBUTE_VALUE\n");break;
3955 case XML_PARSER_DTD:
3956 xmlGenericError(xmlGenericErrorContext,
3957 "HPP: try DTD\n");break;
3958 case XML_PARSER_EPILOG:
3959 xmlGenericError(xmlGenericErrorContext,
3960 "HPP: try EPILOG\n");break;
3961 case XML_PARSER_PI:
3962 xmlGenericError(xmlGenericErrorContext,
3963 "HPP: try PI\n");break;
3964 case XML_PARSER_SYSTEM_LITERAL:
3965 xmlGenericError(xmlGenericErrorContext,
3966 "HPP: try SYSTEM_LITERAL\n");break;
3967 }
3968#endif
3969
3970 while (1) {
3971
/*
 * avail is the number of bytes between the current position and the end
 * of the data: in->length bounds inputs without an I/O buffer, the
 * buffer fill level (use) bounds the others.
 */
3972 in = ctxt->input;
3973 if (in == NULL) break;
3974 if (in->buf == NULL)
3975 avail = in->length - (in->cur - in->base);
3976 else
3977 avail = in->buf->buffer->use - (in->cur - in->base);
/*
 * Input exhausted on the last chunk: auto-close what is still open and,
 * if the name stack is then empty, switch to EOF and signal endDocument
 * (skipped when the parser is already in the EOF state).
 */
3978 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003979 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003980 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3981 /*
3982 * SAX: end of the document processing.
3983 */
3984 ctxt->instate = XML_PARSER_EOF;
3985 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3986 ctxt->sax->endDocument(ctxt->userData);
3987 }
3988 }
3989 if (avail < 1)
3990 goto done;
3991 switch (ctxt->instate) {
3992 case XML_PARSER_EOF:
3993 /*
3994 * Document parsing is done !
3995 */
3996 goto done;
3997 case XML_PARSER_START:
3998 /*
3999 * Very first chars read from the document flow.
4000 */
4001 cur = in->cur[0];
4002 if (IS_BLANK(cur)) {
4003 SKIP_BLANKS;
4004 if (in->buf == NULL)
4005 avail = in->length - (in->cur - in->base);
4006 else
4007 avail = in->buf->buffer->use - (in->cur - in->base);
4008 }
/* Signal the start of the document to the SAX layer. */
4009 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4010 ctxt->sax->setDocumentLocator(ctxt->userData,
4011 &xmlDefaultSAXLocator);
4012 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4013 (!ctxt->disableSAX))
4014 ctxt->sax->startDocument(ctxt->userData);
4015
/*
 * A leading "<!DOCTYPE" is parsed here and leads to PROLOG, anything
 * else leads to MISC.
 * NOTE(review): UPP() is a macro defined outside this chunk, presumably
 * the upper-cased byte at the given offset - confirm.
 */
4016 cur = in->cur[0];
4017 next = in->cur[1];
4018 if ((cur == '<') && (next == '!') &&
4019 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4020 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4021 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4022 (UPP(8) == 'E')) {
4023 if ((!terminate) &&
4024 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4025 goto done;
4026#ifdef DEBUG_PUSH
4027 xmlGenericError(xmlGenericErrorContext,
4028 "HPP: Parsing internal subset\n");
4029#endif
4030 htmlParseDocTypeDecl(ctxt);
4031 ctxt->instate = XML_PARSER_PROLOG;
4032#ifdef DEBUG_PUSH
4033 xmlGenericError(xmlGenericErrorContext,
4034 "HPP: entering PROLOG\n");
4035#endif
4036 } else {
4037 ctxt->instate = XML_PARSER_MISC;
4038 }
/* NOTE(review): this trace is also emitted when PROLOG was entered. */
4039#ifdef DEBUG_PUSH
4040 xmlGenericError(xmlGenericErrorContext,
4041 "HPP: entering MISC\n");
4042#endif
4043 break;
/*
 * MISC: before any DOCTYPE or element. Comments keep the state, a
 * DOCTYPE leads to PROLOG, an incomplete "<!" construct (fewer than 9
 * bytes) waits for more data, anything else is handed to START_TAG.
 */
4044 case XML_PARSER_MISC:
4045 SKIP_BLANKS;
4046 if (in->buf == NULL)
4047 avail = in->length - (in->cur - in->base);
4048 else
4049 avail = in->buf->buffer->use - (in->cur - in->base);
4050 if (avail < 2)
4051 goto done;
4052 cur = in->cur[0];
4053 next = in->cur[1];
4054 if ((cur == '<') && (next == '!') &&
4055 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4056 if ((!terminate) &&
4057 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4058 goto done;
4059#ifdef DEBUG_PUSH
4060 xmlGenericError(xmlGenericErrorContext,
4061 "HPP: Parsing Comment\n");
4062#endif
4063 htmlParseComment(ctxt);
4064 ctxt->instate = XML_PARSER_MISC;
4065 } else if ((cur == '<') && (next == '!') &&
4066 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4067 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4068 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4069 (UPP(8) == 'E')) {
4070 if ((!terminate) &&
4071 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4072 goto done;
4073#ifdef DEBUG_PUSH
4074 xmlGenericError(xmlGenericErrorContext,
4075 "HPP: Parsing internal subset\n");
4076#endif
4077 htmlParseDocTypeDecl(ctxt);
4078 ctxt->instate = XML_PARSER_PROLOG;
4079#ifdef DEBUG_PUSH
4080 xmlGenericError(xmlGenericErrorContext,
4081 "HPP: entering PROLOG\n");
4082#endif
4083 } else if ((cur == '<') && (next == '!') &&
4084 (avail < 9)) {
4085 goto done;
4086 } else {
4087 ctxt->instate = XML_PARSER_START_TAG;
4088#ifdef DEBUG_PUSH
4089 xmlGenericError(xmlGenericErrorContext,
4090 "HPP: entering START_TAG\n");
4091#endif
4092 }
4093 break;
/*
 * PROLOG: after the DOCTYPE. Comments keep the state, an incomplete
 * "<!" construct (fewer than 4 bytes) waits for more data, anything
 * else is handed to START_TAG.
 */
4094 case XML_PARSER_PROLOG:
4095 SKIP_BLANKS;
4096 if (in->buf == NULL)
4097 avail = in->length - (in->cur - in->base);
4098 else
4099 avail = in->buf->buffer->use - (in->cur - in->base);
4100 if (avail < 2)
4101 goto done;
4102 cur = in->cur[0];
4103 next = in->cur[1];
4104 if ((cur == '<') && (next == '!') &&
4105 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4106 if ((!terminate) &&
4107 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4108 goto done;
4109#ifdef DEBUG_PUSH
4110 xmlGenericError(xmlGenericErrorContext,
4111 "HPP: Parsing Comment\n");
4112#endif
4113 htmlParseComment(ctxt);
4114 ctxt->instate = XML_PARSER_PROLOG;
4115 } else if ((cur == '<') && (next == '!') &&
4116 (avail < 4)) {
4117 goto done;
4118 } else {
4119 ctxt->instate = XML_PARSER_START_TAG;
4120#ifdef DEBUG_PUSH
4121 xmlGenericError(xmlGenericErrorContext,
4122 "HPP: entering START_TAG\n");
4123#endif
4124 }
4125 break;
/*
 * EPILOG: entered from END_TAG once the name stack is empty. A blank is
 * passed to htmlParseCharData() and the function returns; comments keep
 * the state; any other content sets XML_ERR_DOCUMENT_END, clears
 * wellFormed, signals endDocument and switches to EOF.
 */
4126 case XML_PARSER_EPILOG:
4127 if (in->buf == NULL)
4128 avail = in->length - (in->cur - in->base);
4129 else
4130 avail = in->buf->buffer->use - (in->cur - in->base);
4131 if (avail < 1)
4132 goto done;
4133 cur = in->cur[0];
4134 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004135 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004136 goto done;
4137 }
4138 if (avail < 2)
4139 goto done;
4140 next = in->cur[1];
4141 if ((cur == '<') && (next == '!') &&
4142 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4143 if ((!terminate) &&
4144 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4145 goto done;
4146#ifdef DEBUG_PUSH
4147 xmlGenericError(xmlGenericErrorContext,
4148 "HPP: Parsing Comment\n");
4149#endif
4150 htmlParseComment(ctxt);
4151 ctxt->instate = XML_PARSER_EPILOG;
4152 } else if ((cur == '<') && (next == '!') &&
4153 (avail < 4)) {
4154 goto done;
4155 } else {
4156 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004157 ctxt->wellFormed = 0;
4158 ctxt->instate = XML_PARSER_EOF;
4159#ifdef DEBUG_PUSH
4160 xmlGenericError(xmlGenericErrorContext,
4161 "HPP: entering EOF\n");
4162#endif
4163 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4164 ctxt->sax->endDocument(ctxt->userData);
4165 goto done;
4166 }
4167 break;
/*
 * START_TAG: anything not starting with '<' is content and "</" is an
 * end tag; otherwise a start tag is parsed once its '>' is available
 * (or unconditionally on the last chunk).
 */
4168 case XML_PARSER_START_TAG: {
4169 xmlChar *name, *oldname;
4170 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004171 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004172
4173 if (avail < 2)
4174 goto done;
4175 cur = in->cur[0];
4176 if (cur != '<') {
4177 ctxt->instate = XML_PARSER_CONTENT;
4178#ifdef DEBUG_PUSH
4179 xmlGenericError(xmlGenericErrorContext,
4180 "HPP: entering CONTENT\n");
4181#endif
4182 break;
4183 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004184 if (in->cur[1] == '/') {
4185 ctxt->instate = XML_PARSER_END_TAG;
4186 ctxt->checkIndex = 0;
4187#ifdef DEBUG_PUSH
4188 xmlGenericError(xmlGenericErrorContext,
4189 "HPP: entering END_TAG\n");
4190#endif
4191 break;
4192 }
Owen Taylor3473f882001-02-23 17:55:21 +00004193 if ((!terminate) &&
4194 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4195 goto done;
4196
/*
 * Keep a copy of the current element name: together with the saved
 * stack depth it allows detecting below that htmlParseStartTag() did
 * not change the top of the name stack.
 */
4197 oldname = xmlStrdup(ctxt->name);
4198 htmlParseStartTag(ctxt);
4199 name = ctxt->name;
4200#ifdef DEBUG
4201 if (oldname == NULL)
4202 xmlGenericError(xmlGenericErrorContext,
4203 "Start of element %s\n", name);
4204 else if (name == NULL)
4205 xmlGenericError(xmlGenericErrorContext,
4206 "Start of element failed, was %s\n",
4207 oldname);
4208 else
4209 xmlGenericError(xmlGenericErrorContext,
4210 "Start of element %s, was %s\n",
4211 name, oldname);
4212#endif
/*
 * Same depth and same name as before, or no current name at all: only
 * a pending '>' is consumed and the state is left unchanged.
 */
4213 if (((depth == ctxt->nameNr) &&
4214 (xmlStrEqual(oldname, ctxt->name))) ||
4215 (name == NULL)) {
4216 if (CUR == '>')
4217 NEXT;
4218 if (oldname != NULL)
4219 xmlFree(oldname);
4220 break;
4221 }
4222 if (oldname != NULL)
4223 xmlFree(oldname);
4224
4225 /*
4226 * Lookup the info for that element.
4227 */
4228 info = htmlTagLookup(name);
4229 if (info == NULL) {
4230 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4231 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4232 name);
4233 ctxt->wellFormed = 0;
4234 } else if (info->depr) {
4235 /***************************
4236 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4237 ctxt->sax->warning(ctxt->userData,
4238 "Tag %s is deprecated\n",
4239 name);
4240 ***************************/
4241 }
4242
4243 /*
4244 * Check for an Empty Element labelled the XML/SGML way
4245 */
4246 if ((CUR == '/') && (NXT(1) == '>')) {
4247 SKIP(2);
4248 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4249 ctxt->sax->endElement(ctxt->userData, name);
4250 oldname = htmlnamePop(ctxt);
4251#ifdef DEBUG
4252 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4253 oldname);
4254#endif
4255 if (oldname != NULL)
4256 xmlFree(oldname);
4257 ctxt->instate = XML_PARSER_CONTENT;
4258#ifdef DEBUG_PUSH
4259 xmlGenericError(xmlGenericErrorContext,
4260 "HPP: entering CONTENT\n");
4261#endif
4262 break;
4263 }
4264
/*
 * The tag must now be closed by '>'; if not, an error is reported and
 * the element just opened is popped again.
 */
4265 if (CUR == '>') {
4266 NEXT;
4267 } else {
4268 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4269 ctxt->sax->error(ctxt->userData,
4270 "Couldn't find end of Start Tag %s\n",
4271 name);
4272 ctxt->wellFormed = 0;
4273
4274 /*
4275 * end of parsing of this node.
4276 */
4277 if (xmlStrEqual(name, ctxt->name)) {
4278 nodePop(ctxt);
4279 oldname = htmlnamePop(ctxt);
4280#ifdef DEBUG
4281 xmlGenericError(xmlGenericErrorContext,
4282 "End of start tag problem: popping out %s\n", oldname);
4283#endif
4284 if (oldname != NULL)
4285 xmlFree(oldname);
4286 }
4287
4288 ctxt->instate = XML_PARSER_CONTENT;
4289#ifdef DEBUG_PUSH
4290 xmlGenericError(xmlGenericErrorContext,
4291 "HPP: entering CONTENT\n");
4292#endif
4293 break;
4294 }
4295
4296 /*
4297 * Check for an Empty Element from DTD definition
4298 */
4299 if ((info != NULL) && (info->empty)) {
4300 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4301 ctxt->sax->endElement(ctxt->userData, name);
4302 oldname = htmlnamePop(ctxt);
4303#ifdef DEBUG
4304 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4305#endif
4306 if (oldname != NULL)
4307 xmlFree(oldname);
4308 }
4309 ctxt->instate = XML_PARSER_CONTENT;
4310#ifdef DEBUG_PUSH
4311 xmlGenericError(xmlGenericErrorContext,
4312 "HPP: entering CONTENT\n");
4313#endif
4314 break;
4315 }
/*
 * CONTENT: element content. ctxt->nbChars is sampled into "cons" before
 * parsing and compared afterwards to detect that no parsing progress
 * was recorded.
 */
4316 case XML_PARSER_CONTENT: {
4317 long cons;
4318 /*
4319 * Handle preparsed entities and charRef
4320 */
4321 if (ctxt->token != 0) {
4322 xmlChar chr[2] = { 0 , 0 } ;
4323
4324 chr[0] = (xmlChar) ctxt->token;
4325 htmlCheckParagraph(ctxt);
4326 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4327 ctxt->sax->characters(ctxt->userData, chr, 1);
4328 ctxt->token = 0;
4329 ctxt->checkIndex = 0;
4330 }
/*
 * Last chunk with a single byte left: if it is neither '<' nor '&' it
 * is delivered directly, as ignorable whitespace or as character data.
 */
4331 if ((avail == 1) && (terminate)) {
4332 cur = in->cur[0];
4333 if ((cur != '<') && (cur != '&')) {
4334 if (ctxt->sax != NULL) {
4335 if (IS_BLANK(cur)) {
4336 if (ctxt->sax->ignorableWhitespace != NULL)
4337 ctxt->sax->ignorableWhitespace(
4338 ctxt->userData, &cur, 1);
4339 } else {
4340 htmlCheckParagraph(ctxt);
4341 if (ctxt->sax->characters != NULL)
4342 ctxt->sax->characters(
4343 ctxt->userData, &cur, 1);
4344 }
4345 }
4346 ctxt->token = 0;
4347 ctxt->checkIndex = 0;
4348 NEXT;
William M. Brack1633d182001-10-05 15:41:19 +00004349 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004350 }
Owen Taylor3473f882001-02-23 17:55:21 +00004351 }
4352 if (avail < 2)
4353 goto done;
4354 cur = in->cur[0];
4355 next = in->cur[1];
4356 cons = ctxt->nbChars;
4357 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4358 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4359 /*
4360 * Handle SCRIPT/STYLE separately
4361 */
4362 if ((!terminate) &&
4363 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4364 goto done;
4365 htmlParseScript(ctxt);
/*
 * cur and next were read before htmlParseScript() ran, so this test
 * looks at the bytes the script content started with.
 */
4366 if ((cur == '<') && (next == '/')) {
4367 ctxt->instate = XML_PARSER_END_TAG;
4368 ctxt->checkIndex = 0;
4369#ifdef DEBUG_PUSH
4370 xmlGenericError(xmlGenericErrorContext,
4371 "HPP: entering END_TAG\n");
4372#endif
4373 break;
4374 }
4375 } else {
4376 /*
4377 * Sometimes DOCTYPE arrives in the middle of the document
4378 */
4379 if ((cur == '<') && (next == '!') &&
4380 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4381 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4382 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4383 (UPP(8) == 'E')) {
4384 if ((!terminate) &&
4385 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4386 goto done;
4387 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4388 ctxt->sax->error(ctxt->userData,
4389 "Misplaced DOCTYPE declaration\n");
4390 ctxt->wellFormed = 0;
4391 htmlParseDocTypeDecl(ctxt);
4392 } else if ((cur == '<') && (next == '!') &&
4393 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4394 if ((!terminate) &&
4395 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4396 goto done;
4397#ifdef DEBUG_PUSH
4398 xmlGenericError(xmlGenericErrorContext,
4399 "HPP: Parsing Comment\n");
4400#endif
4401 htmlParseComment(ctxt);
4402 ctxt->instate = XML_PARSER_CONTENT;
4403 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4404 goto done;
4405 } else if ((cur == '<') && (next == '/')) {
4406 ctxt->instate = XML_PARSER_END_TAG;
4407 ctxt->checkIndex = 0;
4408#ifdef DEBUG_PUSH
4409 xmlGenericError(xmlGenericErrorContext,
4410 "HPP: entering END_TAG\n");
4411#endif
4412 break;
4413 } else if (cur == '<') {
4414 ctxt->instate = XML_PARSER_START_TAG;
4415 ctxt->checkIndex = 0;
4416#ifdef DEBUG_PUSH
4417 xmlGenericError(xmlGenericErrorContext,
4418 "HPP: entering START_TAG\n");
4419#endif
4420 break;
4421 } else if (cur == '&') {
4422 if ((!terminate) &&
4423 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4424 goto done;
4425#ifdef DEBUG_PUSH
4426 xmlGenericError(xmlGenericErrorContext,
4427 "HPP: Parsing Reference\n");
4428#endif
4429 /* TODO: check generation of subtrees if noent !!! */
4430 htmlParseReference(ctxt);
4431 } else {
4432 /* TODO Avoid the extra copy, handle directly !!!!!! */
4433 /*
4434 * Goal of the following test is :
4435 * - minimize calls to the SAX 'character' callback
4436 * when they are mergeable
4437 */
4438 if ((ctxt->inputNr == 1) &&
4439 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4440 if ((!terminate) &&
4441 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4442 goto done;
4443 }
4444 ctxt->checkIndex = 0;
4445#ifdef DEBUG_PUSH
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: Parsing char data\n");
4448#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004449 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004450 }
4451 }
/*
 * ctxt->nbChars did not change: report an error if inside a node, then
 * step over one character with NEXT.
 */
4452 if (cons == ctxt->nbChars) {
4453 if (ctxt->node != NULL) {
4454 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4455 ctxt->sax->error(ctxt->userData,
4456 "detected an error in element content\n");
4457 ctxt->wellFormed = 0;
4458 }
4459 NEXT;
4460 break;
4461 }
4462
4463 break;
4464 }
/*
 * END_TAG: parse the end tag once its '>' is available; an empty name
 * stack afterwards leads to EPILOG, otherwise back to CONTENT.
 * NOTE(review): the DEBUG_PUSH trace says CONTENT on both paths.
 */
4465 case XML_PARSER_END_TAG:
4466 if (avail < 2)
4467 goto done;
4468 if ((!terminate) &&
4469 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4470 goto done;
4471 htmlParseEndTag(ctxt);
4472 if (ctxt->nameNr == 0) {
4473 ctxt->instate = XML_PARSER_EPILOG;
4474 } else {
4475 ctxt->instate = XML_PARSER_CONTENT;
4476 }
4477 ctxt->checkIndex = 0;
4478#ifdef DEBUG_PUSH
4479 xmlGenericError(xmlGenericErrorContext,
4480 "HPP: entering CONTENT\n");
4481#endif
4482 break;
/*
 * The remaining cases are treated as internal errors: each one is
 * reported through xmlGenericError() and the parser is put back in
 * CONTENT (START_TAG for ATTRIBUTE_VALUE).
 */
4483 case XML_PARSER_CDATA_SECTION:
4484 xmlGenericError(xmlGenericErrorContext,
4485 "HPP: internal error, state == CDATA\n");
4486 ctxt->instate = XML_PARSER_CONTENT;
4487 ctxt->checkIndex = 0;
4488#ifdef DEBUG_PUSH
4489 xmlGenericError(xmlGenericErrorContext,
4490 "HPP: entering CONTENT\n");
4491#endif
4492 break;
4493 case XML_PARSER_DTD:
4494 xmlGenericError(xmlGenericErrorContext,
4495 "HPP: internal error, state == DTD\n");
4496 ctxt->instate = XML_PARSER_CONTENT;
4497 ctxt->checkIndex = 0;
4498#ifdef DEBUG_PUSH
4499 xmlGenericError(xmlGenericErrorContext,
4500 "HPP: entering CONTENT\n");
4501#endif
4502 break;
4503 case XML_PARSER_COMMENT:
4504 xmlGenericError(xmlGenericErrorContext,
4505 "HPP: internal error, state == COMMENT\n");
4506 ctxt->instate = XML_PARSER_CONTENT;
4507 ctxt->checkIndex = 0;
4508#ifdef DEBUG_PUSH
4509 xmlGenericError(xmlGenericErrorContext,
4510 "HPP: entering CONTENT\n");
4511#endif
4512 break;
4513 case XML_PARSER_PI:
4514 xmlGenericError(xmlGenericErrorContext,
4515 "HPP: internal error, state == PI\n");
4516 ctxt->instate = XML_PARSER_CONTENT;
4517 ctxt->checkIndex = 0;
4518#ifdef DEBUG_PUSH
4519 xmlGenericError(xmlGenericErrorContext,
4520 "HPP: entering CONTENT\n");
4521#endif
4522 break;
4523 case XML_PARSER_ENTITY_DECL:
4524 xmlGenericError(xmlGenericErrorContext,
4525 "HPP: internal error, state == ENTITY_DECL\n");
4526 ctxt->instate = XML_PARSER_CONTENT;
4527 ctxt->checkIndex = 0;
4528#ifdef DEBUG_PUSH
4529 xmlGenericError(xmlGenericErrorContext,
4530 "HPP: entering CONTENT\n");
4531#endif
4532 break;
/* NOTE(review): the trace below says DTD but the state set is CONTENT. */
4533 case XML_PARSER_ENTITY_VALUE:
4534 xmlGenericError(xmlGenericErrorContext,
4535 "HPP: internal error, state == ENTITY_VALUE\n");
4536 ctxt->instate = XML_PARSER_CONTENT;
4537 ctxt->checkIndex = 0;
4538#ifdef DEBUG_PUSH
4539 xmlGenericError(xmlGenericErrorContext,
4540 "HPP: entering DTD\n");
4541#endif
4542 break;
4543 case XML_PARSER_ATTRIBUTE_VALUE:
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4546 ctxt->instate = XML_PARSER_START_TAG;
4547 ctxt->checkIndex = 0;
4548#ifdef DEBUG_PUSH
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: entering START_TAG\n");
4551#endif
4552 break;
4553 case XML_PARSER_SYSTEM_LITERAL:
4554 xmlGenericError(xmlGenericErrorContext,
4555 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4556 ctxt->instate = XML_PARSER_CONTENT;
4557 ctxt->checkIndex = 0;
4558#ifdef DEBUG_PUSH
4559 xmlGenericError(xmlGenericErrorContext,
4560 "HPP: entering CONTENT\n");
4561#endif
4562 break;
4563 case XML_PARSER_IGNORE:
4564 xmlGenericError(xmlGenericErrorContext,
4565 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4566 ctxt->instate = XML_PARSER_CONTENT;
4567 ctxt->checkIndex = 0;
4568#ifdef DEBUG_PUSH
4569 xmlGenericError(xmlGenericErrorContext,
4570 "HPP: entering CONTENT\n");
4571#endif
4572 break;
4573 }
4574 }
4575done:
/*
 * Same end-of-input handling as at the top of the loop, for the case
 * where the data ran out while a state was being processed.
 */
4576 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004577 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004578 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4579 /*
4580 * SAX: end of the document processing.
4581 */
4582 ctxt->instate = XML_PARSER_EOF;
4583 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4584 ctxt->sax->endDocument(ctxt->userData);
4585 }
4586 }
/*
 * Once the document is finished (or about to be), give it the HTML 4.0
 * Transitional internal subset if it has none.
 */
4587 if ((ctxt->myDoc != NULL) &&
4588 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4589 (ctxt->instate == XML_PARSER_EPILOG))) {
4590 xmlDtdPtr dtd;
4591 dtd = xmlGetIntSubset(ctxt->myDoc);
4592 if (dtd == NULL)
4593 ctxt->myDoc->intSubset =
4594 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4595 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4596 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4597 }
4598#ifdef DEBUG_PUSH
4599 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4600#endif
4601 return(ret);
4602}
4603
4604/**
Owen Taylor3473f882001-02-23 17:55:21 +00004605 * htmlParseChunk:
4606 * @ctxt: an XML parser context
4607 * @chunk: an char array
4608 * @size: the size in byte of the chunk
4609 * @terminate: last chunk indicator
4610 *
4611 * Parse a Chunk of memory
4612 *
4613 * Returns zero if no error, the xmlParserErrors otherwise.
4614 */
4615int
4616htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4617 int terminate) {
4618 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4619 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4620 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4621 int cur = ctxt->input->cur - ctxt->input->base;
4622
4623 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4624 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4625 ctxt->input->cur = ctxt->input->base + cur;
4626#ifdef DEBUG_PUSH
4627 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4628#endif
4629
4630 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4631 htmlParseTryOrFinish(ctxt, terminate);
4632 } else if (ctxt->instate != XML_PARSER_EOF) {
4633 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4634 htmlParseTryOrFinish(ctxt, terminate);
4635 }
4636 if (terminate) {
4637 if ((ctxt->instate != XML_PARSER_EOF) &&
4638 (ctxt->instate != XML_PARSER_EPILOG) &&
4639 (ctxt->instate != XML_PARSER_MISC)) {
4640 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004641 ctxt->wellFormed = 0;
4642 }
4643 if (ctxt->instate != XML_PARSER_EOF) {
4644 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4645 ctxt->sax->endDocument(ctxt->userData);
4646 }
4647 ctxt->instate = XML_PARSER_EOF;
4648 }
4649 return((xmlParserErrors) ctxt->errNo);
4650}
4651
4652/************************************************************************
4653 * *
4654 * User entry points *
4655 * *
4656 ************************************************************************/
4657
4658/**
4659 * htmlCreatePushParserCtxt :
4660 * @sax: a SAX handler
4661 * @user_data: The user data returned on SAX callbacks
4662 * @chunk: a pointer to an array of chars
4663 * @size: number of chars in the array
4664 * @filename: an optional file name or URI
4665 * @enc: an optional encoding
4666 *
4667 * Create a parser context for using the HTML parser in push mode
4668 * To allow content encoding detection, @size should be >= 4
4669 * The value of @filename is used for fetching external entities
4670 * and error/warning reports.
4671 *
4672 * Returns the new parser context or NULL
4673 */
4674htmlParserCtxtPtr
4675htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4676 const char *chunk, int size, const char *filename,
4677 xmlCharEncoding enc) {
4678 htmlParserCtxtPtr ctxt;
4679 htmlParserInputPtr inputStream;
4680 xmlParserInputBufferPtr buf;
4681
Daniel Veillardd0463562001-10-13 09:15:48 +00004682 xmlInitParser();
4683
Owen Taylor3473f882001-02-23 17:55:21 +00004684 buf = xmlAllocParserInputBuffer(enc);
4685 if (buf == NULL) return(NULL);
4686
4687 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4688 if (ctxt == NULL) {
4689 xmlFree(buf);
4690 return(NULL);
4691 }
4692 memset(ctxt, 0, sizeof(htmlParserCtxt));
4693 htmlInitParserCtxt(ctxt);
4694 if (sax != NULL) {
4695 if (ctxt->sax != &htmlDefaultSAXHandler)
4696 xmlFree(ctxt->sax);
4697 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4698 if (ctxt->sax == NULL) {
4699 xmlFree(buf);
4700 xmlFree(ctxt);
4701 return(NULL);
4702 }
4703 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4704 if (user_data != NULL)
4705 ctxt->userData = user_data;
4706 }
4707 if (filename == NULL) {
4708 ctxt->directory = NULL;
4709 } else {
4710 ctxt->directory = xmlParserGetDirectory(filename);
4711 }
4712
4713 inputStream = htmlNewInputStream(ctxt);
4714 if (inputStream == NULL) {
4715 xmlFreeParserCtxt(ctxt);
4716 return(NULL);
4717 }
4718
4719 if (filename == NULL)
4720 inputStream->filename = NULL;
4721 else
4722 inputStream->filename = xmlMemStrdup(filename);
4723 inputStream->buf = buf;
4724 inputStream->base = inputStream->buf->buffer->content;
4725 inputStream->cur = inputStream->buf->buffer->content;
4726
4727 inputPush(ctxt, inputStream);
4728
4729 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4730 (ctxt->input->buf != NULL)) {
4731 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4732#ifdef DEBUG_PUSH
4733 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4734#endif
4735 }
4736
4737 return(ctxt);
4738}
4739
4740/**
4741 * htmlSAXParseDoc :
4742 * @cur: a pointer to an array of xmlChar
4743 * @encoding: a free form C string describing the HTML document encoding, or NULL
4744 * @sax: the SAX handler block
4745 * @userData: if using SAX, this pointer will be provided on callbacks.
4746 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004747 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4748 * to handle parse events. If sax is NULL, fallback to the default DOM
4749 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00004750 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004751 * Returns the resulting document tree unless SAX is NULL or the document is
4752 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004753 */
4754
4755htmlDocPtr
4756htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4757 htmlDocPtr ret;
4758 htmlParserCtxtPtr ctxt;
4759
Daniel Veillardd0463562001-10-13 09:15:48 +00004760 xmlInitParser();
4761
Owen Taylor3473f882001-02-23 17:55:21 +00004762 if (cur == NULL) return(NULL);
4763
4764
4765 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4766 if (ctxt == NULL) return(NULL);
4767 if (sax != NULL) {
4768 ctxt->sax = sax;
4769 ctxt->userData = userData;
4770 }
4771
4772 htmlParseDocument(ctxt);
4773 ret = ctxt->myDoc;
4774 if (sax != NULL) {
4775 ctxt->sax = NULL;
4776 ctxt->userData = NULL;
4777 }
4778 htmlFreeParserCtxt(ctxt);
4779
4780 return(ret);
4781}
4782
4783/**
4784 * htmlParseDoc :
4785 * @cur: a pointer to an array of xmlChar
4786 * @encoding: a free form C string describing the HTML document encoding, or NULL
4787 *
4788 * parse an HTML in-memory document and build a tree.
4789 *
4790 * Returns the resulting document tree
4791 */
4792
4793htmlDocPtr
4794htmlParseDoc(xmlChar *cur, const char *encoding) {
4795 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4796}
4797
4798
4799/**
4800 * htmlCreateFileParserCtxt :
4801 * @filename: the filename
4802 * @encoding: a free form C string describing the HTML document encoding, or NULL
4803 *
4804 * Create a parser context for a file content.
4805 * Automatic support for ZLIB/Compress compressed document is provided
4806 * by default if found at compile-time.
4807 *
4808 * Returns the new parser context or NULL
4809 */
4810htmlParserCtxtPtr
4811htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4812{
4813 htmlParserCtxtPtr ctxt;
4814 htmlParserInputPtr inputStream;
4815 xmlParserInputBufferPtr buf;
4816 /* htmlCharEncoding enc; */
4817 xmlChar *content, *content_line = (xmlChar *) "charset=";
4818
4819 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4820 if (buf == NULL) return(NULL);
4821
4822 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4823 if (ctxt == NULL) {
4824 perror("malloc");
4825 return(NULL);
4826 }
4827 memset(ctxt, 0, sizeof(htmlParserCtxt));
4828 htmlInitParserCtxt(ctxt);
4829 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4830 if (inputStream == NULL) {
4831 perror("malloc");
4832 xmlFree(ctxt);
4833 return(NULL);
4834 }
4835 memset(inputStream, 0, sizeof(htmlParserInput));
4836
4837 inputStream->filename = xmlMemStrdup(filename);
4838 inputStream->line = 1;
4839 inputStream->col = 1;
4840 inputStream->buf = buf;
4841 inputStream->directory = NULL;
4842
4843 inputStream->base = inputStream->buf->buffer->content;
4844 inputStream->cur = inputStream->buf->buffer->content;
4845 inputStream->free = NULL;
4846
4847 inputPush(ctxt, inputStream);
4848
4849 /* set encoding */
4850 if (encoding) {
4851 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4852 if (content) {
4853 strcpy ((char *)content, (char *)content_line);
4854 strcat ((char *)content, (char *)encoding);
4855 htmlCheckEncoding (ctxt, content);
4856 xmlFree (content);
4857 }
4858 }
4859
4860 return(ctxt);
4861}
4862
4863/**
4864 * htmlSAXParseFile :
4865 * @filename: the filename
4866 * @encoding: a free form C string describing the HTML document encoding, or NULL
4867 * @sax: the SAX handler block
4868 * @userData: if using SAX, this pointer will be provided on callbacks.
4869 *
4870 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4871 * compressed document is provided by default if found at compile-time.
4872 * It use the given SAX function block to handle the parsing callback.
4873 * If sax is NULL, fallback to the default DOM tree building routines.
4874 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00004875 * Returns the resulting document tree unless SAX is NULL or the document is
4876 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00004877 */
4878
4879htmlDocPtr
4880htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4881 void *userData) {
4882 htmlDocPtr ret;
4883 htmlParserCtxtPtr ctxt;
4884 htmlSAXHandlerPtr oldsax = NULL;
4885
Daniel Veillardd0463562001-10-13 09:15:48 +00004886 xmlInitParser();
4887
Owen Taylor3473f882001-02-23 17:55:21 +00004888 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4889 if (ctxt == NULL) return(NULL);
4890 if (sax != NULL) {
4891 oldsax = ctxt->sax;
4892 ctxt->sax = sax;
4893 ctxt->userData = userData;
4894 }
4895
4896 htmlParseDocument(ctxt);
4897
4898 ret = ctxt->myDoc;
4899 if (sax != NULL) {
4900 ctxt->sax = oldsax;
4901 ctxt->userData = NULL;
4902 }
4903 htmlFreeParserCtxt(ctxt);
4904
4905 return(ret);
4906}
4907
4908/**
4909 * htmlParseFile :
4910 * @filename: the filename
4911 * @encoding: a free form C string describing the HTML document encoding, or NULL
4912 *
4913 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4914 * compressed document is provided by default if found at compile-time.
4915 *
4916 * Returns the resulting document tree
4917 */
4918
4919htmlDocPtr
4920htmlParseFile(const char *filename, const char *encoding) {
4921 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4922}
4923
4924/**
4925 * htmlHandleOmittedElem:
4926 * @val: int 0 or 1
4927 *
4928 * Set and return the previous value for handling HTML omitted tags.
4929 *
4930 * Returns the last value for 0 for no handling, 1 for auto insertion.
4931 */
4932
4933int
4934htmlHandleOmittedElem(int val) {
4935 int old = htmlOmittedDefaultValue;
4936
4937 htmlOmittedDefaultValue = val;
4938 return(old);
4939}
4940
4941#endif /* LIBXML_HTML_ENABLED */