blob: 39447e3ab0af93936016fdcd5cb2c3abf330bf3e [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#include "win32config.h"
11#else
12#include "config.h"
13#endif
14
15#include <libxml/xmlversion.h>
16#ifdef LIBXML_HTML_ENABLED
17#include <stdio.h>
18#include <string.h>
19#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
23#include <stdlib.h>
24#endif
25#ifdef HAVE_SYS_STAT_H
26#include <sys/stat.h>
27#endif
28#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
38#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
40#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
42#include <libxml/xmlerror.h>
43#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000044#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045#include <libxml/entities.h>
46#include <libxml/encoding.h>
47#include <libxml/valid.h>
48#include <libxml/xmlIO.h>
49
/*
 * Size limits for the HTML parser. Their users are outside this part of
 * the file.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * Compile-time trace switches. DEBUG enables the xmlGenericError() traces
 * placed under #ifdef DEBUG in this file.
 */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * When non-zero (the default), htmlCheckImplied() pushes the omitted
 * html/head/body elements and htmlCheckParagraph() implies a p element
 * inside html, head or body. When zero, htmlCheckImplied() does nothing;
 * htmlCheckParagraph() still opens a p when no element is open at all.
 */
int htmlOmittedDefaultValue = 1;
58
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61
62/************************************************************************
63 * *
Owen Taylor3473f882001-02-23 17:55:21 +000064 * Parser stacks related functions and macros *
65 * *
66 ************************************************************************/
67
68/*
69 * Generic function for accessing stacks in the Parser Context
70 */
71
72#define PUSH_AND_POP(scope, type, name) \
73scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
74 if (ctxt->name##Nr >= ctxt->name##Max) { \
75 ctxt->name##Max *= 2; \
76 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
77 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
78 if (ctxt->name##Tab == NULL) { \
79 xmlGenericError(xmlGenericErrorContext, \
80 "realloc failed !\n"); \
81 return(0); \
82 } \
83 } \
84 ctxt->name##Tab[ctxt->name##Nr] = value; \
85 ctxt->name = value; \
86 return(ctxt->name##Nr++); \
87} \
88scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
89 type ret; \
90 if (ctxt->name##Nr < 0) return(0); \
91 ctxt->name##Nr--; \
92 if (ctxt->name##Nr < 0) return(0); \
93 if (ctxt->name##Nr > 0) \
94 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
95 else \
96 ctxt->name = NULL; \
97 ret = ctxt->name##Tab[ctxt->name##Nr]; \
98 ctxt->name##Tab[ctxt->name##Nr] = 0; \
99 return(ret); \
100} \
101
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000102/* PUSH_AND_POP(static, xmlNodePtr, node) */
103PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000104
/*
 * Macros for accessing the content. They are meant to be used only by the
 * parser and must not be exported. All of them expect a parser context
 * variable named `ctxt` to be in scope.
 *
 * Byte-level macros. They look at raw bytes of the input buffer, so they
 * should only be used to compare against, or skip over, ASCII:
 *
 *   CUR_PTR   the current input pointer, ctxt->input->cur.
 *   CUR       the byte at the current position, as an int.
 *   CURRENT   same expansion as CUR.
 *   RAW       the byte at the current position, or -1 while ctxt->token
 *             is non-zero.
 *   NXT(n)    the byte n positions after the current one.
 *   UPPER     the current byte passed through toupper().
 *   UPP(n)    NXT(n) passed through toupper().
 *   SKIP(n)   advance both the input pointer and ctxt->nbChars by n.
 *
 * Character-level and buffer helpers:
 *
 *   NEXT               xmlNextChar(ctxt), then ctxt->nbChars++.
 *   NEXTL(l)           advance the input pointer by l bytes, updating
 *                      line/col from the byte being left, clearing
 *                      ctxt->token and counting one char in nbChars.
 *   CUR_CHAR(l)        htmlCurrentChar(ctxt, &l).
 *   CUR_SCHAR(s, l)    xmlStringCurrentChar(ctxt, s, &l).
 *   COPY_BUF(l,b,i,v)  append char v of encoded length l to b at index i,
 *                      advancing i. Expands to a bare if/else, so it must
 *                      be used as a full statement.
 *   SKIP_BLANKS        htmlSkipBlankChars(ctxt).
 *   SHRINK / GROW      xmlParserInputShrink() / xmlParserInputGrow() on
 *                      the current input.
 */

#define CUR_PTR (ctxt->input->cur)

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
181
182/**
183 * htmlCurrentChar:
184 * @ctxt: the HTML parser context
185 * @len: pointer to the length of the char read
186 *
187 * The current char value, if using UTF-8 this may actaully span multiple
188 * bytes in the input buffer. Implement the end of line normalization:
189 * 2.11 End-of-Line Handling
190 * If the encoding is unspecified, in the case we find an ISO-Latin-1
191 * char, then the encoding converter is plugged in automatically.
192 *
193 * Returns the current char value and its lenght
194 */
195
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000196static int
Owen Taylor3473f882001-02-23 17:55:21 +0000197htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
198 if (ctxt->instate == XML_PARSER_EOF)
199 return(0);
200
201 if (ctxt->token != 0) {
202 *len = 0;
203 return(ctxt->token);
204 }
205 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
206 /*
207 * We are supposed to handle UTF8, check it's valid
208 * From rfc2044: encoding of the Unicode values on UTF-8:
209 *
210 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
211 * 0000 0000-0000 007F 0xxxxxxx
212 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
213 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
214 *
215 * Check for the 0x110000 limit too
216 */
217 const unsigned char *cur = ctxt->input->cur;
218 unsigned char c;
219 unsigned int val;
220
221 c = *cur;
222 if (c & 0x80) {
223 if (cur[1] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[1] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xe0) == 0xe0) {
228
229 if (cur[2] == 0)
230 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
231 if ((cur[2] & 0xc0) != 0x80)
232 goto encoding_error;
233 if ((c & 0xf0) == 0xf0) {
234 if (cur[3] == 0)
235 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
236 if (((c & 0xf8) != 0xf0) ||
237 ((cur[3] & 0xc0) != 0x80))
238 goto encoding_error;
239 /* 4-byte code */
240 *len = 4;
241 val = (cur[0] & 0x7) << 18;
242 val |= (cur[1] & 0x3f) << 12;
243 val |= (cur[2] & 0x3f) << 6;
244 val |= cur[3] & 0x3f;
245 } else {
246 /* 3-byte code */
247 *len = 3;
248 val = (cur[0] & 0xf) << 12;
249 val |= (cur[1] & 0x3f) << 6;
250 val |= cur[2] & 0x3f;
251 }
252 } else {
253 /* 2-byte code */
254 *len = 2;
255 val = (cur[0] & 0x1f) << 6;
256 val |= cur[1] & 0x3f;
257 }
258 if (!IS_CHAR(val)) {
259 ctxt->errNo = XML_ERR_INVALID_ENCODING;
260 if ((ctxt->sax != NULL) &&
261 (ctxt->sax->error != NULL))
262 ctxt->sax->error(ctxt->userData,
263 "Char 0x%X out of allowed range\n", val);
264 ctxt->wellFormed = 0;
265 ctxt->disableSAX = 1;
266 }
267 return(val);
268 } else {
269 /* 1-byte code */
270 *len = 1;
271 return((int) *ctxt->input->cur);
272 }
273 }
274 /*
275 * Assume it's a fixed lenght encoding (1) with
276 * a compatibke encoding for the ASCII set, since
277 * XML constructs only use < 128 chars
278 */
279 *len = 1;
280 if ((int) *ctxt->input->cur < 0x80)
281 return((int) *ctxt->input->cur);
282
283 /*
284 * Humm this is bad, do an automatic flow conversion
285 */
286 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
287 ctxt->charset = XML_CHAR_ENCODING_UTF8;
288 return(xmlCurrentChar(ctxt, len));
289
290encoding_error:
291 /*
292 * If we detect an UTF8 error that probably mean that the
293 * input encoding didn't get properly advertized in the
294 * declaration header. Report the error and switch the encoding
295 * to ISO-Latin-1 (if you don't like this policy, just declare the
296 * encoding !)
297 */
298 ctxt->errNo = XML_ERR_INVALID_ENCODING;
299 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
300 ctxt->sax->error(ctxt->userData,
301 "Input is not proper UTF-8, indicate encoding !\n");
302 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
303 ctxt->input->cur[0], ctxt->input->cur[1],
304 ctxt->input->cur[2], ctxt->input->cur[3]);
305 }
306
307 ctxt->charset = XML_CHAR_ENCODING_8859_1;
308 *len = 1;
309 return((int) *ctxt->input->cur);
310}
311
312/**
Owen Taylor3473f882001-02-23 17:55:21 +0000313 * htmlSkipBlankChars:
314 * @ctxt: the HTML parser context
315 *
316 * skip all blanks character found at that point in the input streams.
317 *
318 * Returns the number of space chars skipped
319 */
320
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000321static int
Owen Taylor3473f882001-02-23 17:55:21 +0000322htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
323 int res = 0;
324
325 while (IS_BLANK(*(ctxt->input->cur))) {
326 if ((*ctxt->input->cur == 0) &&
327 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
328 xmlPopInput(ctxt);
329 } else {
330 if (*(ctxt->input->cur) == '\n') {
331 ctxt->input->line++; ctxt->input->col = 1;
332 } else ctxt->input->col++;
333 ctxt->input->cur++;
334 ctxt->nbChars++;
335 if (*ctxt->input->cur == 0)
336 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
337 }
338 res++;
339 }
340 return(res);
341}
342
343
344
345/************************************************************************
346 * *
347 * The list of HTML elements and their properties *
348 * *
349 ************************************************************************/
350
351/*
352 * Start Tag: 1 means the start tag can be ommited
353 * End Tag: 1 means the end tag can be ommited
354 * 2 means it's forbidden (empty elements)
355 * Depr: this element is deprecated
356 * DTD: 1 means that this element is valid only in the Loose DTD
357 * 2 means that this element is valid only in the Frameset DTD
358 *
359 * Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
360 */
361htmlElemDesc html40ElementTable[] = {
362{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
363{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
364{ "acronym", 0, 0, 0, 0, 0, 0, "" },
365{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
366{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
367{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
368{ "b", 0, 0, 0, 0, 0, 0, "bold text style" },
369{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
370{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
371{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
372{ "big", 0, 0, 0, 0, 0, 0, "large text style" },
373{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
374{ "body", 1, 1, 0, 0, 0, 0, "document body " },
375{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
376{ "button", 0, 0, 0, 0, 0, 0, "push button " },
377{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
378{ "center", 0, 0, 0, 0, 1, 1, "shorthand for div align=center " },
379{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
380{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
381{ "col", 0, 2, 2, 1, 0, 0, "table column " },
382{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
383{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
384{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
385{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
386{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
387{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
388{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
389{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
390{ "em", 0, 0, 0, 0, 0, 0, "emphasis" },
391{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
392{ "font", 0, 0, 0, 0, 1, 1, "local change to font " },
393{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
394{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
395{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
396{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
397{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
398{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
399{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
400{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
401{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
402{ "head", 1, 1, 0, 0, 0, 0, "document head " },
403{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
404{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
405{ "i", 0, 0, 0, 0, 0, 0, "italic text style" },
406{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
407{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
408{ "input", 0, 2, 2, 1, 0, 0, "form control " },
409{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
410{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
411{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
412{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
413{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
414{ "li", 0, 1, 1, 0, 0, 0, "list item " },
415{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
416{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
417{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
418{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
419{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
420{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
421{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
422{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
423{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
424{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
425{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
426{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
427{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
428{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
429{ "s", 0, 0, 0, 0, 1, 1, "strike-through text style" },
430{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
431{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
432{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
433{ "small", 0, 0, 0, 0, 0, 0, "small text style" },
434{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
435{ "strike", 0, 0, 0, 0, 1, 1, "strike-through text" },
436{ "strong", 0, 0, 0, 0, 0, 0, "strong emphasis" },
437{ "style", 0, 0, 0, 0, 0, 0, "style info " },
438{ "sub", 0, 0, 0, 0, 0, 0, "subscript" },
439{ "sup", 0, 0, 0, 0, 0, 0, "superscript " },
440{ "table", 0, 0, 0, 0, 0, 0, "&#160;" },
441{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
442{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
443{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
444{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
445{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
446{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
447{ "title", 0, 0, 0, 0, 0, 0, "document title " },
448{ "tr", 0, 1, 0, 0, 0, 0, "table row " },
449{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
450{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
451{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
452{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
453};
454
/*
 * Groups of elements whose start tags imply the end of the current
 * element: any tag of a group implies the end of the current element if
 * the type of that element belongs to the same group.
 * Each group is terminated by NULL and the whole list by an extra NULL.
 *
 * According to the HTML DTD, HR should be added to the headings group
 * since it is not allowed within H1, H2, H3, etc. That case is tolerated
 * on purpose because many documents contain rules in headings...
 */
const char *htmlEquEnd[] = {
    /* list and option items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block level containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of list */
    NULL
};
471
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL terminated groups, closed by one more
 * NULL. In each group the first name is the tag being opened and the
 * names after it are the elements that this start tag closes. The groups
 * are indexed by htmlInitAutoClose() and searched by htmlCheckAutoClose().
 */
const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head",
        NULL,
    "head",
        "p",
        NULL,
    "title",
        "p",
        NULL,
    "body",
        "head", "style", "link", "title", "p",
        NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li",
        NULL,
    "hr",
        "p", "head",
        NULL,
    "h1",
        "p", "head",
        NULL,
    "h2",
        "p", "head",
        NULL,
    "h3",
        "p", "head",
        NULL,
    "h4",
        "p", "head",
        NULL,
    "h5",
        "p", "head",
        NULL,
    "h6",
        "p", "head",
        NULL,
    "dir",
        "p", "head",
        NULL,
    "address",
        "p", "head", "ul",
        NULL,
    "pre",
        "p", "head", "ul",
        NULL,
    "listing",
        "p", "head",
        NULL,
    "xmp",
        "p", "head",
        NULL,
    "blockquote",
        "p", "head",
        NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head",
        NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd",
        NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt",
        NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp",
        NULL,
    "ol",
        "p", "head", "ul",
        NULL,
    "menu",
        "p", "head", "ul",
        NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        NULL,
    "div",
        "p", "head",
        NULL,
    "noscript",
        "p", "head",
        NULL,
    "center",
        "font", "b", "i", "p", "head",
        NULL,
    "a",
        "a",
        NULL,
    "caption",
        "p",
        NULL,
    "colgroup",
        "caption", "colgroup", "col", "p",
        NULL,
    "col",
        "caption", "col", "p",
        NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a",
        NULL,
    "th",
        "th", "td",
        NULL,
    "td",
        "th", "td", "p",
        NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p",
        NULL,
    "thead",
        "caption", "col", "colgroup",
        NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p",
        NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p",
        NULL,
    "optgroup",
        "option",
        NULL,
    "option",
        "option",
        NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a",
        NULL,
    /* end of list */
    NULL
};
531
/*
 * NULL terminated list of the HTML elements which are not supposed to
 * have CDATA content: htmlCheckParagraph() implies a p element when one
 * of them is the current element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
545
/*
 * The list of HTML attributes which are of content %Script; (the HTML 4
 * intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator: it is walked using its sizeof.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", so never matched */
    "onchange",
    "onselect"
};
571
572
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * name of one group. Filled once by htmlInitAutoClose(), which also sets
 * the flag below.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
575
576/************************************************************************
577 * *
578 * functions to handle HTML specific data *
579 * *
580 ************************************************************************/
581
582/**
583 * htmlInitAutoClose:
584 *
585 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
586 * This is not reentrant. Call xmlInitParser() once before processing in
587 * case of use in multithreaded programs.
588 */
589void
590htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000591 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000592
593 if (htmlStartCloseIndexinitialized) return;
594
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000595 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
596 indx = 0;
597 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
598 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000599 while (htmlStartClose[i] != NULL) i++;
600 i++;
601 }
602 htmlStartCloseIndexinitialized = 1;
603}
604
605/**
606 * htmlTagLookup:
607 * @tag: The tag name in lowercase
608 *
609 * Lookup the HTML tag in the ElementTable
610 *
611 * Returns the related htmlElemDescPtr or NULL if not found.
612 */
613htmlElemDescPtr
614htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000615 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000616
617 for (i = 0; i < (sizeof(html40ElementTable) /
618 sizeof(html40ElementTable[0]));i++) {
619 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
620 return(&html40ElementTable[i]);
621 }
622 return(NULL);
623}
624
625/**
626 * htmlCheckAutoClose:
627 * @newtag: The new tag name
628 * @oldtag: The old tag name
629 *
630 * Checks wether the new tag is one of the registered valid tags for closing old.
631 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
632 *
633 * Returns 0 if no, 1 if yes.
634 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000635static int
Owen Taylor3473f882001-02-23 17:55:21 +0000636htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000637 int i, indx;
638 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000639
640 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
641
642 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000643 for (indx = 0; indx < 100;indx++) {
644 closed = htmlStartCloseIndex[indx];
645 if (closed == NULL) return(0);
646 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000647 }
648
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000649 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000650 i++;
651 while (htmlStartClose[i] != NULL) {
652 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
653 return(1);
654 }
655 i++;
656 }
657 return(0);
658}
659
660/**
661 * htmlAutoCloseOnClose:
662 * @ctxt: an HTML parser context
663 * @newtag: The new tag name
664 *
665 * The HTmL DtD allows an ending tag to implicitely close other tags.
666 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000667static void
Owen Taylor3473f882001-02-23 17:55:21 +0000668htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
669 htmlElemDescPtr info;
670 xmlChar *oldname;
671 int i;
672
673#ifdef DEBUG
674 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
675 for (i = 0;i < ctxt->nameNr;i++)
676 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
677#endif
678
679 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
680 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
681 }
682 if (i < 0) return;
683
684 while (!xmlStrEqual(newtag, ctxt->name)) {
685 info = htmlTagLookup(ctxt->name);
686 if ((info == NULL) || (info->endTag == 1)) {
687#ifdef DEBUG
688 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
689#endif
690 } else {
691 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
692 ctxt->sax->error(ctxt->userData,
693 "Opening and ending tag mismatch: %s and %s\n",
694 newtag, ctxt->name);
695 ctxt->wellFormed = 0;
696 }
697 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
698 ctxt->sax->endElement(ctxt->userData, ctxt->name);
699 oldname = htmlnamePop(ctxt);
700 if (oldname != NULL) {
701#ifdef DEBUG
702 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
703#endif
704 xmlFree(oldname);
705 }
706 }
707}
708
709/**
710 * htmlAutoClose:
711 * @ctxt: an HTML parser context
712 * @newtag: The new tag name or NULL
713 *
714 * The HTmL DtD allows a tag to implicitely close other tags.
715 * The list is kept in htmlStartClose array. This function is
716 * called when a new tag has been detected and generates the
717 * appropriates closes if possible/needed.
718 * If newtag is NULL this mean we are at the end of the resource
719 * and we should check
720 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000721static void
Owen Taylor3473f882001-02-23 17:55:21 +0000722htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
723 xmlChar *oldname;
724 while ((newtag != NULL) && (ctxt->name != NULL) &&
725 (htmlCheckAutoClose(newtag, ctxt->name))) {
726#ifdef DEBUG
727 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
728#endif
729 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
730 ctxt->sax->endElement(ctxt->userData, ctxt->name);
731 oldname = htmlnamePop(ctxt);
732 if (oldname != NULL) {
733#ifdef DEBUG
734 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
735#endif
736 xmlFree(oldname);
737 }
738 }
739 if (newtag == NULL) {
740 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
741 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
742 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
743 }
744 while ((newtag == NULL) && (ctxt->name != NULL) &&
745 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
746 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
747 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
748#ifdef DEBUG
749 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
750#endif
751 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
752 ctxt->sax->endElement(ctxt->userData, ctxt->name);
753 oldname = htmlnamePop(ctxt);
754 if (oldname != NULL) {
755#ifdef DEBUG
756 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
757#endif
758 xmlFree(oldname);
759 }
760 }
761
762}
763
764/**
765 * htmlAutoCloseTag:
766 * @doc: the HTML document
767 * @name: The tag name
768 * @elem: the HTML element
769 *
770 * The HTmL DtD allows a tag to implicitely close other tags.
771 * The list is kept in htmlStartClose array. This function checks
772 * if the element or one of it's children would autoclose the
773 * given tag.
774 *
775 * Returns 1 if autoclose, 0 otherwise
776 */
777int
778htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
779 htmlNodePtr child;
780
781 if (elem == NULL) return(1);
782 if (xmlStrEqual(name, elem->name)) return(0);
783 if (htmlCheckAutoClose(elem->name, name)) return(1);
784 child = elem->children;
785 while (child != NULL) {
786 if (htmlAutoCloseTag(doc, name, child)) return(1);
787 child = child->next;
788 }
789 return(0);
790}
791
792/**
793 * htmlIsAutoClosed:
794 * @doc: the HTML document
795 * @elem: the HTML element
796 *
797 * The HTmL DtD allows a tag to implicitely close other tags.
798 * The list is kept in htmlStartClose array. This function checks
799 * if a tag is autoclosed by one of it's child
800 *
801 * Returns 1 if autoclosed, 0 otherwise
802 */
803int
804htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
805 htmlNodePtr child;
806
807 if (elem == NULL) return(1);
808 child = elem->children;
809 while (child != NULL) {
810 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
811 child = child->next;
812 }
813 return(0);
814}
815
816/**
817 * htmlCheckImplied:
818 * @ctxt: an HTML parser context
819 * @newtag: The new tag name
820 *
821 * The HTML DtD allows a tag to exists only implicitely
822 * called when a new tag has been detected and generates the
823 * appropriates implicit tags if missing
824 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000825static void
Owen Taylor3473f882001-02-23 17:55:21 +0000826htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
827 if (!htmlOmittedDefaultValue)
828 return;
829 if (xmlStrEqual(newtag, BAD_CAST"html"))
830 return;
831 if (ctxt->nameNr <= 0) {
832#ifdef DEBUG
833 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
834#endif
835 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
836 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
837 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
838 }
839 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
840 return;
841 if ((ctxt->nameNr <= 1) &&
842 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
843 (xmlStrEqual(newtag, BAD_CAST"style")) ||
844 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
845 (xmlStrEqual(newtag, BAD_CAST"link")) ||
846 (xmlStrEqual(newtag, BAD_CAST"title")) ||
847 (xmlStrEqual(newtag, BAD_CAST"base")))) {
848 /*
849 * dropped OBJECT ... i you put it first BODY will be
850 * assumed !
851 */
852#ifdef DEBUG
853 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
854#endif
855 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
856 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
857 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
858 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
859 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
860 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
861 int i;
862 for (i = 0;i < ctxt->nameNr;i++) {
863 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
864 return;
865 }
866 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
867 return;
868 }
869 }
870
871#ifdef DEBUG
872 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
873#endif
874 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
875 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
876 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
877 }
878}
879
880/**
881 * htmlCheckParagraph
882 * @ctxt: an HTML parser context
883 *
884 * Check whether a p element need to be implied before inserting
885 * characters in the current element.
886 *
887 * Returns 1 if a paragraph has been inserted, 0 if not and -1
888 * in case of error.
889 */
890
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000891static int
Owen Taylor3473f882001-02-23 17:55:21 +0000892htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
893 const xmlChar *tag;
894 int i;
895
896 if (ctxt == NULL)
897 return(-1);
898 tag = ctxt->name;
899 if (tag == NULL) {
900 htmlAutoClose(ctxt, BAD_CAST"p");
901 htmlCheckImplied(ctxt, BAD_CAST"p");
902 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
903 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
904 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
905 return(1);
906 }
907 if (!htmlOmittedDefaultValue)
908 return(0);
909 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
910 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
911#ifdef DEBUG
912 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
913#endif
914 htmlAutoClose(ctxt, BAD_CAST"p");
915 htmlCheckImplied(ctxt, BAD_CAST"p");
916 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
917 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
918 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
919 return(1);
920 }
921 }
922 return(0);
923}
924
925/**
926 * htmlIsScriptAttribute:
927 * @name: an attribute name
928 *
929 * Check if an attribute is of content type Script
930 *
931 * Returns 1 is the attribute is a script 0 otherwise
932 */
933int
934htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000935 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000936
937 if (name == NULL)
938 return(0);
939 /*
940 * all script attributes start with 'on'
941 */
942 if ((name[0] != 'o') || (name[1] != 'n'))
943 return(0);
944 for (i = 0;
945 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
946 i++) {
947 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
948 return(1);
949 }
950 return(0);
951}
952
953/************************************************************************
954 * *
955 * The list of HTML predefined entities *
956 * *
957 ************************************************************************/
958
959
960htmlEntityDesc html40EntitiesTable[] = {
961/*
962 * the 4 absolute ones, plus apostrophe.
963 */
964{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
965{ 38, "amp", "ampersand, U+0026 ISOnum" },
966{ 39, "apos", "single quote" },
967{ 60, "lt", "less-than sign, U+003C ISOnum" },
968{ 62, "gt", "greater-than sign, U+003E ISOnum" },
969
970/*
971 * A bunch still in the 128-255 range
972 * Replacing them depend really on the charset used.
973 */
974{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
975{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
976{ 162, "cent", "cent sign, U+00A2 ISOnum" },
977{ 163, "pound","pound sign, U+00A3 ISOnum" },
978{ 164, "curren","currency sign, U+00A4 ISOnum" },
979{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
980{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
981{ 167, "sect", "section sign, U+00A7 ISOnum" },
982{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
983{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
984{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
985{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
986{ 172, "not", "not sign, U+00AC ISOnum" },
987{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
988{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
989{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
990{ 176, "deg", "degree sign, U+00B0 ISOnum" },
991{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
992{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
993{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
994{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
995{ 181, "micro","micro sign, U+00B5 ISOnum" },
996{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
997{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
998{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
999{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1000{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1001{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1002{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1003{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1004{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1005{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1006{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1007{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1008{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1009{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1010{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1011{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1012{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1013{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1014{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1015{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1016{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1017{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1018{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1019{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1020{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1021{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1022{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1023{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1024{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1025{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1026{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1027{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1028{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1029{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1030{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1031{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1032{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1033{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1034{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1035{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1036{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1037{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1038{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1039{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1040{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1041{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1042{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1043{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1044{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1045{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1046{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1047{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1048{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1049{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1050{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1051{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1052{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1053{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1054{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1055{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1056{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1057{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1058{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1059{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1060{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1061{ 247, "divide","division sign, U+00F7 ISOnum" },
1062{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1063{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1064{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1065{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1066{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1067{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1068{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1069{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1070
1071{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1072{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1073{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1074{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1075{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1076
1077/*
1078 * Anything below should really be kept as entities references
1079 */
1080{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1081
1082{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1083{ 732, "tilde","small tilde, U+02DC ISOdia" },
1084
1085{ 913, "Alpha","greek capital letter alpha, U+0391" },
1086{ 914, "Beta", "greek capital letter beta, U+0392" },
1087{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1088{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1089{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1090{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1091{ 919, "Eta", "greek capital letter eta, U+0397" },
1092{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1093{ 921, "Iota", "greek capital letter iota, U+0399" },
1094{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001095{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001096{ 924, "Mu", "greek capital letter mu, U+039C" },
1097{ 925, "Nu", "greek capital letter nu, U+039D" },
1098{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1099{ 927, "Omicron","greek capital letter omicron, U+039F" },
1100{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1101{ 929, "Rho", "greek capital letter rho, U+03A1" },
1102{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1103{ 932, "Tau", "greek capital letter tau, U+03A4" },
1104{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1105{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1106{ 935, "Chi", "greek capital letter chi, U+03A7" },
1107{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1108{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1109
1110{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1111{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1112{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1113{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1114{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1115{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1116{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1117{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1118{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1119{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1120{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1121{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1122{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1123{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1124{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1125{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1126{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1127{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1128{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1129{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1130{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1131{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1132{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1133{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1134{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1135{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1136{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1137{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1138
1139{ 8194, "ensp", "en space, U+2002 ISOpub" },
1140{ 8195, "emsp", "em space, U+2003 ISOpub" },
1141{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1142{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1143{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1144{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1145{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1146{ 8211, "ndash","en dash, U+2013 ISOpub" },
1147{ 8212, "mdash","em dash, U+2014 ISOpub" },
1148{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1149{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1150{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1151{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1152{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1153{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1154{ 8224, "dagger","dagger, U+2020 ISOpub" },
1155{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1156
1157{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1158{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1159
1160{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1161
1162{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1163{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1164
1165{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1166{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1167
1168{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1169{ 8260, "frasl","fraction slash, U+2044 NEW" },
1170
1171{ 8364, "euro", "euro sign, U+20AC NEW" },
1172
1173{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1174{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1175{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1176{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1177{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1178{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1179{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1180{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1181{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1182{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1183{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1184{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1185{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1186{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1187{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1188{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1189
1190{ 8704, "forall","for all, U+2200 ISOtech" },
1191{ 8706, "part", "partial differential, U+2202 ISOtech" },
1192{ 8707, "exist","there exists, U+2203 ISOtech" },
1193{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1194{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1195{ 8712, "isin", "element of, U+2208 ISOtech" },
1196{ 8713, "notin","not an element of, U+2209 ISOtech" },
1197{ 8715, "ni", "contains as member, U+220B ISOtech" },
1198{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1199{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1200{ 8722, "minus","minus sign, U+2212 ISOtech" },
1201{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1202{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1203{ 8733, "prop", "proportional to, U+221D ISOtech" },
1204{ 8734, "infin","infinity, U+221E ISOtech" },
1205{ 8736, "ang", "angle, U+2220 ISOamso" },
1206{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1207{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1208{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1209{ 8746, "cup", "union = cup, U+222A ISOtech" },
1210{ 8747, "int", "integral, U+222B ISOtech" },
1211{ 8756, "there4","therefore, U+2234 ISOtech" },
1212{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1213{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1214{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1215{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1216{ 8801, "equiv","identical to, U+2261 ISOtech" },
1217{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1218{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1219{ 8834, "sub", "subset of, U+2282 ISOtech" },
1220{ 8835, "sup", "superset of, U+2283 ISOtech" },
1221{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1222{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1223{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1224{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1225{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1226{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1227{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1228{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1229{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1230{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1231{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1232{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1233{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1234{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1235
1236{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1237{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1238{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1239{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1240
1241};
1242
1243/************************************************************************
1244 * *
1245 * Commodity functions to handle entities *
1246 * *
1247 ************************************************************************/
1248
/*
 * Macro used to grow the current buffer (its size is doubled).
 *
 * @buffer must be the name of an xmlChar * variable that has a companion
 * size variable called <buffer>_size, and the macro must be expanded in a
 * function returning a pointer: on allocation failure that function
 * returns NULL.
 *
 * The result of xmlRealloc() goes through a temporary so that the previous
 * block is still reachable when the reallocation fails; it is then released
 * before returning instead of being leaked (this assumes xmlRealloc()
 * follows realloc() and leaves the old block allocated on failure).
 */
#define growBuffer(buffer) {						\
    xmlChar *grown_##buffer;						\
    buffer##_size *= 2;							\
    grown_##buffer = (xmlChar *)					\
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));		\
    if (grown_##buffer == NULL) {					\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = grown_##buffer;						\
}
1260
1261/**
1262 * htmlEntityLookup:
1263 * @name: the entity name
1264 *
1265 * Lookup the given entity in EntitiesTable
1266 *
1267 * TODO: the linear scan is really ugly, an hash table is really needed.
1268 *
1269 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1270 */
1271htmlEntityDescPtr
1272htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001273 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001274
1275 for (i = 0;i < (sizeof(html40EntitiesTable)/
1276 sizeof(html40EntitiesTable[0]));i++) {
1277 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1278#ifdef DEBUG
1279 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1280#endif
1281 return(&html40EntitiesTable[i]);
1282 }
1283 }
1284 return(NULL);
1285}
1286
1287/**
1288 * htmlEntityValueLookup:
1289 * @value: the entity's unicode value
1290 *
1291 * Lookup the given entity in EntitiesTable
1292 *
1293 * TODO: the linear scan is really ugly, an hash table is really needed.
1294 *
1295 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1296 */
1297htmlEntityDescPtr
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001298htmlEntityValueLookup(unsigned int value) {
1299 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001300#ifdef DEBUG
1301 int lv = 0;
1302#endif
1303
1304 for (i = 0;i < (sizeof(html40EntitiesTable)/
1305 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001306 if (html40EntitiesTable[i].value >= value) {
1307 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001308 break;
1309#ifdef DEBUG
1310 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1311#endif
1312 return(&html40EntitiesTable[i]);
1313 }
1314#ifdef DEBUG
1315 if (lv > html40EntitiesTable[i].value) {
1316 xmlGenericError(xmlGenericErrorContext,
1317 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1318 lv, html40EntitiesTable[i].value);
1319 }
1320 lv = html40EntitiesTable[i].value;
1321#endif
1322 }
1323 return(NULL);
1324}
1325
1326/**
1327 * UTF8ToHtml:
1328 * @out: a pointer to an array of bytes to store the result
1329 * @outlen: the length of @out
1330 * @in: a pointer to an array of UTF-8 chars
1331 * @inlen: the length of @in
1332 *
1333 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1334 * plus HTML entities block of chars out.
1335 *
1336 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1337 * The value of @inlen after return is the number of octets consumed
1338 * as the return value is positive, else unpredictiable.
1339 * The value of @outlen after return is the number of octets consumed.
1340 */
1341int
1342UTF8ToHtml(unsigned char* out, int *outlen,
1343 const unsigned char* in, int *inlen) {
1344 const unsigned char* processed = in;
1345 const unsigned char* outend;
1346 const unsigned char* outstart = out;
1347 const unsigned char* instart = in;
1348 const unsigned char* inend;
1349 unsigned int c, d;
1350 int trailing;
1351
1352 if (in == NULL) {
1353 /*
1354 * initialization nothing to do
1355 */
1356 *outlen = 0;
1357 *inlen = 0;
1358 return(0);
1359 }
1360 inend = in + (*inlen);
1361 outend = out + (*outlen);
1362 while (in < inend) {
1363 d = *in++;
1364 if (d < 0x80) { c= d; trailing= 0; }
1365 else if (d < 0xC0) {
1366 /* trailing byte in leading position */
1367 *outlen = out - outstart;
1368 *inlen = processed - instart;
1369 return(-2);
1370 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1371 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1372 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1373 else {
1374 /* no chance for this in Ascii */
1375 *outlen = out - outstart;
1376 *inlen = processed - instart;
1377 return(-2);
1378 }
1379
1380 if (inend - in < trailing) {
1381 break;
1382 }
1383
1384 for ( ; trailing; trailing--) {
1385 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1386 break;
1387 c <<= 6;
1388 c |= d & 0x3F;
1389 }
1390
1391 /* assertion: c is a single UTF-4 value */
1392 if (c < 0x80) {
1393 if (out + 1 >= outend)
1394 break;
1395 *out++ = c;
1396 } else {
1397 int len;
1398 htmlEntityDescPtr ent;
1399
1400 /*
1401 * Try to lookup a predefined HTML entity for it
1402 */
1403
1404 ent = htmlEntityValueLookup(c);
1405 if (ent == NULL) {
1406 /* no chance for this in Ascii */
1407 *outlen = out - outstart;
1408 *inlen = processed - instart;
1409 return(-2);
1410 }
1411 len = strlen(ent->name);
1412 if (out + 2 + len >= outend)
1413 break;
1414 *out++ = '&';
1415 memcpy(out, ent->name, len);
1416 out += len;
1417 *out++ = ';';
1418 }
1419 processed = in;
1420 }
1421 *outlen = out - outstart;
1422 *inlen = processed - instart;
1423 return(0);
1424}
1425
1426/**
1427 * htmlEncodeEntities:
1428 * @out: a pointer to an array of bytes to store the result
1429 * @outlen: the length of @out
1430 * @in: a pointer to an array of UTF-8 chars
1431 * @inlen: the length of @in
1432 * @quoteChar: the quote character to escape (' or ") or zero.
1433 *
1434 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1435 * plus HTML entities block of chars out.
1436 *
1437 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1438 * The value of @inlen after return is the number of octets consumed
1439 * as the return value is positive, else unpredictiable.
1440 * The value of @outlen after return is the number of octets consumed.
1441 */
1442int
1443htmlEncodeEntities(unsigned char* out, int *outlen,
1444 const unsigned char* in, int *inlen, int quoteChar) {
1445 const unsigned char* processed = in;
1446 const unsigned char* outend = out + (*outlen);
1447 const unsigned char* outstart = out;
1448 const unsigned char* instart = in;
1449 const unsigned char* inend = in + (*inlen);
1450 unsigned int c, d;
1451 int trailing;
1452
1453 while (in < inend) {
1454 d = *in++;
1455 if (d < 0x80) { c= d; trailing= 0; }
1456 else if (d < 0xC0) {
1457 /* trailing byte in leading position */
1458 *outlen = out - outstart;
1459 *inlen = processed - instart;
1460 return(-2);
1461 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1462 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1463 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1464 else {
1465 /* no chance for this in Ascii */
1466 *outlen = out - outstart;
1467 *inlen = processed - instart;
1468 return(-2);
1469 }
1470
1471 if (inend - in < trailing)
1472 break;
1473
1474 while (trailing--) {
1475 if (((d= *in++) & 0xC0) != 0x80) {
1476 *outlen = out - outstart;
1477 *inlen = processed - instart;
1478 return(-2);
1479 }
1480 c <<= 6;
1481 c |= d & 0x3F;
1482 }
1483
1484 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001485 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1486 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001487 if (out >= outend)
1488 break;
1489 *out++ = c;
1490 } else {
1491 htmlEntityDescPtr ent;
1492 const char *cp;
1493 char nbuf[16];
1494 int len;
1495
1496 /*
1497 * Try to lookup a predefined HTML entity for it
1498 */
1499 ent = htmlEntityValueLookup(c);
1500 if (ent == NULL) {
1501 sprintf(nbuf, "#%u", c);
1502 cp = nbuf;
1503 }
1504 else
1505 cp = ent->name;
1506 len = strlen(cp);
1507 if (out + 2 + len > outend)
1508 break;
1509 *out++ = '&';
1510 memcpy(out, cp, len);
1511 out += len;
1512 *out++ = ';';
1513 }
1514 processed = in;
1515 }
1516 *outlen = out - outstart;
1517 *inlen = processed - instart;
1518 return(0);
1519}
1520
1521/**
1522 * htmlDecodeEntities:
1523 * @ctxt: the parser context
1524 * @len: the len to decode (in bytes !), -1 for no size limit
1525 * @end: an end marker xmlChar, 0 if none
1526 * @end2: an end marker xmlChar, 0 if none
1527 * @end3: an end marker xmlChar, 0 if none
1528 *
1529 * Subtitute the HTML entities by their value
1530 *
1531 * DEPRECATED !!!!
1532 *
1533 * Returns A newly allocated string with the substitution done. The caller
1534 * must deallocate it !
1535 */
1536xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001537htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1538 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001539 static int deprecated = 0;
1540 if (!deprecated) {
1541 xmlGenericError(xmlGenericErrorContext,
1542 "htmlDecodeEntities() deprecated function reached\n");
1543 deprecated = 1;
1544 }
1545 return(NULL);
1546#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001547 xmlChar *name = NULL;
1548 xmlChar *buffer = NULL;
1549 unsigned int buffer_size = 0;
1550 unsigned int nbchars = 0;
1551 htmlEntityDescPtr ent;
1552 unsigned int max = (unsigned int) len;
1553 int c,l;
1554
1555 if (ctxt->depth > 40) {
1556 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1557 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1558 ctxt->sax->error(ctxt->userData,
1559 "Detected entity reference loop\n");
1560 ctxt->wellFormed = 0;
1561 ctxt->disableSAX = 1;
1562 return(NULL);
1563 }
1564
1565 /*
1566 * allocate a translation buffer.
1567 */
1568 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1569 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1570 if (buffer == NULL) {
1571 perror("xmlDecodeEntities: malloc failed");
1572 return(NULL);
1573 }
1574
1575 /*
1576 * Ok loop until we reach one of the ending char or a size limit.
1577 */
1578 c = CUR_CHAR(l);
1579 while ((nbchars < max) && (c != end) &&
1580 (c != end2) && (c != end3)) {
1581
1582 if (c == 0) break;
1583 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1584 int val = htmlParseCharRef(ctxt);
1585 COPY_BUF(0,buffer,nbchars,val);
1586 NEXTL(l);
1587 } else if ((c == '&') && (ctxt->token != '&')) {
1588 ent = htmlParseEntityRef(ctxt, &name);
1589 if (name != NULL) {
1590 if (ent != NULL) {
1591 int val = ent->value;
1592 COPY_BUF(0,buffer,nbchars,val);
1593 NEXTL(l);
1594 } else {
1595 const xmlChar *cur = name;
1596
1597 buffer[nbchars++] = '&';
1598 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1599 growBuffer(buffer);
1600 }
1601 while (*cur != 0) {
1602 buffer[nbchars++] = *cur++;
1603 }
1604 buffer[nbchars++] = ';';
1605 }
1606 }
1607 } else {
1608 COPY_BUF(l,buffer,nbchars,c);
1609 NEXTL(l);
1610 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1611 growBuffer(buffer);
1612 }
1613 }
1614 c = CUR_CHAR(l);
1615 }
1616 buffer[nbchars++] = 0;
1617 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001618#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001619}
1620
1621/************************************************************************
1622 * *
1623 * Commodity functions to handle streams *
1624 * *
1625 ************************************************************************/
1626
1627/**
Owen Taylor3473f882001-02-23 17:55:21 +00001628 * htmlNewInputStream:
1629 * @ctxt: an HTML parser context
1630 *
1631 * Create a new input stream structure
1632 * Returns the new input stream or NULL
1633 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001634static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001635htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1636 htmlParserInputPtr input;
1637
1638 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1639 if (input == NULL) {
1640 ctxt->errNo = XML_ERR_NO_MEMORY;
1641 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1642 ctxt->sax->error(ctxt->userData,
1643 "malloc: couldn't allocate a new input stream\n");
1644 return(NULL);
1645 }
1646 memset(input, 0, sizeof(htmlParserInput));
1647 input->filename = NULL;
1648 input->directory = NULL;
1649 input->base = NULL;
1650 input->cur = NULL;
1651 input->buf = NULL;
1652 input->line = 1;
1653 input->col = 1;
1654 input->buf = NULL;
1655 input->free = NULL;
1656 input->version = NULL;
1657 input->consumed = 0;
1658 input->length = 0;
1659 return(input);
1660}
1661
1662
1663/************************************************************************
1664 * *
1665 * Commodity functions, cleanup needed ? *
1666 * *
1667 ************************************************************************/
1668
1669/**
1670 * areBlanks:
1671 * @ctxt: an HTML parser context
1672 * @str: a xmlChar *
1673 * @len: the size of @str
1674 *
1675 * Is this a sequence of blank chars that one can ignore ?
1676 *
1677 * Returns 1 if ignorable 0 otherwise.
1678 */
1679
1680static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1681 int i;
1682 xmlNodePtr lastChild;
1683
1684 for (i = 0;i < len;i++)
1685 if (!(IS_BLANK(str[i]))) return(0);
1686
1687 if (CUR == 0) return(1);
1688 if (CUR != '<') return(0);
1689 if (ctxt->name == NULL)
1690 return(1);
1691 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1692 return(1);
1693 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1694 return(1);
1695 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1696 return(1);
1697 if (ctxt->node == NULL) return(0);
1698 lastChild = xmlGetLastChild(ctxt->node);
1699 if (lastChild == NULL) {
1700 if (ctxt->node->content != NULL) return(0);
1701 } else if (xmlNodeIsText(lastChild)) {
1702 return(0);
1703 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1704 return(0);
1705 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1706 return(0);
1707 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1708 return(0);
1709 }
1710 return(1);
1711}
1712
1713/**
Owen Taylor3473f882001-02-23 17:55:21 +00001714 * htmlNewDocNoDtD:
1715 * @URI: URI for the dtd, or NULL
1716 * @ExternalID: the external ID of the DTD, or NULL
1717 *
1718 * Returns a new document, do not intialize the DTD if not provided
1719 */
1720htmlDocPtr
1721htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1722 xmlDocPtr cur;
1723
1724 /*
1725 * Allocate a new document and fill the fields.
1726 */
1727 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1728 if (cur == NULL) {
1729 xmlGenericError(xmlGenericErrorContext,
1730 "xmlNewDoc : malloc failed\n");
1731 return(NULL);
1732 }
1733 memset(cur, 0, sizeof(xmlDoc));
1734
1735 cur->type = XML_HTML_DOCUMENT_NODE;
1736 cur->version = NULL;
1737 cur->intSubset = NULL;
1738 if ((ExternalID != NULL) ||
1739 (URI != NULL))
1740 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1741 cur->doc = cur;
1742 cur->name = NULL;
1743 cur->children = NULL;
1744 cur->extSubset = NULL;
1745 cur->oldNs = NULL;
1746 cur->encoding = NULL;
1747 cur->standalone = 1;
1748 cur->compression = 0;
1749 cur->ids = NULL;
1750 cur->refs = NULL;
1751#ifndef XML_WITHOUT_CORBA
1752 cur->_private = NULL;
1753#endif
1754 return(cur);
1755}
1756
1757/**
1758 * htmlNewDoc:
1759 * @URI: URI for the dtd, or NULL
1760 * @ExternalID: the external ID of the DTD, or NULL
1761 *
1762 * Returns a new document
1763 */
1764htmlDocPtr
1765htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1766 if ((URI == NULL) && (ExternalID == NULL))
1767 return(htmlNewDocNoDtD(
1768 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1769 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1770
1771 return(htmlNewDocNoDtD(URI, ExternalID));
1772}
1773
1774
1775/************************************************************************
1776 * *
1777 * The parser itself *
1778 * Relates to http://www.w3.org/TR/html40 *
1779 * *
1780 ************************************************************************/
1781
1782/************************************************************************
1783 * *
1784 * The parser itself *
1785 * *
1786 ************************************************************************/
1787
1788/**
1789 * htmlParseHTMLName:
1790 * @ctxt: an HTML parser context
1791 *
1792 * parse an HTML tag or attribute name, note that we convert it to lowercase
1793 * since HTML names are not case-sensitive.
1794 *
1795 * Returns the Tag Name parsed or NULL
1796 */
1797
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001798static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001799htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1800 xmlChar *ret = NULL;
1801 int i = 0;
1802 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1803
1804 if (!IS_LETTER(CUR) && (CUR != '_') &&
1805 (CUR != ':')) return(NULL);
1806
1807 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1808 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1809 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1810 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1811 else loc[i] = CUR;
1812 i++;
1813
1814 NEXT;
1815 }
1816
1817 ret = xmlStrndup(loc, i);
1818
1819 return(ret);
1820}
1821
1822/**
1823 * htmlParseName:
1824 * @ctxt: an HTML parser context
1825 *
1826 * parse an HTML name, this routine is case sensistive.
1827 *
1828 * Returns the Name parsed or NULL
1829 */
1830
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001831static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001832htmlParseName(htmlParserCtxtPtr ctxt) {
1833 xmlChar buf[HTML_MAX_NAMELEN];
1834 int len = 0;
1835
1836 GROW;
1837 if (!IS_LETTER(CUR) && (CUR != '_')) {
1838 return(NULL);
1839 }
1840
1841 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1842 (CUR == '.') || (CUR == '-') ||
1843 (CUR == '_') || (CUR == ':') ||
1844 (IS_COMBINING(CUR)) ||
1845 (IS_EXTENDER(CUR))) {
1846 buf[len++] = CUR;
1847 NEXT;
1848 if (len >= HTML_MAX_NAMELEN) {
1849 xmlGenericError(xmlGenericErrorContext,
1850 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1851 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1852 (CUR == '.') || (CUR == '-') ||
1853 (CUR == '_') || (CUR == ':') ||
1854 (IS_COMBINING(CUR)) ||
1855 (IS_EXTENDER(CUR)))
1856 NEXT;
1857 break;
1858 }
1859 }
1860 return(xmlStrndup(buf, len));
1861}
1862
1863/**
1864 * htmlParseHTMLAttribute:
1865 * @ctxt: an HTML parser context
1866 * @stop: a char stop value
1867 *
1868 * parse an HTML attribute value till the stop (quote), if
1869 * stop is 0 then it stops at the first space
1870 *
1871 * Returns the attribute parsed or NULL
1872 */
1873
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001874static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001875htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1876 xmlChar *buffer = NULL;
1877 int buffer_size = 0;
1878 xmlChar *out = NULL;
1879 xmlChar *name = NULL;
1880
1881 xmlChar *cur = NULL;
1882 htmlEntityDescPtr ent;
1883
1884 /*
1885 * allocate a translation buffer.
1886 */
1887 buffer_size = HTML_PARSER_BUFFER_SIZE;
1888 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1889 if (buffer == NULL) {
1890 perror("htmlParseHTMLAttribute: malloc failed");
1891 return(NULL);
1892 }
1893 out = buffer;
1894
1895 /*
1896 * Ok loop until we reach one of the ending chars
1897 */
1898 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1899 if ((stop == 0) && (IS_BLANK(CUR))) break;
1900 if (CUR == '&') {
1901 if (NXT(1) == '#') {
1902 unsigned int c;
1903 int bits;
1904
1905 c = htmlParseCharRef(ctxt);
1906 if (c < 0x80)
1907 { *out++ = c; bits= -6; }
1908 else if (c < 0x800)
1909 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1910 else if (c < 0x10000)
1911 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1912 else
1913 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1914
1915 for ( ; bits >= 0; bits-= 6) {
1916 *out++ = ((c >> bits) & 0x3F) | 0x80;
1917 }
1918 } else {
1919 ent = htmlParseEntityRef(ctxt, &name);
1920 if (name == NULL) {
1921 *out++ = '&';
1922 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001923 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001924
1925 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001926 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001927 }
1928 } else if (ent == NULL) {
1929 *out++ = '&';
1930 cur = name;
1931 while (*cur != 0) {
1932 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001933 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001934
1935 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001936 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001937 }
1938 *out++ = *cur++;
1939 }
1940 xmlFree(name);
1941 } else {
1942 unsigned int c;
1943 int bits;
1944
1945 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001946 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001947
1948 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001949 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001950 }
1951 c = (xmlChar)ent->value;
1952 if (c < 0x80)
1953 { *out++ = c; bits= -6; }
1954 else if (c < 0x800)
1955 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1956 else if (c < 0x10000)
1957 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1958 else
1959 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1960
1961 for ( ; bits >= 0; bits-= 6) {
1962 *out++ = ((c >> bits) & 0x3F) | 0x80;
1963 }
1964 xmlFree(name);
1965 }
1966 }
1967 } else {
1968 unsigned int c;
1969 int bits, l;
1970
1971 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001972 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001973
1974 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001975 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001976 }
1977 c = CUR_CHAR(l);
1978 if (c < 0x80)
1979 { *out++ = c; bits= -6; }
1980 else if (c < 0x800)
1981 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1982 else if (c < 0x10000)
1983 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1984 else
1985 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1986
1987 for ( ; bits >= 0; bits-= 6) {
1988 *out++ = ((c >> bits) & 0x3F) | 0x80;
1989 }
1990 NEXT;
1991 }
1992 }
1993 *out++ = 0;
1994 return(buffer);
1995}
1996
1997/**
Owen Taylor3473f882001-02-23 17:55:21 +00001998 * htmlParseEntityRef:
1999 * @ctxt: an HTML parser context
2000 * @str: location to store the entity name
2001 *
2002 * parse an HTML ENTITY references
2003 *
2004 * [68] EntityRef ::= '&' Name ';'
2005 *
2006 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2007 * if non-NULL *str will have to be freed by the caller.
2008 */
2009htmlEntityDescPtr
2010htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2011 xmlChar *name;
2012 htmlEntityDescPtr ent = NULL;
2013 *str = NULL;
2014
2015 if (CUR == '&') {
2016 NEXT;
2017 name = htmlParseName(ctxt);
2018 if (name == NULL) {
2019 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2020 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2021 ctxt->wellFormed = 0;
2022 } else {
2023 GROW;
2024 if (CUR == ';') {
2025 *str = name;
2026
2027 /*
2028 * Lookup the entity in the table.
2029 */
2030 ent = htmlEntityLookup(name);
2031 if (ent != NULL) /* OK that's ugly !!! */
2032 NEXT;
2033 } else {
2034 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2035 ctxt->sax->error(ctxt->userData,
2036 "htmlParseEntityRef: expecting ';'\n");
2037 *str = name;
2038 }
2039 }
2040 }
2041 return(ent);
2042}
2043
2044/**
2045 * htmlParseAttValue:
2046 * @ctxt: an HTML parser context
2047 *
2048 * parse a value for an attribute
2049 * Note: the parser won't do substitution of entities here, this
2050 * will be handled later in xmlStringGetNodeList, unless it was
2051 * asked for ctxt->replaceEntities != 0
2052 *
2053 * Returns the AttValue parsed or NULL.
2054 */
2055
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002056static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002057htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2058 xmlChar *ret = NULL;
2059
2060 if (CUR == '"') {
2061 NEXT;
2062 ret = htmlParseHTMLAttribute(ctxt, '"');
2063 if (CUR != '"') {
2064 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2065 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2066 ctxt->wellFormed = 0;
2067 } else
2068 NEXT;
2069 } else if (CUR == '\'') {
2070 NEXT;
2071 ret = htmlParseHTMLAttribute(ctxt, '\'');
2072 if (CUR != '\'') {
2073 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2074 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2075 ctxt->wellFormed = 0;
2076 } else
2077 NEXT;
2078 } else {
2079 /*
2080 * That's an HTMLism, the attribute value may not be quoted
2081 */
2082 ret = htmlParseHTMLAttribute(ctxt, 0);
2083 if (ret == NULL) {
2084 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2085 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2086 ctxt->wellFormed = 0;
2087 }
2088 }
2089 return(ret);
2090}
2091
2092/**
2093 * htmlParseSystemLiteral:
2094 * @ctxt: an HTML parser context
2095 *
2096 * parse an HTML Literal
2097 *
2098 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2099 *
2100 * Returns the SystemLiteral parsed or NULL
2101 */
2102
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002103static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002104htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2105 const xmlChar *q;
2106 xmlChar *ret = NULL;
2107
2108 if (CUR == '"') {
2109 NEXT;
2110 q = CUR_PTR;
2111 while ((IS_CHAR(CUR)) && (CUR != '"'))
2112 NEXT;
2113 if (!IS_CHAR(CUR)) {
2114 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2115 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2116 ctxt->wellFormed = 0;
2117 } else {
2118 ret = xmlStrndup(q, CUR_PTR - q);
2119 NEXT;
2120 }
2121 } else if (CUR == '\'') {
2122 NEXT;
2123 q = CUR_PTR;
2124 while ((IS_CHAR(CUR)) && (CUR != '\''))
2125 NEXT;
2126 if (!IS_CHAR(CUR)) {
2127 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2128 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2129 ctxt->wellFormed = 0;
2130 } else {
2131 ret = xmlStrndup(q, CUR_PTR - q);
2132 NEXT;
2133 }
2134 } else {
2135 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2136 ctxt->sax->error(ctxt->userData,
2137 "SystemLiteral \" or ' expected\n");
2138 ctxt->wellFormed = 0;
2139 }
2140
2141 return(ret);
2142}
2143
2144/**
2145 * htmlParsePubidLiteral:
2146 * @ctxt: an HTML parser context
2147 *
2148 * parse an HTML public literal
2149 *
2150 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2151 *
2152 * Returns the PubidLiteral parsed or NULL.
2153 */
2154
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002155static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002156htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2157 const xmlChar *q;
2158 xmlChar *ret = NULL;
2159 /*
2160 * Name ::= (Letter | '_') (NameChar)*
2161 */
2162 if (CUR == '"') {
2163 NEXT;
2164 q = CUR_PTR;
2165 while (IS_PUBIDCHAR(CUR)) NEXT;
2166 if (CUR != '"') {
2167 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2168 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2169 ctxt->wellFormed = 0;
2170 } else {
2171 ret = xmlStrndup(q, CUR_PTR - q);
2172 NEXT;
2173 }
2174 } else if (CUR == '\'') {
2175 NEXT;
2176 q = CUR_PTR;
2177 while ((IS_LETTER(CUR)) && (CUR != '\''))
2178 NEXT;
2179 if (!IS_LETTER(CUR)) {
2180 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2181 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2182 ctxt->wellFormed = 0;
2183 } else {
2184 ret = xmlStrndup(q, CUR_PTR - q);
2185 NEXT;
2186 }
2187 } else {
2188 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2189 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2190 ctxt->wellFormed = 0;
2191 }
2192
2193 return(ret);
2194}
2195
2196/**
2197 * htmlParseScript:
2198 * @ctxt: an HTML parser context
2199 *
2200 * parse the content of an HTML SCRIPT or STYLE element
2201 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2202 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2203 * http://www.w3.org/TR/html4/types.html#type-script
2204 * http://www.w3.org/TR/html4/types.html#h-6.15
2205 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2206 *
2207 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2208 * element and the value of intrinsic event attributes. User agents must
2209 * not evaluate script data as HTML markup but instead must pass it on as
2210 * data to a script engine.
2211 * NOTES:
2212 * - The content is passed like CDATA
2213 * - the attributes for style and scripting "onXXX" are also described
2214 * as CDATA but SGML allows entities references in attributes so their
2215 * processing is identical as other attributes
2216 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002217static void
Owen Taylor3473f882001-02-23 17:55:21 +00002218htmlParseScript(htmlParserCtxtPtr ctxt) {
2219 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2220 int nbchar = 0;
2221 xmlChar cur;
2222
2223 SHRINK;
2224 cur = CUR;
2225 while (IS_CHAR(cur)) {
2226 if ((cur == '<') && (NXT(1) == '/')) {
2227 /*
2228 * One should break here, the specification is clear:
2229 * Authors should therefore escape "</" within the content.
2230 * Escape mechanisms are specific to each scripting or
2231 * style sheet language.
2232 */
2233 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2234 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2235 break; /* while */
2236 }
2237 buf[nbchar++] = cur;
2238 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2239 if (ctxt->sax->cdataBlock!= NULL) {
2240 /*
2241 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2242 */
2243 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2244 }
2245 nbchar = 0;
2246 }
2247 NEXT;
2248 cur = CUR;
2249 }
2250 if (!(IS_CHAR(cur))) {
2251 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2252 ctxt->sax->error(ctxt->userData,
2253 "Invalid char in CDATA 0x%X\n", cur);
2254 ctxt->wellFormed = 0;
2255 NEXT;
2256 }
2257
2258 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2259 if (ctxt->sax->cdataBlock!= NULL) {
2260 /*
2261 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2262 */
2263 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2264 }
2265 }
2266}
2267
2268
2269/**
2270 * htmlParseCharData:
2271 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002272 *
2273 * parse a CharData section.
2274 * if we are within a CDATA section ']]>' marks an end of section.
2275 *
2276 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2277 */
2278
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002279static void
2280htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002281 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2282 int nbchar = 0;
2283 int cur, l;
2284
2285 SHRINK;
2286 cur = CUR_CHAR(l);
2287 while (((cur != '<') || (ctxt->token == '<')) &&
2288 ((cur != '&') || (ctxt->token == '&')) &&
2289 (IS_CHAR(cur))) {
2290 COPY_BUF(l,buf,nbchar,cur);
2291 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2292 /*
2293 * Ok the segment is to be consumed as chars.
2294 */
2295 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2296 if (areBlanks(ctxt, buf, nbchar)) {
2297 if (ctxt->sax->ignorableWhitespace != NULL)
2298 ctxt->sax->ignorableWhitespace(ctxt->userData,
2299 buf, nbchar);
2300 } else {
2301 htmlCheckParagraph(ctxt);
2302 if (ctxt->sax->characters != NULL)
2303 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2304 }
2305 }
2306 nbchar = 0;
2307 }
2308 NEXTL(l);
2309 cur = CUR_CHAR(l);
2310 }
2311 if (nbchar != 0) {
2312 /*
2313 * Ok the segment is to be consumed as chars.
2314 */
2315 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2316 if (areBlanks(ctxt, buf, nbchar)) {
2317 if (ctxt->sax->ignorableWhitespace != NULL)
2318 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2319 } else {
2320 htmlCheckParagraph(ctxt);
2321 if (ctxt->sax->characters != NULL)
2322 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2323 }
2324 }
2325 }
2326}
2327
2328/**
2329 * htmlParseExternalID:
2330 * @ctxt: an HTML parser context
2331 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002332 *
2333 * Parse an External ID or a Public ID
2334 *
Owen Taylor3473f882001-02-23 17:55:21 +00002335 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2336 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2337 *
2338 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2339 *
2340 * Returns the function returns SystemLiteral and in the second
2341 * case publicID receives PubidLiteral, is strict is off
2342 * it is possible to return NULL and have publicID set.
2343 */
2344
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002345static xmlChar *
2346htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002347 xmlChar *URI = NULL;
2348
2349 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2350 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2351 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2352 SKIP(6);
2353 if (!IS_BLANK(CUR)) {
2354 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2355 ctxt->sax->error(ctxt->userData,
2356 "Space required after 'SYSTEM'\n");
2357 ctxt->wellFormed = 0;
2358 }
2359 SKIP_BLANKS;
2360 URI = htmlParseSystemLiteral(ctxt);
2361 if (URI == NULL) {
2362 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2363 ctxt->sax->error(ctxt->userData,
2364 "htmlParseExternalID: SYSTEM, no URI\n");
2365 ctxt->wellFormed = 0;
2366 }
2367 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2368 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2369 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2370 SKIP(6);
2371 if (!IS_BLANK(CUR)) {
2372 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2373 ctxt->sax->error(ctxt->userData,
2374 "Space required after 'PUBLIC'\n");
2375 ctxt->wellFormed = 0;
2376 }
2377 SKIP_BLANKS;
2378 *publicID = htmlParsePubidLiteral(ctxt);
2379 if (*publicID == NULL) {
2380 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2381 ctxt->sax->error(ctxt->userData,
2382 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2383 ctxt->wellFormed = 0;
2384 }
2385 SKIP_BLANKS;
2386 if ((CUR == '"') || (CUR == '\'')) {
2387 URI = htmlParseSystemLiteral(ctxt);
2388 }
2389 }
2390 return(URI);
2391}
2392
2393/**
2394 * htmlParseComment:
2395 * @ctxt: an HTML parser context
2396 *
2397 * Parse an XML (SGML) comment <!-- .... -->
2398 *
2399 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2400 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002401static void
Owen Taylor3473f882001-02-23 17:55:21 +00002402htmlParseComment(htmlParserCtxtPtr ctxt) {
2403 xmlChar *buf = NULL;
2404 int len;
2405 int size = HTML_PARSER_BUFFER_SIZE;
2406 int q, ql;
2407 int r, rl;
2408 int cur, l;
2409 xmlParserInputState state;
2410
2411 /*
2412 * Check that there is a comment right here.
2413 */
2414 if ((RAW != '<') || (NXT(1) != '!') ||
2415 (NXT(2) != '-') || (NXT(3) != '-')) return;
2416
2417 state = ctxt->instate;
2418 ctxt->instate = XML_PARSER_COMMENT;
2419 SHRINK;
2420 SKIP(4);
2421 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2422 if (buf == NULL) {
2423 xmlGenericError(xmlGenericErrorContext,
2424 "malloc of %d byte failed\n", size);
2425 ctxt->instate = state;
2426 return;
2427 }
2428 q = CUR_CHAR(ql);
2429 NEXTL(ql);
2430 r = CUR_CHAR(rl);
2431 NEXTL(rl);
2432 cur = CUR_CHAR(l);
2433 len = 0;
2434 while (IS_CHAR(cur) &&
2435 ((cur != '>') ||
2436 (r != '-') || (q != '-'))) {
2437 if (len + 5 >= size) {
2438 size *= 2;
2439 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2440 if (buf == NULL) {
2441 xmlGenericError(xmlGenericErrorContext,
2442 "realloc of %d byte failed\n", size);
2443 ctxt->instate = state;
2444 return;
2445 }
2446 }
2447 COPY_BUF(ql,buf,len,q);
2448 q = r;
2449 ql = rl;
2450 r = cur;
2451 rl = l;
2452 NEXTL(l);
2453 cur = CUR_CHAR(l);
2454 if (cur == 0) {
2455 SHRINK;
2456 GROW;
2457 cur = CUR_CHAR(l);
2458 }
2459 }
2460 buf[len] = 0;
2461 if (!IS_CHAR(cur)) {
2462 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2463 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2464 ctxt->sax->error(ctxt->userData,
2465 "Comment not terminated \n<!--%.50s\n", buf);
2466 ctxt->wellFormed = 0;
2467 xmlFree(buf);
2468 } else {
2469 NEXT;
2470 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2471 (!ctxt->disableSAX))
2472 ctxt->sax->comment(ctxt->userData, buf);
2473 xmlFree(buf);
2474 }
2475 ctxt->instate = state;
2476}
2477
2478/**
2479 * htmlParseCharRef:
2480 * @ctxt: an HTML parser context
2481 *
2482 * parse Reference declarations
2483 *
2484 * [66] CharRef ::= '&#' [0-9]+ ';' |
2485 * '&#x' [0-9a-fA-F]+ ';'
2486 *
2487 * Returns the value parsed (as an int)
2488 */
2489int
2490htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2491 int val = 0;
2492
2493 if ((CUR == '&') && (NXT(1) == '#') &&
2494 (NXT(2) == 'x')) {
2495 SKIP(3);
2496 while (CUR != ';') {
2497 if ((CUR >= '0') && (CUR <= '9'))
2498 val = val * 16 + (CUR - '0');
2499 else if ((CUR >= 'a') && (CUR <= 'f'))
2500 val = val * 16 + (CUR - 'a') + 10;
2501 else if ((CUR >= 'A') && (CUR <= 'F'))
2502 val = val * 16 + (CUR - 'A') + 10;
2503 else {
2504 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2505 ctxt->sax->error(ctxt->userData,
2506 "htmlParseCharRef: invalid hexadecimal value\n");
2507 ctxt->wellFormed = 0;
2508 return(0);
2509 }
2510 NEXT;
2511 }
2512 if (CUR == ';')
2513 NEXT;
2514 } else if ((CUR == '&') && (NXT(1) == '#')) {
2515 SKIP(2);
2516 while (CUR != ';') {
2517 if ((CUR >= '0') && (CUR <= '9'))
2518 val = val * 10 + (CUR - '0');
2519 else {
2520 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2521 ctxt->sax->error(ctxt->userData,
2522 "htmlParseCharRef: invalid decimal value\n");
2523 ctxt->wellFormed = 0;
2524 return(0);
2525 }
2526 NEXT;
2527 }
2528 if (CUR == ';')
2529 NEXT;
2530 } else {
2531 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2532 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2533 ctxt->wellFormed = 0;
2534 }
2535 /*
2536 * Check the value IS_CHAR ...
2537 */
2538 if (IS_CHAR(val)) {
2539 return(val);
2540 } else {
2541 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2542 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2543 val);
2544 ctxt->wellFormed = 0;
2545 }
2546 return(0);
2547}
2548
2549
2550/**
2551 * htmlParseDocTypeDecl :
2552 * @ctxt: an HTML parser context
2553 *
2554 * parse a DOCTYPE declaration
2555 *
2556 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2557 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2558 */
2559
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002560static void
Owen Taylor3473f882001-02-23 17:55:21 +00002561htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2562 xmlChar *name;
2563 xmlChar *ExternalID = NULL;
2564 xmlChar *URI = NULL;
2565
2566 /*
2567 * We know that '<!DOCTYPE' has been detected.
2568 */
2569 SKIP(9);
2570
2571 SKIP_BLANKS;
2572
2573 /*
2574 * Parse the DOCTYPE name.
2575 */
2576 name = htmlParseName(ctxt);
2577 if (name == NULL) {
2578 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2579 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2580 ctxt->wellFormed = 0;
2581 }
2582 /*
2583 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2584 */
2585
2586 SKIP_BLANKS;
2587
2588 /*
2589 * Check for SystemID and ExternalID
2590 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002591 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002592 SKIP_BLANKS;
2593
2594 /*
2595 * We should be at the end of the DOCTYPE declaration.
2596 */
2597 if (CUR != '>') {
2598 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2599 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2600 ctxt->wellFormed = 0;
2601 /* We shouldn't try to resynchronize ... */
2602 }
2603 NEXT;
2604
2605 /*
2606 * Create or update the document accordingly to the DOCTYPE
2607 */
2608 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2609 (!ctxt->disableSAX))
2610 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2611
2612 /*
2613 * Cleanup, since we don't use all those identifiers
2614 */
2615 if (URI != NULL) xmlFree(URI);
2616 if (ExternalID != NULL) xmlFree(ExternalID);
2617 if (name != NULL) xmlFree(name);
2618}
2619
2620/**
2621 * htmlParseAttribute:
2622 * @ctxt: an HTML parser context
2623 * @value: a xmlChar ** used to store the value of the attribute
2624 *
2625 * parse an attribute
2626 *
2627 * [41] Attribute ::= Name Eq AttValue
2628 *
2629 * [25] Eq ::= S? '=' S?
2630 *
2631 * With namespace:
2632 *
2633 * [NS 11] Attribute ::= QName Eq AttValue
2634 *
2635 * Also the case QName == xmlns:??? is handled independently as a namespace
2636 * definition.
2637 *
2638 * Returns the attribute name, and the value in *value.
2639 */
2640
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002641static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002642htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2643 xmlChar *name, *val = NULL;
2644
2645 *value = NULL;
2646 name = htmlParseHTMLName(ctxt);
2647 if (name == NULL) {
2648 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2649 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2650 ctxt->wellFormed = 0;
2651 return(NULL);
2652 }
2653
2654 /*
2655 * read the value
2656 */
2657 SKIP_BLANKS;
2658 if (CUR == '=') {
2659 NEXT;
2660 SKIP_BLANKS;
2661 val = htmlParseAttValue(ctxt);
2662 /******
2663 } else {
2664 * TODO : some attribute must have values, some may not
2665 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2666 ctxt->sax->warning(ctxt->userData,
2667 "No value for attribute %s\n", name); */
2668 }
2669
2670 *value = val;
2671 return(name);
2672}
2673
2674/**
2675 * htmlCheckEncoding:
2676 * @ctxt: an HTML parser context
2677 * @attvalue: the attribute value
2678 *
2679 * Checks an http-equiv attribute from a Meta tag to detect
2680 * the encoding
2681 * If a new encoding is detected the parser is switched to decode
2682 * it and pass UTF8
2683 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002684static void
Owen Taylor3473f882001-02-23 17:55:21 +00002685htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2686 const xmlChar *encoding;
2687
2688 if ((ctxt == NULL) || (attvalue == NULL))
2689 return;
2690
2691 /* do not change encoding */
2692 if (ctxt->input->encoding != NULL)
2693 return;
2694
2695 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2696 if (encoding != NULL) {
2697 encoding += 8;
2698 } else {
2699 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2700 if (encoding != NULL)
2701 encoding += 9;
2702 }
2703 if (encoding != NULL) {
2704 xmlCharEncoding enc;
2705 xmlCharEncodingHandlerPtr handler;
2706
2707 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2708
2709 if (ctxt->input->encoding != NULL)
2710 xmlFree((xmlChar *) ctxt->input->encoding);
2711 ctxt->input->encoding = xmlStrdup(encoding);
2712
2713 enc = xmlParseCharEncoding((const char *) encoding);
2714 /*
2715 * registered set of known encodings
2716 */
2717 if (enc != XML_CHAR_ENCODING_ERROR) {
2718 xmlSwitchEncoding(ctxt, enc);
2719 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2720 } else {
2721 /*
2722 * fallback for unknown encodings
2723 */
2724 handler = xmlFindCharEncodingHandler((const char *) encoding);
2725 if (handler != NULL) {
2726 xmlSwitchToEncoding(ctxt, handler);
2727 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2728 } else {
2729 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2730 }
2731 }
2732
2733 if ((ctxt->input->buf != NULL) &&
2734 (ctxt->input->buf->encoder != NULL) &&
2735 (ctxt->input->buf->raw != NULL) &&
2736 (ctxt->input->buf->buffer != NULL)) {
2737 int nbchars;
2738 int processed;
2739
2740 /*
2741 * convert as much as possible to the parser reading buffer.
2742 */
2743 processed = ctxt->input->cur - ctxt->input->base;
2744 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2745 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2746 ctxt->input->buf->buffer,
2747 ctxt->input->buf->raw);
2748 if (nbchars < 0) {
2749 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2750 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2751 ctxt->sax->error(ctxt->userData,
2752 "htmlCheckEncoding: encoder error\n");
2753 }
2754 ctxt->input->base =
2755 ctxt->input->cur = ctxt->input->buf->buffer->content;
2756 }
2757 }
2758}
2759
2760/**
2761 * htmlCheckMeta:
2762 * @ctxt: an HTML parser context
2763 * @atts: the attributes values
2764 *
2765 * Checks an attributes from a Meta tag
2766 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002767static void
Owen Taylor3473f882001-02-23 17:55:21 +00002768htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2769 int i;
2770 const xmlChar *att, *value;
2771 int http = 0;
2772 const xmlChar *content = NULL;
2773
2774 if ((ctxt == NULL) || (atts == NULL))
2775 return;
2776
2777 i = 0;
2778 att = atts[i++];
2779 while (att != NULL) {
2780 value = atts[i++];
2781 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2782 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2783 http = 1;
2784 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2785 content = value;
2786 att = atts[i++];
2787 }
2788 if ((http) && (content != NULL))
2789 htmlCheckEncoding(ctxt, content);
2790
2791}
2792
2793/**
2794 * htmlParseStartTag:
2795 * @ctxt: an HTML parser context
2796 *
2797 * parse a start of tag either for rule element or
2798 * EmptyElement. In both case we don't parse the tag closing chars.
2799 *
2800 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2801 *
2802 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2803 *
2804 * With namespace:
2805 *
2806 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2807 *
2808 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2809 *
2810 */
2811
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002812static void
Owen Taylor3473f882001-02-23 17:55:21 +00002813htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2814 xmlChar *name;
2815 xmlChar *attname;
2816 xmlChar *attvalue;
2817 const xmlChar **atts = NULL;
2818 int nbatts = 0;
2819 int maxatts = 0;
2820 int meta = 0;
2821 int i;
2822
2823 if (CUR != '<') return;
2824 NEXT;
2825
2826 GROW;
2827 name = htmlParseHTMLName(ctxt);
2828 if (name == NULL) {
2829 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2830 ctxt->sax->error(ctxt->userData,
2831 "htmlParseStartTag: invalid element name\n");
2832 ctxt->wellFormed = 0;
2833 /* Dump the bogus tag like browsers do */
2834 while ((IS_CHAR(CUR)) && (CUR != '>'))
2835 NEXT;
2836 return;
2837 }
2838 if (xmlStrEqual(name, BAD_CAST"meta"))
2839 meta = 1;
2840
2841 /*
2842 * Check for auto-closure of HTML elements.
2843 */
2844 htmlAutoClose(ctxt, name);
2845
2846 /*
2847 * Check for implied HTML elements.
2848 */
2849 htmlCheckImplied(ctxt, name);
2850
2851 /*
2852 * Avoid html at any level > 0, head at any level != 1
2853 * or any attempt to recurse body
2854 */
2855 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2856 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2857 ctxt->sax->error(ctxt->userData,
2858 "htmlParseStartTag: misplaced <html> tag\n");
2859 ctxt->wellFormed = 0;
2860 xmlFree(name);
2861 return;
2862 }
2863 if ((ctxt->nameNr != 1) &&
2864 (xmlStrEqual(name, BAD_CAST"head"))) {
2865 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2866 ctxt->sax->error(ctxt->userData,
2867 "htmlParseStartTag: misplaced <head> tag\n");
2868 ctxt->wellFormed = 0;
2869 xmlFree(name);
2870 return;
2871 }
2872 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002873 int indx;
2874 for (indx = 0;indx < ctxt->nameNr;indx++) {
2875 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002876 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2877 ctxt->sax->error(ctxt->userData,
2878 "htmlParseStartTag: misplaced <body> tag\n");
2879 ctxt->wellFormed = 0;
2880 xmlFree(name);
2881 return;
2882 }
2883 }
2884 }
2885
2886 /*
2887 * Now parse the attributes, it ends up with the ending
2888 *
2889 * (S Attribute)* S?
2890 */
2891 SKIP_BLANKS;
2892 while ((IS_CHAR(CUR)) &&
2893 (CUR != '>') &&
2894 ((CUR != '/') || (NXT(1) != '>'))) {
2895 long cons = ctxt->nbChars;
2896
2897 GROW;
2898 attname = htmlParseAttribute(ctxt, &attvalue);
2899 if (attname != NULL) {
2900
2901 /*
2902 * Well formedness requires at most one declaration of an attribute
2903 */
2904 for (i = 0; i < nbatts;i += 2) {
2905 if (xmlStrEqual(atts[i], attname)) {
2906 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2907 ctxt->sax->error(ctxt->userData,
2908 "Attribute %s redefined\n",
2909 attname);
2910 ctxt->wellFormed = 0;
2911 xmlFree(attname);
2912 if (attvalue != NULL)
2913 xmlFree(attvalue);
2914 goto failed;
2915 }
2916 }
2917
2918 /*
2919 * Add the pair to atts
2920 */
2921 if (atts == NULL) {
2922 maxatts = 10;
2923 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2924 if (atts == NULL) {
2925 xmlGenericError(xmlGenericErrorContext,
2926 "malloc of %ld byte failed\n",
2927 maxatts * (long)sizeof(xmlChar *));
2928 if (name != NULL) xmlFree(name);
2929 return;
2930 }
2931 } else if (nbatts + 4 > maxatts) {
2932 maxatts *= 2;
2933 atts = (const xmlChar **) xmlRealloc((void *) atts,
2934 maxatts * sizeof(xmlChar *));
2935 if (atts == NULL) {
2936 xmlGenericError(xmlGenericErrorContext,
2937 "realloc of %ld byte failed\n",
2938 maxatts * (long)sizeof(xmlChar *));
2939 if (name != NULL) xmlFree(name);
2940 return;
2941 }
2942 }
2943 atts[nbatts++] = attname;
2944 atts[nbatts++] = attvalue;
2945 atts[nbatts] = NULL;
2946 atts[nbatts + 1] = NULL;
2947 }
2948 else {
2949 /* Dump the bogus attribute string up to the next blank or
2950 * the end of the tag. */
2951 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
2952 && ((CUR != '/') || (NXT(1) != '>')))
2953 NEXT;
2954 }
2955
2956failed:
2957 SKIP_BLANKS;
2958 if (cons == ctxt->nbChars) {
2959 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2960 ctxt->sax->error(ctxt->userData,
2961 "htmlParseStartTag: problem parsing attributes\n");
2962 ctxt->wellFormed = 0;
2963 break;
2964 }
2965 }
2966
2967 /*
2968 * Handle specific association to the META tag
2969 */
2970 if (meta)
2971 htmlCheckMeta(ctxt, atts);
2972
2973 /*
2974 * SAX: Start of Element !
2975 */
2976 htmlnamePush(ctxt, xmlStrdup(name));
2977#ifdef DEBUG
2978 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
2979#endif
2980 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2981 ctxt->sax->startElement(ctxt->userData, name, atts);
2982
2983 if (atts != NULL) {
2984 for (i = 0;i < nbatts;i++) {
2985 if (atts[i] != NULL)
2986 xmlFree((xmlChar *) atts[i]);
2987 }
2988 xmlFree((void *) atts);
2989 }
2990 if (name != NULL) xmlFree(name);
2991}
2992
2993/**
2994 * htmlParseEndTag:
2995 * @ctxt: an HTML parser context
2996 *
2997 * parse an end of tag
2998 *
2999 * [42] ETag ::= '</' Name S? '>'
3000 *
3001 * With namespace
3002 *
3003 * [NS 9] ETag ::= '</' QName S? '>'
3004 */
3005
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003006static void
Owen Taylor3473f882001-02-23 17:55:21 +00003007htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3008 xmlChar *name;
3009 xmlChar *oldname;
3010 int i;
3011
3012 if ((CUR != '<') || (NXT(1) != '/')) {
3013 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3014 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3015 ctxt->wellFormed = 0;
3016 return;
3017 }
3018 SKIP(2);
3019
3020 name = htmlParseHTMLName(ctxt);
3021 if (name == NULL) return;
3022
3023 /*
3024 * We should definitely be at the ending "S? '>'" part
3025 */
3026 SKIP_BLANKS;
3027 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3028 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3029 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3030 ctxt->wellFormed = 0;
3031 } else
3032 NEXT;
3033
3034 /*
3035 * If the name read is not one of the element in the parsing stack
3036 * then return, it's just an error.
3037 */
3038 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3039 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3040 }
3041 if (i < 0) {
3042 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3043 ctxt->sax->error(ctxt->userData,
3044 "Unexpected end tag : %s\n", name);
3045 xmlFree(name);
3046 ctxt->wellFormed = 0;
3047 return;
3048 }
3049
3050
3051 /*
3052 * Check for auto-closure of HTML elements.
3053 */
3054
3055 htmlAutoCloseOnClose(ctxt, name);
3056
3057 /*
3058 * Well formedness constraints, opening and closing must match.
3059 * With the exception that the autoclose may have popped stuff out
3060 * of the stack.
3061 */
3062 if (!xmlStrEqual(name, ctxt->name)) {
3063#ifdef DEBUG
3064 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3065#endif
3066 if ((ctxt->name != NULL) &&
3067 (!xmlStrEqual(ctxt->name, name))) {
3068 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3069 ctxt->sax->error(ctxt->userData,
3070 "Opening and ending tag mismatch: %s and %s\n",
3071 name, ctxt->name);
3072 ctxt->wellFormed = 0;
3073 }
3074 }
3075
3076 /*
3077 * SAX: End of Tag
3078 */
3079 oldname = ctxt->name;
3080 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3081 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3082 ctxt->sax->endElement(ctxt->userData, name);
3083 oldname = htmlnamePop(ctxt);
3084 if (oldname != NULL) {
3085#ifdef DEBUG
3086 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3087#endif
3088 xmlFree(oldname);
3089#ifdef DEBUG
3090 } else {
3091 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3092#endif
3093 }
3094 }
3095
3096 if (name != NULL)
3097 xmlFree(name);
3098
3099 return;
3100}
3101
3102
3103/**
3104 * htmlParseReference:
3105 * @ctxt: an HTML parser context
3106 *
3107 * parse and handle entity references in content,
3108 * this will end-up in a call to character() since this is either a
3109 * CharRef, or a predefined entity.
3110 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003111static void
Owen Taylor3473f882001-02-23 17:55:21 +00003112htmlParseReference(htmlParserCtxtPtr ctxt) {
3113 htmlEntityDescPtr ent;
3114 xmlChar out[6];
3115 xmlChar *name;
3116 if (CUR != '&') return;
3117
3118 if (NXT(1) == '#') {
3119 unsigned int c;
3120 int bits, i = 0;
3121
3122 c = htmlParseCharRef(ctxt);
3123 if (c == 0)
3124 return;
3125
3126 if (c < 0x80) { out[i++]= c; bits= -6; }
3127 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3128 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3129 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3130
3131 for ( ; bits >= 0; bits-= 6) {
3132 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3133 }
3134 out[i] = 0;
3135
3136 htmlCheckParagraph(ctxt);
3137 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3138 ctxt->sax->characters(ctxt->userData, out, i);
3139 } else {
3140 ent = htmlParseEntityRef(ctxt, &name);
3141 if (name == NULL) {
3142 htmlCheckParagraph(ctxt);
3143 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3144 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3145 return;
3146 }
3147 if ((ent == NULL) || (ent->value <= 0)) {
3148 htmlCheckParagraph(ctxt);
3149 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3150 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3151 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3152 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3153 }
3154 } else {
3155 unsigned int c;
3156 int bits, i = 0;
3157
3158 c = ent->value;
3159 if (c < 0x80)
3160 { out[i++]= c; bits= -6; }
3161 else if (c < 0x800)
3162 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3163 else if (c < 0x10000)
3164 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3165 else
3166 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3167
3168 for ( ; bits >= 0; bits-= 6) {
3169 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3170 }
3171 out[i] = 0;
3172
3173 htmlCheckParagraph(ctxt);
3174 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3175 ctxt->sax->characters(ctxt->userData, out, i);
3176 }
3177 xmlFree(name);
3178 }
3179}
3180
3181/**
3182 * htmlParseContent:
3183 * @ctxt: an HTML parser context
3184 * @name: the node name
3185 *
3186 * Parse a content: comment, sub-element, reference or text.
3187 *
3188 */
3189
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003190static void
Owen Taylor3473f882001-02-23 17:55:21 +00003191htmlParseContent(htmlParserCtxtPtr ctxt) {
3192 xmlChar *currentNode;
3193 int depth;
3194
3195 currentNode = xmlStrdup(ctxt->name);
3196 depth = ctxt->nameNr;
3197 while (1) {
3198 long cons = ctxt->nbChars;
3199
3200 GROW;
3201 /*
3202 * Our tag or one of it's parent or children is ending.
3203 */
3204 if ((CUR == '<') && (NXT(1) == '/')) {
3205 htmlParseEndTag(ctxt);
3206 if (currentNode != NULL) xmlFree(currentNode);
3207 return;
3208 }
3209
3210 /*
3211 * Has this node been popped out during parsing of
3212 * the next element
3213 */
3214 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
3215 (depth >= ctxt->nameNr)) {
3216 if (currentNode != NULL) xmlFree(currentNode);
3217 return;
3218 }
3219
Daniel Veillardf9533d12001-03-03 10:04:57 +00003220 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3221 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003222 /*
3223 * Handle SCRIPT/STYLE separately
3224 */
3225 htmlParseScript(ctxt);
3226 } else {
3227 /*
3228 * Sometimes DOCTYPE arrives in the middle of the document
3229 */
3230 if ((CUR == '<') && (NXT(1) == '!') &&
3231 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3232 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3233 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3234 (UPP(8) == 'E')) {
3235 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3236 ctxt->sax->error(ctxt->userData,
3237 "Misplaced DOCTYPE declaration\n");
3238 ctxt->wellFormed = 0;
3239 htmlParseDocTypeDecl(ctxt);
3240 }
3241
3242 /*
3243 * First case : a comment
3244 */
3245 if ((CUR == '<') && (NXT(1) == '!') &&
3246 (NXT(2) == '-') && (NXT(3) == '-')) {
3247 htmlParseComment(ctxt);
3248 }
3249
3250 /*
3251 * Second case : a sub-element.
3252 */
3253 else if (CUR == '<') {
3254 htmlParseElement(ctxt);
3255 }
3256
3257 /*
3258 * Third case : a reference. If if has not been resolved,
3259 * parsing returns it's Name, create the node
3260 */
3261 else if (CUR == '&') {
3262 htmlParseReference(ctxt);
3263 }
3264
3265 /*
3266 * Fourth : end of the resource
3267 */
3268 else if (CUR == 0) {
Daniel Veillardf9533d12001-03-03 10:04:57 +00003269 int level = ctxt->nodeNr;
Owen Taylor3473f882001-02-23 17:55:21 +00003270 htmlAutoClose(ctxt, NULL);
Daniel Veillardf9533d12001-03-03 10:04:57 +00003271 if (level == ctxt->nodeNr)
3272 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003273 }
3274
3275 /*
3276 * Last case, text. Note that References are handled directly.
3277 */
3278 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003279 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003280 }
3281
3282 if (cons == ctxt->nbChars) {
3283 if (ctxt->node != NULL) {
3284 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3285 ctxt->sax->error(ctxt->userData,
3286 "detected an error in element content\n");
3287 ctxt->wellFormed = 0;
3288 }
3289 break;
3290 }
3291 }
3292 GROW;
3293 }
3294 if (currentNode != NULL) xmlFree(currentNode);
3295}
3296
3297/**
3298 * htmlParseElement:
3299 * @ctxt: an HTML parser context
3300 *
3301 * parse an HTML element, this is highly recursive
3302 *
3303 * [39] element ::= EmptyElemTag | STag content ETag
3304 *
3305 * [41] Attribute ::= Name Eq AttValue
3306 */
3307
3308void
3309htmlParseElement(htmlParserCtxtPtr ctxt) {
3310 xmlChar *name;
3311 xmlChar *currentNode = NULL;
3312 htmlElemDescPtr info;
3313 htmlParserNodeInfo node_info;
3314 xmlChar *oldname;
3315 int depth = ctxt->nameNr;
3316
3317 /* Capture start position */
3318 if (ctxt->record_info) {
3319 node_info.begin_pos = ctxt->input->consumed +
3320 (CUR_PTR - ctxt->input->base);
3321 node_info.begin_line = ctxt->input->line;
3322 }
3323
3324 oldname = xmlStrdup(ctxt->name);
3325 htmlParseStartTag(ctxt);
3326 name = ctxt->name;
3327#ifdef DEBUG
3328 if (oldname == NULL)
3329 xmlGenericError(xmlGenericErrorContext,
3330 "Start of element %s\n", name);
3331 else if (name == NULL)
3332 xmlGenericError(xmlGenericErrorContext,
3333 "Start of element failed, was %s\n", oldname);
3334 else
3335 xmlGenericError(xmlGenericErrorContext,
3336 "Start of element %s, was %s\n", name, oldname);
3337#endif
3338 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3339 (name == NULL)) {
3340 if (CUR == '>')
3341 NEXT;
3342 if (oldname != NULL)
3343 xmlFree(oldname);
3344 return;
3345 }
3346 if (oldname != NULL)
3347 xmlFree(oldname);
3348
3349 /*
3350 * Lookup the info for that element.
3351 */
3352 info = htmlTagLookup(name);
3353 if (info == NULL) {
3354 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3355 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3356 name);
3357 ctxt->wellFormed = 0;
3358 } else if (info->depr) {
3359/***************************
3360 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3361 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3362 name);
3363 ***************************/
3364 }
3365
3366 /*
3367 * Check for an Empty Element labelled the XML/SGML way
3368 */
3369 if ((CUR == '/') && (NXT(1) == '>')) {
3370 SKIP(2);
3371 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3372 ctxt->sax->endElement(ctxt->userData, name);
3373 oldname = htmlnamePop(ctxt);
3374#ifdef DEBUG
3375 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3376#endif
3377 if (oldname != NULL)
3378 xmlFree(oldname);
3379 return;
3380 }
3381
3382 if (CUR == '>') {
3383 NEXT;
3384 } else {
3385 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3386 ctxt->sax->error(ctxt->userData,
3387 "Couldn't find end of Start Tag %s\n",
3388 name);
3389 ctxt->wellFormed = 0;
3390
3391 /*
3392 * end of parsing of this node.
3393 */
3394 if (xmlStrEqual(name, ctxt->name)) {
3395 nodePop(ctxt);
3396 oldname = htmlnamePop(ctxt);
3397#ifdef DEBUG
3398 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3399#endif
3400 if (oldname != NULL)
3401 xmlFree(oldname);
3402 }
3403
3404 /*
3405 * Capture end position and add node
3406 */
3407 if ( currentNode != NULL && ctxt->record_info ) {
3408 node_info.end_pos = ctxt->input->consumed +
3409 (CUR_PTR - ctxt->input->base);
3410 node_info.end_line = ctxt->input->line;
3411 node_info.node = ctxt->node;
3412 xmlParserAddNodeInfo(ctxt, &node_info);
3413 }
3414 return;
3415 }
3416
3417 /*
3418 * Check for an Empty Element from DTD definition
3419 */
3420 if ((info != NULL) && (info->empty)) {
3421 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3422 ctxt->sax->endElement(ctxt->userData, name);
3423 oldname = htmlnamePop(ctxt);
3424#ifdef DEBUG
3425 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3426#endif
3427 if (oldname != NULL)
3428 xmlFree(oldname);
3429 return;
3430 }
3431
3432 /*
3433 * Parse the content of the element:
3434 */
3435 currentNode = xmlStrdup(ctxt->name);
3436 depth = ctxt->nameNr;
3437 while (IS_CHAR(CUR)) {
3438 htmlParseContent(ctxt);
3439 if (ctxt->nameNr < depth) break;
3440 }
3441
3442 if (!IS_CHAR(CUR)) {
3443 /************
3444 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3445 ctxt->sax->error(ctxt->userData,
3446 "Premature end of data in tag %s\n", currentNode);
3447 ctxt->wellFormed = 0;
3448 *************/
3449
3450 /*
3451 * end of parsing of this node.
3452 */
3453 nodePop(ctxt);
3454 oldname = htmlnamePop(ctxt);
3455#ifdef DEBUG
3456 xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
3457#endif
3458 if (oldname != NULL)
3459 xmlFree(oldname);
3460 if (currentNode != NULL)
3461 xmlFree(currentNode);
3462 return;
3463 }
3464
3465 /*
3466 * Capture end position and add node
3467 */
3468 if ( currentNode != NULL && ctxt->record_info ) {
3469 node_info.end_pos = ctxt->input->consumed +
3470 (CUR_PTR - ctxt->input->base);
3471 node_info.end_line = ctxt->input->line;
3472 node_info.node = ctxt->node;
3473 xmlParserAddNodeInfo(ctxt, &node_info);
3474 }
3475 if (currentNode != NULL)
3476 xmlFree(currentNode);
3477}
3478
3479/**
3480 * htmlParseDocument :
3481 * @ctxt: an HTML parser context
3482 *
3483 * parse an HTML document (and build a tree if using the standard SAX
3484 * interface).
3485 *
3486 * Returns 0, -1 in case of error. the parser context is augmented
3487 * as a result of the parsing.
3488 */
3489
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003490static int
Owen Taylor3473f882001-02-23 17:55:21 +00003491htmlParseDocument(htmlParserCtxtPtr ctxt) {
3492 xmlDtdPtr dtd;
3493
3494 htmlDefaultSAXHandlerInit();
3495 ctxt->html = 1;
3496
3497 GROW;
3498 /*
3499 * SAX: beginning of the document processing.
3500 */
3501 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3502 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3503
3504 /*
3505 * Wipe out everything which is before the first '<'
3506 */
3507 SKIP_BLANKS;
3508 if (CUR == 0) {
3509 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3510 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3511 ctxt->wellFormed = 0;
3512 }
3513
3514 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3515 ctxt->sax->startDocument(ctxt->userData);
3516
3517
3518 /*
3519 * Parse possible comments before any content
3520 */
3521 while ((CUR == '<') && (NXT(1) == '!') &&
3522 (NXT(2) == '-') && (NXT(3) == '-')) {
3523 htmlParseComment(ctxt);
3524 SKIP_BLANKS;
3525 }
3526
3527
3528 /*
3529 * Then possibly doc type declaration(s) and more Misc
3530 * (doctypedecl Misc*)?
3531 */
3532 if ((CUR == '<') && (NXT(1) == '!') &&
3533 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3534 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3535 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3536 (UPP(8) == 'E')) {
3537 htmlParseDocTypeDecl(ctxt);
3538 }
3539 SKIP_BLANKS;
3540
3541 /*
3542 * Parse possible comments before any content
3543 */
3544 while ((CUR == '<') && (NXT(1) == '!') &&
3545 (NXT(2) == '-') && (NXT(3) == '-')) {
3546 htmlParseComment(ctxt);
3547 SKIP_BLANKS;
3548 }
3549
3550 /*
3551 * Time to start parsing the tree itself
3552 */
3553 htmlParseContent(ctxt);
3554
3555 /*
3556 * autoclose
3557 */
3558 if (CUR == 0)
3559 htmlAutoClose(ctxt, NULL);
3560
3561
3562 /*
3563 * SAX: end of the document processing.
3564 */
3565 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3566 ctxt->sax->endDocument(ctxt->userData);
3567
3568 if (ctxt->myDoc != NULL) {
3569 dtd = xmlGetIntSubset(ctxt->myDoc);
3570 if (dtd == NULL)
3571 ctxt->myDoc->intSubset =
3572 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3573 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3574 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3575 }
3576 if (! ctxt->wellFormed) return(-1);
3577 return(0);
3578}
3579
3580
3581/************************************************************************
3582 * *
3583 * Parser contexts handling *
3584 * *
3585 ************************************************************************/
3586
3587/**
3588 * xmlInitParserCtxt:
3589 * @ctxt: an HTML parser context
3590 *
3591 * Initialize a parser context
3592 */
3593
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003594static void
Owen Taylor3473f882001-02-23 17:55:21 +00003595htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3596{
3597 htmlSAXHandler *sax;
3598
3599 if (ctxt == NULL) return;
3600 memset(ctxt, 0, sizeof(htmlParserCtxt));
3601
3602 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3603 if (sax == NULL) {
3604 xmlGenericError(xmlGenericErrorContext,
3605 "htmlInitParserCtxt: out of memory\n");
3606 }
3607 else
3608 memset(sax, 0, sizeof(htmlSAXHandler));
3609
3610 /* Allocate the Input stack */
3611 ctxt->inputTab = (htmlParserInputPtr *)
3612 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3613 if (ctxt->inputTab == NULL) {
3614 xmlGenericError(xmlGenericErrorContext,
3615 "htmlInitParserCtxt: out of memory\n");
3616 ctxt->inputNr = 0;
3617 ctxt->inputMax = 0;
3618 ctxt->input = NULL;
3619 return;
3620 }
3621 ctxt->inputNr = 0;
3622 ctxt->inputMax = 5;
3623 ctxt->input = NULL;
3624 ctxt->version = NULL;
3625 ctxt->encoding = NULL;
3626 ctxt->standalone = -1;
3627 ctxt->instate = XML_PARSER_START;
3628
3629 /* Allocate the Node stack */
3630 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3631 if (ctxt->nodeTab == NULL) {
3632 xmlGenericError(xmlGenericErrorContext,
3633 "htmlInitParserCtxt: out of memory\n");
3634 ctxt->nodeNr = 0;
3635 ctxt->nodeMax = 0;
3636 ctxt->node = NULL;
3637 ctxt->inputNr = 0;
3638 ctxt->inputMax = 0;
3639 ctxt->input = NULL;
3640 return;
3641 }
3642 ctxt->nodeNr = 0;
3643 ctxt->nodeMax = 10;
3644 ctxt->node = NULL;
3645
3646 /* Allocate the Name stack */
3647 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3648 if (ctxt->nameTab == NULL) {
3649 xmlGenericError(xmlGenericErrorContext,
3650 "htmlInitParserCtxt: out of memory\n");
3651 ctxt->nameNr = 0;
3652 ctxt->nameMax = 10;
3653 ctxt->name = NULL;
3654 ctxt->nodeNr = 0;
3655 ctxt->nodeMax = 0;
3656 ctxt->node = NULL;
3657 ctxt->inputNr = 0;
3658 ctxt->inputMax = 0;
3659 ctxt->input = NULL;
3660 return;
3661 }
3662 ctxt->nameNr = 0;
3663 ctxt->nameMax = 10;
3664 ctxt->name = NULL;
3665
3666 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3667 else {
3668 ctxt->sax = sax;
3669 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3670 }
3671 ctxt->userData = ctxt;
3672 ctxt->myDoc = NULL;
3673 ctxt->wellFormed = 1;
3674 ctxt->replaceEntities = 0;
3675 ctxt->html = 1;
3676 ctxt->record_info = 0;
3677 ctxt->validate = 0;
3678 ctxt->nbChars = 0;
3679 ctxt->checkIndex = 0;
3680 xmlInitNodeInfoSeq(&ctxt->node_seq);
3681}
3682
3683/**
3684 * htmlFreeParserCtxt:
3685 * @ctxt: an HTML parser context
3686 *
3687 * Free all the memory used by a parser context. However the parsed
3688 * document in ctxt->myDoc is not freed.
3689 */
3690
3691void
3692htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3693{
3694 xmlFreeParserCtxt(ctxt);
3695}
3696
3697/**
3698 * htmlCreateDocParserCtxt :
3699 * @cur: a pointer to an array of xmlChar
3700 * @encoding: a free form C string describing the HTML document encoding, or NULL
3701 *
3702 * Create a parser context for an HTML document.
3703 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003704 * TODO: check the need to add encoding handling there
3705 *
Owen Taylor3473f882001-02-23 17:55:21 +00003706 * Returns the new parser context or NULL
3707 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003708static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003709htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003710 htmlParserCtxtPtr ctxt;
3711 htmlParserInputPtr input;
3712 /* htmlCharEncoding enc; */
3713
3714 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3715 if (ctxt == NULL) {
3716 perror("malloc");
3717 return(NULL);
3718 }
3719 htmlInitParserCtxt(ctxt);
3720 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3721 if (input == NULL) {
3722 perror("malloc");
3723 xmlFree(ctxt);
3724 return(NULL);
3725 }
3726 memset(input, 0, sizeof(htmlParserInput));
3727
3728 input->line = 1;
3729 input->col = 1;
3730 input->base = cur;
3731 input->cur = cur;
3732
3733 inputPush(ctxt, input);
3734 return(ctxt);
3735}
3736
3737/************************************************************************
3738 * *
3739 * Progressive parsing interfaces *
3740 * *
3741 ************************************************************************/
3742
3743/**
3744 * htmlParseLookupSequence:
3745 * @ctxt: an HTML parser context
3746 * @first: the first char to lookup
3747 * @next: the next char to lookup or zero
3748 * @third: the next char to lookup or zero
3749 *
3750 * Try to find if a sequence (first, next, third) or just (first next) or
3751 * (first) is available in the input stream.
3752 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3753 * to avoid rescanning sequences of bytes, it DOES change the state of the
3754 * parser, do not use liberally.
3755 * This is basically similar to xmlParseLookupSequence()
3756 *
3757 * Returns the index to the current parsing point if the full sequence
3758 * is available, -1 otherwise.
3759 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003760static int
Owen Taylor3473f882001-02-23 17:55:21 +00003761htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3762 xmlChar next, xmlChar third) {
3763 int base, len;
3764 htmlParserInputPtr in;
3765 const xmlChar *buf;
3766
3767 in = ctxt->input;
3768 if (in == NULL) return(-1);
3769 base = in->cur - in->base;
3770 if (base < 0) return(-1);
3771 if (ctxt->checkIndex > base)
3772 base = ctxt->checkIndex;
3773 if (in->buf == NULL) {
3774 buf = in->base;
3775 len = in->length;
3776 } else {
3777 buf = in->buf->buffer->content;
3778 len = in->buf->buffer->use;
3779 }
3780 /* take into account the sequence length */
3781 if (third) len -= 2;
3782 else if (next) len --;
3783 for (;base < len;base++) {
3784 if (buf[base] == first) {
3785 if (third != 0) {
3786 if ((buf[base + 1] != next) ||
3787 (buf[base + 2] != third)) continue;
3788 } else if (next != 0) {
3789 if (buf[base + 1] != next) continue;
3790 }
3791 ctxt->checkIndex = 0;
3792#ifdef DEBUG_PUSH
3793 if (next == 0)
3794 xmlGenericError(xmlGenericErrorContext,
3795 "HPP: lookup '%c' found at %d\n",
3796 first, base);
3797 else if (third == 0)
3798 xmlGenericError(xmlGenericErrorContext,
3799 "HPP: lookup '%c%c' found at %d\n",
3800 first, next, base);
3801 else
3802 xmlGenericError(xmlGenericErrorContext,
3803 "HPP: lookup '%c%c%c' found at %d\n",
3804 first, next, third, base);
3805#endif
3806 return(base - (in->cur - in->base));
3807 }
3808 }
3809 ctxt->checkIndex = base;
3810#ifdef DEBUG_PUSH
3811 if (next == 0)
3812 xmlGenericError(xmlGenericErrorContext,
3813 "HPP: lookup '%c' failed\n", first);
3814 else if (third == 0)
3815 xmlGenericError(xmlGenericErrorContext,
3816 "HPP: lookup '%c%c' failed\n", first, next);
3817 else
3818 xmlGenericError(xmlGenericErrorContext,
3819 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3820#endif
3821 return(-1);
3822}
3823
3824/**
3825 * htmlParseTryOrFinish:
3826 * @ctxt: an HTML parser context
3827 * @terminate: last chunk indicator
3828 *
3829 * Try to progress on parsing
3830 *
3831 * Returns zero if no parsing was possible
3832 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003833static int
Owen Taylor3473f882001-02-23 17:55:21 +00003834htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3835 int ret = 0;
3836 htmlParserInputPtr in;
3837 int avail = 0;
3838 xmlChar cur, next;
3839
3840#ifdef DEBUG_PUSH
3841 switch (ctxt->instate) {
3842 case XML_PARSER_EOF:
3843 xmlGenericError(xmlGenericErrorContext,
3844 "HPP: try EOF\n"); break;
3845 case XML_PARSER_START:
3846 xmlGenericError(xmlGenericErrorContext,
3847 "HPP: try START\n"); break;
3848 case XML_PARSER_MISC:
3849 xmlGenericError(xmlGenericErrorContext,
3850 "HPP: try MISC\n");break;
3851 case XML_PARSER_COMMENT:
3852 xmlGenericError(xmlGenericErrorContext,
3853 "HPP: try COMMENT\n");break;
3854 case XML_PARSER_PROLOG:
3855 xmlGenericError(xmlGenericErrorContext,
3856 "HPP: try PROLOG\n");break;
3857 case XML_PARSER_START_TAG:
3858 xmlGenericError(xmlGenericErrorContext,
3859 "HPP: try START_TAG\n");break;
3860 case XML_PARSER_CONTENT:
3861 xmlGenericError(xmlGenericErrorContext,
3862 "HPP: try CONTENT\n");break;
3863 case XML_PARSER_CDATA_SECTION:
3864 xmlGenericError(xmlGenericErrorContext,
3865 "HPP: try CDATA_SECTION\n");break;
3866 case XML_PARSER_END_TAG:
3867 xmlGenericError(xmlGenericErrorContext,
3868 "HPP: try END_TAG\n");break;
3869 case XML_PARSER_ENTITY_DECL:
3870 xmlGenericError(xmlGenericErrorContext,
3871 "HPP: try ENTITY_DECL\n");break;
3872 case XML_PARSER_ENTITY_VALUE:
3873 xmlGenericError(xmlGenericErrorContext,
3874 "HPP: try ENTITY_VALUE\n");break;
3875 case XML_PARSER_ATTRIBUTE_VALUE:
3876 xmlGenericError(xmlGenericErrorContext,
3877 "HPP: try ATTRIBUTE_VALUE\n");break;
3878 case XML_PARSER_DTD:
3879 xmlGenericError(xmlGenericErrorContext,
3880 "HPP: try DTD\n");break;
3881 case XML_PARSER_EPILOG:
3882 xmlGenericError(xmlGenericErrorContext,
3883 "HPP: try EPILOG\n");break;
3884 case XML_PARSER_PI:
3885 xmlGenericError(xmlGenericErrorContext,
3886 "HPP: try PI\n");break;
3887 case XML_PARSER_SYSTEM_LITERAL:
3888 xmlGenericError(xmlGenericErrorContext,
3889 "HPP: try SYSTEM_LITERAL\n");break;
3890 }
3891#endif
3892
3893 while (1) {
3894
3895 in = ctxt->input;
3896 if (in == NULL) break;
3897 if (in->buf == NULL)
3898 avail = in->length - (in->cur - in->base);
3899 else
3900 avail = in->buf->buffer->use - (in->cur - in->base);
3901 if ((avail == 0) && (terminate)) {
3902 htmlAutoClose(ctxt, NULL);
3903 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3904 /*
3905 * SAX: end of the document processing.
3906 */
3907 ctxt->instate = XML_PARSER_EOF;
3908 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3909 ctxt->sax->endDocument(ctxt->userData);
3910 }
3911 }
3912 if (avail < 1)
3913 goto done;
3914 switch (ctxt->instate) {
3915 case XML_PARSER_EOF:
3916 /*
3917 * Document parsing is done !
3918 */
3919 goto done;
3920 case XML_PARSER_START:
3921 /*
3922 * Very first chars read from the document flow.
3923 */
3924 cur = in->cur[0];
3925 if (IS_BLANK(cur)) {
3926 SKIP_BLANKS;
3927 if (in->buf == NULL)
3928 avail = in->length - (in->cur - in->base);
3929 else
3930 avail = in->buf->buffer->use - (in->cur - in->base);
3931 }
3932 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3933 ctxt->sax->setDocumentLocator(ctxt->userData,
3934 &xmlDefaultSAXLocator);
3935 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3936 (!ctxt->disableSAX))
3937 ctxt->sax->startDocument(ctxt->userData);
3938
3939 cur = in->cur[0];
3940 next = in->cur[1];
3941 if ((cur == '<') && (next == '!') &&
3942 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3943 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3944 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3945 (UPP(8) == 'E')) {
3946 if ((!terminate) &&
3947 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
3948 goto done;
3949#ifdef DEBUG_PUSH
3950 xmlGenericError(xmlGenericErrorContext,
3951 "HPP: Parsing internal subset\n");
3952#endif
3953 htmlParseDocTypeDecl(ctxt);
3954 ctxt->instate = XML_PARSER_PROLOG;
3955#ifdef DEBUG_PUSH
3956 xmlGenericError(xmlGenericErrorContext,
3957 "HPP: entering PROLOG\n");
3958#endif
3959 } else {
3960 ctxt->instate = XML_PARSER_MISC;
3961 }
3962#ifdef DEBUG_PUSH
3963 xmlGenericError(xmlGenericErrorContext,
3964 "HPP: entering MISC\n");
3965#endif
3966 break;
3967 case XML_PARSER_MISC:
3968 SKIP_BLANKS;
3969 if (in->buf == NULL)
3970 avail = in->length - (in->cur - in->base);
3971 else
3972 avail = in->buf->buffer->use - (in->cur - in->base);
3973 if (avail < 2)
3974 goto done;
3975 cur = in->cur[0];
3976 next = in->cur[1];
3977 if ((cur == '<') && (next == '!') &&
3978 (in->cur[2] == '-') && (in->cur[3] == '-')) {
3979 if ((!terminate) &&
3980 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
3981 goto done;
3982#ifdef DEBUG_PUSH
3983 xmlGenericError(xmlGenericErrorContext,
3984 "HPP: Parsing Comment\n");
3985#endif
3986 htmlParseComment(ctxt);
3987 ctxt->instate = XML_PARSER_MISC;
3988 } else if ((cur == '<') && (next == '!') &&
3989 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3990 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3991 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3992 (UPP(8) == 'E')) {
3993 if ((!terminate) &&
3994 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
3995 goto done;
3996#ifdef DEBUG_PUSH
3997 xmlGenericError(xmlGenericErrorContext,
3998 "HPP: Parsing internal subset\n");
3999#endif
4000 htmlParseDocTypeDecl(ctxt);
4001 ctxt->instate = XML_PARSER_PROLOG;
4002#ifdef DEBUG_PUSH
4003 xmlGenericError(xmlGenericErrorContext,
4004 "HPP: entering PROLOG\n");
4005#endif
4006 } else if ((cur == '<') && (next == '!') &&
4007 (avail < 9)) {
4008 goto done;
4009 } else {
4010 ctxt->instate = XML_PARSER_START_TAG;
4011#ifdef DEBUG_PUSH
4012 xmlGenericError(xmlGenericErrorContext,
4013 "HPP: entering START_TAG\n");
4014#endif
4015 }
4016 break;
4017 case XML_PARSER_PROLOG:
4018 SKIP_BLANKS;
4019 if (in->buf == NULL)
4020 avail = in->length - (in->cur - in->base);
4021 else
4022 avail = in->buf->buffer->use - (in->cur - in->base);
4023 if (avail < 2)
4024 goto done;
4025 cur = in->cur[0];
4026 next = in->cur[1];
4027 if ((cur == '<') && (next == '!') &&
4028 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4029 if ((!terminate) &&
4030 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4031 goto done;
4032#ifdef DEBUG_PUSH
4033 xmlGenericError(xmlGenericErrorContext,
4034 "HPP: Parsing Comment\n");
4035#endif
4036 htmlParseComment(ctxt);
4037 ctxt->instate = XML_PARSER_PROLOG;
4038 } else if ((cur == '<') && (next == '!') &&
4039 (avail < 4)) {
4040 goto done;
4041 } else {
4042 ctxt->instate = XML_PARSER_START_TAG;
4043#ifdef DEBUG_PUSH
4044 xmlGenericError(xmlGenericErrorContext,
4045 "HPP: entering START_TAG\n");
4046#endif
4047 }
4048 break;
4049 case XML_PARSER_EPILOG:
4050 if (in->buf == NULL)
4051 avail = in->length - (in->cur - in->base);
4052 else
4053 avail = in->buf->buffer->use - (in->cur - in->base);
4054 if (avail < 1)
4055 goto done;
4056 cur = in->cur[0];
4057 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004058 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004059 goto done;
4060 }
4061 if (avail < 2)
4062 goto done;
4063 next = in->cur[1];
4064 if ((cur == '<') && (next == '!') &&
4065 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4066 if ((!terminate) &&
4067 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4068 goto done;
4069#ifdef DEBUG_PUSH
4070 xmlGenericError(xmlGenericErrorContext,
4071 "HPP: Parsing Comment\n");
4072#endif
4073 htmlParseComment(ctxt);
4074 ctxt->instate = XML_PARSER_EPILOG;
4075 } else if ((cur == '<') && (next == '!') &&
4076 (avail < 4)) {
4077 goto done;
4078 } else {
4079 ctxt->errNo = XML_ERR_DOCUMENT_END;
4080 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4081 ctxt->sax->error(ctxt->userData,
4082 "Extra content at the end of the document\n");
4083 ctxt->wellFormed = 0;
4084 ctxt->instate = XML_PARSER_EOF;
4085#ifdef DEBUG_PUSH
4086 xmlGenericError(xmlGenericErrorContext,
4087 "HPP: entering EOF\n");
4088#endif
4089 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4090 ctxt->sax->endDocument(ctxt->userData);
4091 goto done;
4092 }
4093 break;
4094 case XML_PARSER_START_TAG: {
4095 xmlChar *name, *oldname;
4096 int depth = ctxt->nameNr;
4097 htmlElemDescPtr info;
4098
4099 if (avail < 2)
4100 goto done;
4101 cur = in->cur[0];
4102 if (cur != '<') {
4103 ctxt->instate = XML_PARSER_CONTENT;
4104#ifdef DEBUG_PUSH
4105 xmlGenericError(xmlGenericErrorContext,
4106 "HPP: entering CONTENT\n");
4107#endif
4108 break;
4109 }
4110 if ((!terminate) &&
4111 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4112 goto done;
4113
4114 oldname = xmlStrdup(ctxt->name);
4115 htmlParseStartTag(ctxt);
4116 name = ctxt->name;
4117#ifdef DEBUG
4118 if (oldname == NULL)
4119 xmlGenericError(xmlGenericErrorContext,
4120 "Start of element %s\n", name);
4121 else if (name == NULL)
4122 xmlGenericError(xmlGenericErrorContext,
4123 "Start of element failed, was %s\n",
4124 oldname);
4125 else
4126 xmlGenericError(xmlGenericErrorContext,
4127 "Start of element %s, was %s\n",
4128 name, oldname);
4129#endif
4130 if (((depth == ctxt->nameNr) &&
4131 (xmlStrEqual(oldname, ctxt->name))) ||
4132 (name == NULL)) {
4133 if (CUR == '>')
4134 NEXT;
4135 if (oldname != NULL)
4136 xmlFree(oldname);
4137 break;
4138 }
4139 if (oldname != NULL)
4140 xmlFree(oldname);
4141
4142 /*
4143 * Lookup the info for that element.
4144 */
4145 info = htmlTagLookup(name);
4146 if (info == NULL) {
4147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4148 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4149 name);
4150 ctxt->wellFormed = 0;
4151 } else if (info->depr) {
4152 /***************************
4153 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4154 ctxt->sax->warning(ctxt->userData,
4155 "Tag %s is deprecated\n",
4156 name);
4157 ***************************/
4158 }
4159
4160 /*
4161 * Check for an Empty Element labelled the XML/SGML way
4162 */
4163 if ((CUR == '/') && (NXT(1) == '>')) {
4164 SKIP(2);
4165 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4166 ctxt->sax->endElement(ctxt->userData, name);
4167 oldname = htmlnamePop(ctxt);
4168#ifdef DEBUG
4169 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4170 oldname);
4171#endif
4172 if (oldname != NULL)
4173 xmlFree(oldname);
4174 ctxt->instate = XML_PARSER_CONTENT;
4175#ifdef DEBUG_PUSH
4176 xmlGenericError(xmlGenericErrorContext,
4177 "HPP: entering CONTENT\n");
4178#endif
4179 break;
4180 }
4181
4182 if (CUR == '>') {
4183 NEXT;
4184 } else {
4185 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4186 ctxt->sax->error(ctxt->userData,
4187 "Couldn't find end of Start Tag %s\n",
4188 name);
4189 ctxt->wellFormed = 0;
4190
4191 /*
4192 * end of parsing of this node.
4193 */
4194 if (xmlStrEqual(name, ctxt->name)) {
4195 nodePop(ctxt);
4196 oldname = htmlnamePop(ctxt);
4197#ifdef DEBUG
4198 xmlGenericError(xmlGenericErrorContext,
4199 "End of start tag problem: popping out %s\n", oldname);
4200#endif
4201 if (oldname != NULL)
4202 xmlFree(oldname);
4203 }
4204
4205 ctxt->instate = XML_PARSER_CONTENT;
4206#ifdef DEBUG_PUSH
4207 xmlGenericError(xmlGenericErrorContext,
4208 "HPP: entering CONTENT\n");
4209#endif
4210 break;
4211 }
4212
4213 /*
4214 * Check for an Empty Element from DTD definition
4215 */
4216 if ((info != NULL) && (info->empty)) {
4217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4218 ctxt->sax->endElement(ctxt->userData, name);
4219 oldname = htmlnamePop(ctxt);
4220#ifdef DEBUG
4221 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4222#endif
4223 if (oldname != NULL)
4224 xmlFree(oldname);
4225 }
4226 ctxt->instate = XML_PARSER_CONTENT;
4227#ifdef DEBUG_PUSH
4228 xmlGenericError(xmlGenericErrorContext,
4229 "HPP: entering CONTENT\n");
4230#endif
4231 break;
4232 }
4233 case XML_PARSER_CONTENT: {
4234 long cons;
4235 /*
4236 * Handle preparsed entities and charRef
4237 */
4238 if (ctxt->token != 0) {
4239 xmlChar chr[2] = { 0 , 0 } ;
4240
4241 chr[0] = (xmlChar) ctxt->token;
4242 htmlCheckParagraph(ctxt);
4243 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4244 ctxt->sax->characters(ctxt->userData, chr, 1);
4245 ctxt->token = 0;
4246 ctxt->checkIndex = 0;
4247 }
4248 if ((avail == 1) && (terminate)) {
4249 cur = in->cur[0];
4250 if ((cur != '<') && (cur != '&')) {
4251 if (ctxt->sax != NULL) {
4252 if (IS_BLANK(cur)) {
4253 if (ctxt->sax->ignorableWhitespace != NULL)
4254 ctxt->sax->ignorableWhitespace(
4255 ctxt->userData, &cur, 1);
4256 } else {
4257 htmlCheckParagraph(ctxt);
4258 if (ctxt->sax->characters != NULL)
4259 ctxt->sax->characters(
4260 ctxt->userData, &cur, 1);
4261 }
4262 }
4263 ctxt->token = 0;
4264 ctxt->checkIndex = 0;
4265 NEXT;
4266 }
4267 break;
4268 }
4269 if (avail < 2)
4270 goto done;
4271 cur = in->cur[0];
4272 next = in->cur[1];
4273 cons = ctxt->nbChars;
4274 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4275 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4276 /*
4277 * Handle SCRIPT/STYLE separately
4278 */
4279 if ((!terminate) &&
4280 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4281 goto done;
4282 htmlParseScript(ctxt);
4283 if ((cur == '<') && (next == '/')) {
4284 ctxt->instate = XML_PARSER_END_TAG;
4285 ctxt->checkIndex = 0;
4286#ifdef DEBUG_PUSH
4287 xmlGenericError(xmlGenericErrorContext,
4288 "HPP: entering END_TAG\n");
4289#endif
4290 break;
4291 }
4292 } else {
4293 /*
4294 * Sometimes DOCTYPE arrives in the middle of the document
4295 */
4296 if ((cur == '<') && (next == '!') &&
4297 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4298 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4299 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4300 (UPP(8) == 'E')) {
4301 if ((!terminate) &&
4302 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4303 goto done;
4304 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4305 ctxt->sax->error(ctxt->userData,
4306 "Misplaced DOCTYPE declaration\n");
4307 ctxt->wellFormed = 0;
4308 htmlParseDocTypeDecl(ctxt);
4309 } else if ((cur == '<') && (next == '!') &&
4310 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4311 if ((!terminate) &&
4312 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4313 goto done;
4314#ifdef DEBUG_PUSH
4315 xmlGenericError(xmlGenericErrorContext,
4316 "HPP: Parsing Comment\n");
4317#endif
4318 htmlParseComment(ctxt);
4319 ctxt->instate = XML_PARSER_CONTENT;
4320 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4321 goto done;
4322 } else if ((cur == '<') && (next == '/')) {
4323 ctxt->instate = XML_PARSER_END_TAG;
4324 ctxt->checkIndex = 0;
4325#ifdef DEBUG_PUSH
4326 xmlGenericError(xmlGenericErrorContext,
4327 "HPP: entering END_TAG\n");
4328#endif
4329 break;
4330 } else if (cur == '<') {
4331 ctxt->instate = XML_PARSER_START_TAG;
4332 ctxt->checkIndex = 0;
4333#ifdef DEBUG_PUSH
4334 xmlGenericError(xmlGenericErrorContext,
4335 "HPP: entering START_TAG\n");
4336#endif
4337 break;
4338 } else if (cur == '&') {
4339 if ((!terminate) &&
4340 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4341 goto done;
4342#ifdef DEBUG_PUSH
4343 xmlGenericError(xmlGenericErrorContext,
4344 "HPP: Parsing Reference\n");
4345#endif
4346 /* TODO: check generation of subtrees if noent !!! */
4347 htmlParseReference(ctxt);
4348 } else {
4349 /* TODO Avoid the extra copy, handle directly !!!!!! */
4350 /*
4351 * Goal of the following test is :
4352 * - minimize calls to the SAX 'character' callback
4353 * when they are mergeable
4354 */
4355 if ((ctxt->inputNr == 1) &&
4356 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4357 if ((!terminate) &&
4358 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4359 goto done;
4360 }
4361 ctxt->checkIndex = 0;
4362#ifdef DEBUG_PUSH
4363 xmlGenericError(xmlGenericErrorContext,
4364 "HPP: Parsing char data\n");
4365#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004366 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004367 }
4368 }
4369 if (cons == ctxt->nbChars) {
4370 if (ctxt->node != NULL) {
4371 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4372 ctxt->sax->error(ctxt->userData,
4373 "detected an error in element content\n");
4374 ctxt->wellFormed = 0;
4375 }
4376 NEXT;
4377 break;
4378 }
4379
4380 break;
4381 }
4382 case XML_PARSER_END_TAG:
4383 if (avail < 2)
4384 goto done;
4385 if ((!terminate) &&
4386 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4387 goto done;
4388 htmlParseEndTag(ctxt);
4389 if (ctxt->nameNr == 0) {
4390 ctxt->instate = XML_PARSER_EPILOG;
4391 } else {
4392 ctxt->instate = XML_PARSER_CONTENT;
4393 }
4394 ctxt->checkIndex = 0;
4395#ifdef DEBUG_PUSH
4396 xmlGenericError(xmlGenericErrorContext,
4397 "HPP: entering CONTENT\n");
4398#endif
4399 break;
4400 case XML_PARSER_CDATA_SECTION:
4401 xmlGenericError(xmlGenericErrorContext,
4402 "HPP: internal error, state == CDATA\n");
4403 ctxt->instate = XML_PARSER_CONTENT;
4404 ctxt->checkIndex = 0;
4405#ifdef DEBUG_PUSH
4406 xmlGenericError(xmlGenericErrorContext,
4407 "HPP: entering CONTENT\n");
4408#endif
4409 break;
4410 case XML_PARSER_DTD:
4411 xmlGenericError(xmlGenericErrorContext,
4412 "HPP: internal error, state == DTD\n");
4413 ctxt->instate = XML_PARSER_CONTENT;
4414 ctxt->checkIndex = 0;
4415#ifdef DEBUG_PUSH
4416 xmlGenericError(xmlGenericErrorContext,
4417 "HPP: entering CONTENT\n");
4418#endif
4419 break;
4420 case XML_PARSER_COMMENT:
4421 xmlGenericError(xmlGenericErrorContext,
4422 "HPP: internal error, state == COMMENT\n");
4423 ctxt->instate = XML_PARSER_CONTENT;
4424 ctxt->checkIndex = 0;
4425#ifdef DEBUG_PUSH
4426 xmlGenericError(xmlGenericErrorContext,
4427 "HPP: entering CONTENT\n");
4428#endif
4429 break;
4430 case XML_PARSER_PI:
4431 xmlGenericError(xmlGenericErrorContext,
4432 "HPP: internal error, state == PI\n");
4433 ctxt->instate = XML_PARSER_CONTENT;
4434 ctxt->checkIndex = 0;
4435#ifdef DEBUG_PUSH
4436 xmlGenericError(xmlGenericErrorContext,
4437 "HPP: entering CONTENT\n");
4438#endif
4439 break;
4440 case XML_PARSER_ENTITY_DECL:
4441 xmlGenericError(xmlGenericErrorContext,
4442 "HPP: internal error, state == ENTITY_DECL\n");
4443 ctxt->instate = XML_PARSER_CONTENT;
4444 ctxt->checkIndex = 0;
4445#ifdef DEBUG_PUSH
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: entering CONTENT\n");
4448#endif
4449 break;
4450 case XML_PARSER_ENTITY_VALUE:
4451 xmlGenericError(xmlGenericErrorContext,
4452 "HPP: internal error, state == ENTITY_VALUE\n");
4453 ctxt->instate = XML_PARSER_CONTENT;
4454 ctxt->checkIndex = 0;
4455#ifdef DEBUG_PUSH
4456 xmlGenericError(xmlGenericErrorContext,
4457 "HPP: entering DTD\n");
4458#endif
4459 break;
4460 case XML_PARSER_ATTRIBUTE_VALUE:
4461 xmlGenericError(xmlGenericErrorContext,
4462 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4463 ctxt->instate = XML_PARSER_START_TAG;
4464 ctxt->checkIndex = 0;
4465#ifdef DEBUG_PUSH
4466 xmlGenericError(xmlGenericErrorContext,
4467 "HPP: entering START_TAG\n");
4468#endif
4469 break;
4470 case XML_PARSER_SYSTEM_LITERAL:
4471 xmlGenericError(xmlGenericErrorContext,
4472 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4473 ctxt->instate = XML_PARSER_CONTENT;
4474 ctxt->checkIndex = 0;
4475#ifdef DEBUG_PUSH
4476 xmlGenericError(xmlGenericErrorContext,
4477 "HPP: entering CONTENT\n");
4478#endif
4479 break;
4480 case XML_PARSER_IGNORE:
4481 xmlGenericError(xmlGenericErrorContext,
4482 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4483 ctxt->instate = XML_PARSER_CONTENT;
4484 ctxt->checkIndex = 0;
4485#ifdef DEBUG_PUSH
4486 xmlGenericError(xmlGenericErrorContext,
4487 "HPP: entering CONTENT\n");
4488#endif
4489 break;
4490 }
4491 }
4492done:
4493 if ((avail == 0) && (terminate)) {
4494 htmlAutoClose(ctxt, NULL);
4495 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4496 /*
4497 * SAX: end of the document processing.
4498 */
4499 ctxt->instate = XML_PARSER_EOF;
4500 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4501 ctxt->sax->endDocument(ctxt->userData);
4502 }
4503 }
4504 if ((ctxt->myDoc != NULL) &&
4505 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4506 (ctxt->instate == XML_PARSER_EPILOG))) {
4507 xmlDtdPtr dtd;
4508 dtd = xmlGetIntSubset(ctxt->myDoc);
4509 if (dtd == NULL)
4510 ctxt->myDoc->intSubset =
4511 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4512 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4513 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4514 }
4515#ifdef DEBUG_PUSH
4516 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4517#endif
4518 return(ret);
4519}
4520
4521/**
Owen Taylor3473f882001-02-23 17:55:21 +00004522 * htmlParseChunk:
4523 * @ctxt: an XML parser context
4524 * @chunk: an char array
4525 * @size: the size in byte of the chunk
4526 * @terminate: last chunk indicator
4527 *
4528 * Parse a Chunk of memory
4529 *
4530 * Returns zero if no error, the xmlParserErrors otherwise.
4531 */
4532int
4533htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4534 int terminate) {
4535 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4536 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4537 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4538 int cur = ctxt->input->cur - ctxt->input->base;
4539
4540 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4541 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4542 ctxt->input->cur = ctxt->input->base + cur;
4543#ifdef DEBUG_PUSH
4544 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4545#endif
4546
4547 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4548 htmlParseTryOrFinish(ctxt, terminate);
4549 } else if (ctxt->instate != XML_PARSER_EOF) {
4550 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4551 htmlParseTryOrFinish(ctxt, terminate);
4552 }
4553 if (terminate) {
4554 if ((ctxt->instate != XML_PARSER_EOF) &&
4555 (ctxt->instate != XML_PARSER_EPILOG) &&
4556 (ctxt->instate != XML_PARSER_MISC)) {
4557 ctxt->errNo = XML_ERR_DOCUMENT_END;
4558 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4559 ctxt->sax->error(ctxt->userData,
4560 "Extra content at the end of the document\n");
4561 ctxt->wellFormed = 0;
4562 }
4563 if (ctxt->instate != XML_PARSER_EOF) {
4564 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4565 ctxt->sax->endDocument(ctxt->userData);
4566 }
4567 ctxt->instate = XML_PARSER_EOF;
4568 }
4569 return((xmlParserErrors) ctxt->errNo);
4570}
4571
4572/************************************************************************
4573 * *
4574 * User entry points *
4575 * *
4576 ************************************************************************/
4577
4578/**
4579 * htmlCreatePushParserCtxt :
4580 * @sax: a SAX handler
4581 * @user_data: The user data returned on SAX callbacks
4582 * @chunk: a pointer to an array of chars
4583 * @size: number of chars in the array
4584 * @filename: an optional file name or URI
4585 * @enc: an optional encoding
4586 *
4587 * Create a parser context for using the HTML parser in push mode
4588 * To allow content encoding detection, @size should be >= 4
4589 * The value of @filename is used for fetching external entities
4590 * and error/warning reports.
4591 *
4592 * Returns the new parser context or NULL
4593 */
4594htmlParserCtxtPtr
4595htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4596 const char *chunk, int size, const char *filename,
4597 xmlCharEncoding enc) {
4598 htmlParserCtxtPtr ctxt;
4599 htmlParserInputPtr inputStream;
4600 xmlParserInputBufferPtr buf;
4601
4602 buf = xmlAllocParserInputBuffer(enc);
4603 if (buf == NULL) return(NULL);
4604
4605 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4606 if (ctxt == NULL) {
4607 xmlFree(buf);
4608 return(NULL);
4609 }
4610 memset(ctxt, 0, sizeof(htmlParserCtxt));
4611 htmlInitParserCtxt(ctxt);
4612 if (sax != NULL) {
4613 if (ctxt->sax != &htmlDefaultSAXHandler)
4614 xmlFree(ctxt->sax);
4615 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4616 if (ctxt->sax == NULL) {
4617 xmlFree(buf);
4618 xmlFree(ctxt);
4619 return(NULL);
4620 }
4621 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4622 if (user_data != NULL)
4623 ctxt->userData = user_data;
4624 }
4625 if (filename == NULL) {
4626 ctxt->directory = NULL;
4627 } else {
4628 ctxt->directory = xmlParserGetDirectory(filename);
4629 }
4630
4631 inputStream = htmlNewInputStream(ctxt);
4632 if (inputStream == NULL) {
4633 xmlFreeParserCtxt(ctxt);
4634 return(NULL);
4635 }
4636
4637 if (filename == NULL)
4638 inputStream->filename = NULL;
4639 else
4640 inputStream->filename = xmlMemStrdup(filename);
4641 inputStream->buf = buf;
4642 inputStream->base = inputStream->buf->buffer->content;
4643 inputStream->cur = inputStream->buf->buffer->content;
4644
4645 inputPush(ctxt, inputStream);
4646
4647 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4648 (ctxt->input->buf != NULL)) {
4649 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4650#ifdef DEBUG_PUSH
4651 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4652#endif
4653 }
4654
4655 return(ctxt);
4656}
4657
4658/**
4659 * htmlSAXParseDoc :
4660 * @cur: a pointer to an array of xmlChar
4661 * @encoding: a free form C string describing the HTML document encoding, or NULL
4662 * @sax: the SAX handler block
4663 * @userData: if using SAX, this pointer will be provided on callbacks.
4664 *
4665 * parse an HTML in-memory document and build a tree.
4666 * It use the given SAX function block to handle the parsing callback.
4667 * If sax is NULL, fallback to the default DOM tree building routines.
4668 *
4669 * Returns the resulting document tree
4670 */
4671
4672htmlDocPtr
4673htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4674 htmlDocPtr ret;
4675 htmlParserCtxtPtr ctxt;
4676
4677 if (cur == NULL) return(NULL);
4678
4679
4680 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4681 if (ctxt == NULL) return(NULL);
4682 if (sax != NULL) {
4683 ctxt->sax = sax;
4684 ctxt->userData = userData;
4685 }
4686
4687 htmlParseDocument(ctxt);
4688 ret = ctxt->myDoc;
4689 if (sax != NULL) {
4690 ctxt->sax = NULL;
4691 ctxt->userData = NULL;
4692 }
4693 htmlFreeParserCtxt(ctxt);
4694
4695 return(ret);
4696}
4697
4698/**
4699 * htmlParseDoc :
4700 * @cur: a pointer to an array of xmlChar
4701 * @encoding: a free form C string describing the HTML document encoding, or NULL
4702 *
4703 * parse an HTML in-memory document and build a tree.
4704 *
4705 * Returns the resulting document tree
4706 */
4707
4708htmlDocPtr
4709htmlParseDoc(xmlChar *cur, const char *encoding) {
4710 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4711}
4712
4713
4714/**
4715 * htmlCreateFileParserCtxt :
4716 * @filename: the filename
4717 * @encoding: a free form C string describing the HTML document encoding, or NULL
4718 *
4719 * Create a parser context for a file content.
4720 * Automatic support for ZLIB/Compress compressed document is provided
4721 * by default if found at compile-time.
4722 *
4723 * Returns the new parser context or NULL
4724 */
4725htmlParserCtxtPtr
4726htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4727{
4728 htmlParserCtxtPtr ctxt;
4729 htmlParserInputPtr inputStream;
4730 xmlParserInputBufferPtr buf;
4731 /* htmlCharEncoding enc; */
4732 xmlChar *content, *content_line = (xmlChar *) "charset=";
4733
4734 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4735 if (buf == NULL) return(NULL);
4736
4737 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4738 if (ctxt == NULL) {
4739 perror("malloc");
4740 return(NULL);
4741 }
4742 memset(ctxt, 0, sizeof(htmlParserCtxt));
4743 htmlInitParserCtxt(ctxt);
4744 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4745 if (inputStream == NULL) {
4746 perror("malloc");
4747 xmlFree(ctxt);
4748 return(NULL);
4749 }
4750 memset(inputStream, 0, sizeof(htmlParserInput));
4751
4752 inputStream->filename = xmlMemStrdup(filename);
4753 inputStream->line = 1;
4754 inputStream->col = 1;
4755 inputStream->buf = buf;
4756 inputStream->directory = NULL;
4757
4758 inputStream->base = inputStream->buf->buffer->content;
4759 inputStream->cur = inputStream->buf->buffer->content;
4760 inputStream->free = NULL;
4761
4762 inputPush(ctxt, inputStream);
4763
4764 /* set encoding */
4765 if (encoding) {
4766 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4767 if (content) {
4768 strcpy ((char *)content, (char *)content_line);
4769 strcat ((char *)content, (char *)encoding);
4770 htmlCheckEncoding (ctxt, content);
4771 xmlFree (content);
4772 }
4773 }
4774
4775 return(ctxt);
4776}
4777
4778/**
4779 * htmlSAXParseFile :
4780 * @filename: the filename
4781 * @encoding: a free form C string describing the HTML document encoding, or NULL
4782 * @sax: the SAX handler block
4783 * @userData: if using SAX, this pointer will be provided on callbacks.
4784 *
4785 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4786 * compressed document is provided by default if found at compile-time.
4787 * It use the given SAX function block to handle the parsing callback.
4788 * If sax is NULL, fallback to the default DOM tree building routines.
4789 *
4790 * Returns the resulting document tree
4791 */
4792
4793htmlDocPtr
4794htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4795 void *userData) {
4796 htmlDocPtr ret;
4797 htmlParserCtxtPtr ctxt;
4798 htmlSAXHandlerPtr oldsax = NULL;
4799
4800 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4801 if (ctxt == NULL) return(NULL);
4802 if (sax != NULL) {
4803 oldsax = ctxt->sax;
4804 ctxt->sax = sax;
4805 ctxt->userData = userData;
4806 }
4807
4808 htmlParseDocument(ctxt);
4809
4810 ret = ctxt->myDoc;
4811 if (sax != NULL) {
4812 ctxt->sax = oldsax;
4813 ctxt->userData = NULL;
4814 }
4815 htmlFreeParserCtxt(ctxt);
4816
4817 return(ret);
4818}
4819
4820/**
4821 * htmlParseFile :
4822 * @filename: the filename
4823 * @encoding: a free form C string describing the HTML document encoding, or NULL
4824 *
4825 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4826 * compressed document is provided by default if found at compile-time.
4827 *
4828 * Returns the resulting document tree
4829 */
4830
4831htmlDocPtr
4832htmlParseFile(const char *filename, const char *encoding) {
4833 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4834}
4835
4836/**
4837 * htmlHandleOmittedElem:
4838 * @val: int 0 or 1
4839 *
4840 * Set and return the previous value for handling HTML omitted tags.
4841 *
4842 * Returns the last value for 0 for no handling, 1 for auto insertion.
4843 */
4844
4845int
4846htmlHandleOmittedElem(int val) {
4847 int old = htmlOmittedDefaultValue;
4848
4849 htmlOmittedDefaultValue = val;
4850 return(old);
4851}
4852
4853#endif /* LIBXML_HTML_ENABLED */