blob: 49b026417b2c40205a968a0eb0e0cbbe962cc1d7 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#include "win32config.h"
11#else
12#include "config.h"
13#endif
14
15#include <libxml/xmlversion.h>
16#ifdef LIBXML_HTML_ENABLED
17#include <stdio.h>
18#include <string.h>
19#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
23#include <stdlib.h>
24#endif
25#ifdef HAVE_SYS_STAT_H
26#include <sys/stat.h>
27#endif
28#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
38#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
40#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
42#include <libxml/xmlerror.h>
43#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000044#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045#include <libxml/entities.h>
46#include <libxml/encoding.h>
47#include <libxml/valid.h>
48#include <libxml/xmlIO.h>
49
/*
 * Size limits used by the parser.
 * NOTE(review): none of the three is referenced in this part of the file;
 * presumably they bound name and scratch buffers further down - confirm.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to get the xmlGenericError() traces guarded by #ifdef DEBUG. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * Controls the generation of omitted elements.  When zero,
 * htmlCheckImplied() returns without doing anything and
 * htmlCheckParagraph() only implies a <p> when no element is open.
 */
int htmlOmittedDefaultValue = 1;

/************************************************************************
 *                                                                      *
 *              When running GCC in vacuum cleaner mode                 *
 *                                                                      *
 ************************************************************************/

/* Marks a declaration as intentionally unused; empty on non-GCC compilers. */
#ifdef __GNUC__
#define UNUSED __attribute__((__unused__))
#else
#define UNUSED
#endif

/* Forward declaration; the definition is not in this part of the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar end2, xmlChar end3);
73
74/************************************************************************
75 * *
Owen Taylor3473f882001-02-23 17:55:21 +000076 * Parser stacks related functions and macros *
77 * *
78 ************************************************************************/
79
80/*
81 * Generic function for accessing stacks in the Parser Context
82 */
83
84#define PUSH_AND_POP(scope, type, name) \
85scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
86 if (ctxt->name##Nr >= ctxt->name##Max) { \
87 ctxt->name##Max *= 2; \
88 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
89 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
90 if (ctxt->name##Tab == NULL) { \
91 xmlGenericError(xmlGenericErrorContext, \
92 "realloc failed !\n"); \
93 return(0); \
94 } \
95 } \
96 ctxt->name##Tab[ctxt->name##Nr] = value; \
97 ctxt->name = value; \
98 return(ctxt->name##Nr++); \
99} \
100scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
101 type ret; \
102 if (ctxt->name##Nr < 0) return(0); \
103 ctxt->name##Nr--; \
104 if (ctxt->name##Nr < 0) return(0); \
105 if (ctxt->name##Nr > 0) \
106 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
107 else \
108 ctxt->name = NULL; \
109 ret = ctxt->name##Tab[ctxt->name##Nr]; \
110 ctxt->name##Tab[ctxt->name##Nr] = 0; \
111 return(ret); \
112} \
113
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000114/* PUSH_AND_POP(static, xmlNodePtr, node) */
115PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000116
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  All of them expect a variable named `ctxt' pointing to
 * the parser context to be in scope.
 *
 * Dirty macros, i.e. they work on raw bytes of the input and are only
 * meaningful when comparing against or skipping ASCII:
 *
 *   CUR_PTR      the current position in the input buffer.
 *   CUR, CURRENT the byte at the current position, as an int.
 *   RAW          -1 when ctxt->token is set, the current byte otherwise.
 *   UPPER        the current byte passed through toupper().
 *   NXT(n)       the byte n positions after the current one.
 *   UPP(n)       NXT(n) passed through toupper().
 *   SKIP(n)      advance the input position and ctxt->nbChars by n.
 *
 * Buffer management:
 *
 *   SHRINK       calls xmlParserInputShrink() on the current input.
 *   GROW         calls xmlParserInputGrow() asking for INPUT_CHUNK more.
 *
 * Character level helpers:
 *
 *   SKIP_BLANKS       calls htmlSkipBlankChars().
 *   NEXT              calls xmlNextChar() and counts one more character.
 *   NEXTL(l)          accounts for line/column, clears ctxt->token, then
 *                     advances by l bytes and counts one character.
 *   CUR_CHAR(l)       htmlCurrentChar(); the length is stored in l.
 *   CUR_SCHAR(s, l)   xmlStringCurrentChar() on string s; length in l.
 *   COPY_BUF(l,b,i,v) appends the char v, whose encoded length is l, to
 *                     buffer b at index i and advances i.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/************
    Disabled tail of NEXTL, kept from the XML parser for reference:
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* wrapped in do/while so it is one statement under an unbraced if/else */
#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
193
194/**
195 * htmlCurrentChar:
196 * @ctxt: the HTML parser context
197 * @len: pointer to the length of the char read
198 *
199 * The current char value, if using UTF-8 this may actaully span multiple
200 * bytes in the input buffer. Implement the end of line normalization:
201 * 2.11 End-of-Line Handling
202 * If the encoding is unspecified, in the case we find an ISO-Latin-1
203 * char, then the encoding converter is plugged in automatically.
204 *
205 * Returns the current char value and its lenght
206 */
207
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000208static int
Owen Taylor3473f882001-02-23 17:55:21 +0000209htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
210 if (ctxt->instate == XML_PARSER_EOF)
211 return(0);
212
213 if (ctxt->token != 0) {
214 *len = 0;
215 return(ctxt->token);
216 }
217 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
218 /*
219 * We are supposed to handle UTF8, check it's valid
220 * From rfc2044: encoding of the Unicode values on UTF-8:
221 *
222 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
223 * 0000 0000-0000 007F 0xxxxxxx
224 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
225 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
226 *
227 * Check for the 0x110000 limit too
228 */
229 const unsigned char *cur = ctxt->input->cur;
230 unsigned char c;
231 unsigned int val;
232
233 c = *cur;
234 if (c & 0x80) {
235 if (cur[1] == 0)
236 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
237 if ((cur[1] & 0xc0) != 0x80)
238 goto encoding_error;
239 if ((c & 0xe0) == 0xe0) {
240
241 if (cur[2] == 0)
242 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
243 if ((cur[2] & 0xc0) != 0x80)
244 goto encoding_error;
245 if ((c & 0xf0) == 0xf0) {
246 if (cur[3] == 0)
247 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
248 if (((c & 0xf8) != 0xf0) ||
249 ((cur[3] & 0xc0) != 0x80))
250 goto encoding_error;
251 /* 4-byte code */
252 *len = 4;
253 val = (cur[0] & 0x7) << 18;
254 val |= (cur[1] & 0x3f) << 12;
255 val |= (cur[2] & 0x3f) << 6;
256 val |= cur[3] & 0x3f;
257 } else {
258 /* 3-byte code */
259 *len = 3;
260 val = (cur[0] & 0xf) << 12;
261 val |= (cur[1] & 0x3f) << 6;
262 val |= cur[2] & 0x3f;
263 }
264 } else {
265 /* 2-byte code */
266 *len = 2;
267 val = (cur[0] & 0x1f) << 6;
268 val |= cur[1] & 0x3f;
269 }
270 if (!IS_CHAR(val)) {
271 ctxt->errNo = XML_ERR_INVALID_ENCODING;
272 if ((ctxt->sax != NULL) &&
273 (ctxt->sax->error != NULL))
274 ctxt->sax->error(ctxt->userData,
275 "Char 0x%X out of allowed range\n", val);
276 ctxt->wellFormed = 0;
277 ctxt->disableSAX = 1;
278 }
279 return(val);
280 } else {
281 /* 1-byte code */
282 *len = 1;
283 return((int) *ctxt->input->cur);
284 }
285 }
286 /*
287 * Assume it's a fixed lenght encoding (1) with
288 * a compatibke encoding for the ASCII set, since
289 * XML constructs only use < 128 chars
290 */
291 *len = 1;
292 if ((int) *ctxt->input->cur < 0x80)
293 return((int) *ctxt->input->cur);
294
295 /*
296 * Humm this is bad, do an automatic flow conversion
297 */
298 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
299 ctxt->charset = XML_CHAR_ENCODING_UTF8;
300 return(xmlCurrentChar(ctxt, len));
301
302encoding_error:
303 /*
304 * If we detect an UTF8 error that probably mean that the
305 * input encoding didn't get properly advertized in the
306 * declaration header. Report the error and switch the encoding
307 * to ISO-Latin-1 (if you don't like this policy, just declare the
308 * encoding !)
309 */
310 ctxt->errNo = XML_ERR_INVALID_ENCODING;
311 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
312 ctxt->sax->error(ctxt->userData,
313 "Input is not proper UTF-8, indicate encoding !\n");
314 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
315 ctxt->input->cur[0], ctxt->input->cur[1],
316 ctxt->input->cur[2], ctxt->input->cur[3]);
317 }
318
319 ctxt->charset = XML_CHAR_ENCODING_8859_1;
320 *len = 1;
321 return((int) *ctxt->input->cur);
322}
323
324/**
Owen Taylor3473f882001-02-23 17:55:21 +0000325 * htmlSkipBlankChars:
326 * @ctxt: the HTML parser context
327 *
328 * skip all blanks character found at that point in the input streams.
329 *
330 * Returns the number of space chars skipped
331 */
332
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000333static int
Owen Taylor3473f882001-02-23 17:55:21 +0000334htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
335 int res = 0;
336
337 while (IS_BLANK(*(ctxt->input->cur))) {
338 if ((*ctxt->input->cur == 0) &&
339 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
340 xmlPopInput(ctxt);
341 } else {
342 if (*(ctxt->input->cur) == '\n') {
343 ctxt->input->line++; ctxt->input->col = 1;
344 } else ctxt->input->col++;
345 ctxt->input->cur++;
346 ctxt->nbChars++;
347 if (*ctxt->input->cur == 0)
348 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
349 }
350 res++;
351 }
352 return(res);
353}
354
355
356
357/************************************************************************
358 * *
359 * The list of HTML elements and their properties *
360 * *
361 ************************************************************************/
362
363/*
364 * Start Tag: 1 means the start tag can be ommited
365 * End Tag: 1 means the end tag can be ommited
366 * 2 means it's forbidden (empty elements)
367 * Depr: this element is deprecated
368 * DTD: 1 means that this element is valid only in the Loose DTD
369 * 2 means that this element is valid only in the Frameset DTD
370 *
371 * Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
372 */
373htmlElemDesc html40ElementTable[] = {
374{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
375{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
376{ "acronym", 0, 0, 0, 0, 0, 0, "" },
377{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
378{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
379{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
380{ "b", 0, 0, 0, 0, 0, 0, "bold text style" },
381{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
382{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
383{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
384{ "big", 0, 0, 0, 0, 0, 0, "large text style" },
385{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
386{ "body", 1, 1, 0, 0, 0, 0, "document body " },
387{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
388{ "button", 0, 0, 0, 0, 0, 0, "push button " },
389{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
390{ "center", 0, 0, 0, 0, 1, 1, "shorthand for div align=center " },
391{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
392{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
393{ "col", 0, 2, 2, 1, 0, 0, "table column " },
394{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
395{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
396{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
397{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
398{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
399{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
400{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
401{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
402{ "em", 0, 0, 0, 0, 0, 0, "emphasis" },
403{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
404{ "font", 0, 0, 0, 0, 1, 1, "local change to font " },
405{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
406{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
407{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
408{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
409{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
410{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
411{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
412{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
413{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
414{ "head", 1, 1, 0, 0, 0, 0, "document head " },
415{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
416{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
417{ "i", 0, 0, 0, 0, 0, 0, "italic text style" },
418{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
419{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
420{ "input", 0, 2, 2, 1, 0, 0, "form control " },
421{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
422{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
423{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
424{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
425{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
426{ "li", 0, 1, 1, 0, 0, 0, "list item " },
427{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
428{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
429{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
430{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
431{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
432{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
433{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
434{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
435{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
436{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
437{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
438{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
439{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
440{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
441{ "s", 0, 0, 0, 0, 1, 1, "strike-through text style" },
442{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
443{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
444{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
445{ "small", 0, 0, 0, 0, 0, 0, "small text style" },
446{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
447{ "strike", 0, 0, 0, 0, 1, 1, "strike-through text" },
448{ "strong", 0, 0, 0, 0, 0, 0, "strong emphasis" },
449{ "style", 0, 0, 0, 0, 0, 0, "style info " },
450{ "sub", 0, 0, 0, 0, 0, 0, "subscript" },
451{ "sup", 0, 0, 0, 0, 0, 0, "superscript " },
452{ "table", 0, 0, 0, 0, 0, 0, "&#160;" },
453{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
454{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
455{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
456{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
457{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
458{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
459{ "title", 0, 0, 0, 0, 0, 0, "document title " },
460{ "tr", 0, 1, 0, 0, 0, 0, "table row " },
461{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
462{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
463{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
464{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
465};
466
/*
 * Groups of start tags that imply the end of a current element: per the
 * original author, any tag of a group implies the end of the current
 * element if the type of that element belongs to the same group.
 * Each group is terminated by NULL, the whole list by an extra NULL.
 */
const char *htmlEquEnd[] = {
    /* list and option items */
    "dt", "dd", "li", "option", NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    /* block level containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
    NULL
};
/*
 * According to the HTML DTD, HR should be added to the headings group
 * above, as it is not allowed within a H1, H2, H3, etc. But we should
 * tolerate that case because many documents contain rules in headings...
 */
483
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL terminated groups, closed by one more
 * NULL.  The first name of a group is the tag being opened, the names
 * after it are the open elements that tag closes implicitly.
 */
const char *htmlStartClose[] = {
    "form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                    "dl", "ul", "ol", "menu", "dir", "address", "pre",
                    "listing", "xmp", "head", NULL,
    "head",         "p", NULL,
    "title",        "p", NULL,
    "body",         "head", "style", "link", "title", "p", NULL,
    "li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                    "pre", "listing", "xmp", "head", "li", NULL,
    "hr",           "p", "head", NULL,
    "h1",           "p", "head", NULL,
    "h2",           "p", "head", NULL,
    "h3",           "p", "head", NULL,
    "h4",           "p", "head", NULL,
    "h5",           "p", "head", NULL,
    "h6",           "p", "head", NULL,
    "dir",          "p", "head", NULL,
    "address",      "p", "head", "ul", NULL,
    "pre",          "p", "head", "ul", NULL,
    "listing",      "p", "head", NULL,
    "xmp",          "p", "head", NULL,
    "blockquote",   "p", "head", NULL,
    "dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                    "xmp", "head", NULL,
    "dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                    "head", "dd", NULL,
    "dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                    "head", "dt", NULL,
    "ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                    "listing", "xmp", NULL,
    "ol",           "p", "head", "ul", NULL,
    "menu",         "p", "head", "ul", NULL,
    "p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",          "p", "head", NULL,
    "noscript",     "p", "head", NULL,
    "center",       "font", "b", "i", "p", "head", NULL,
    "a",            "a", NULL,
    "caption",      "p", NULL,
    "colgroup",     "caption", "colgroup", "col", "p", NULL,
    "col",          "caption", "col", "p", NULL,
    "table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                    "listing", "xmp", "a", NULL,
    "th",           "th", "td", NULL,
    "td",           "th", "td", "p", NULL,
    "tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",        "caption", "col", "colgroup", NULL,
    "tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                    "tbody", "p", NULL,
    "tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                    "tfoot", "tbody", "p", NULL,
    "optgroup",     "option", NULL,
    "option",       "option", NULL,
    "fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                    "pre", "listing", "xmp", "a", NULL,
    NULL
};
543
/*
 * NULL terminated list of the HTML elements which are not supposed to
 * hold character data directly; htmlCheckParagraph() implies a p element
 * when one of them is the current element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
557
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic events).  The list has no terminator, iterate
 * over it with sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so it never matched */
    "onchange",
    "onselect"
};
583
584
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000585static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000586static int htmlStartCloseIndexinitialized = 0;
587
588/************************************************************************
589 * *
590 * functions to handle HTML specific data *
591 * *
592 ************************************************************************/
593
594/**
595 * htmlInitAutoClose:
596 *
597 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
598 * This is not reentrant. Call xmlInitParser() once before processing in
599 * case of use in multithreaded programs.
600 */
601void
602htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000603 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000604
605 if (htmlStartCloseIndexinitialized) return;
606
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000607 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
608 indx = 0;
609 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
610 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000611 while (htmlStartClose[i] != NULL) i++;
612 i++;
613 }
614 htmlStartCloseIndexinitialized = 1;
615}
616
617/**
618 * htmlTagLookup:
619 * @tag: The tag name in lowercase
620 *
621 * Lookup the HTML tag in the ElementTable
622 *
623 * Returns the related htmlElemDescPtr or NULL if not found.
624 */
625htmlElemDescPtr
626htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000627 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000628
629 for (i = 0; i < (sizeof(html40ElementTable) /
630 sizeof(html40ElementTable[0]));i++) {
631 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
632 return(&html40ElementTable[i]);
633 }
634 return(NULL);
635}
636
637/**
638 * htmlCheckAutoClose:
639 * @newtag: The new tag name
640 * @oldtag: The old tag name
641 *
642 * Checks wether the new tag is one of the registered valid tags for closing old.
643 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
644 *
645 * Returns 0 if no, 1 if yes.
646 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000647static int
Owen Taylor3473f882001-02-23 17:55:21 +0000648htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000649 int i, indx;
650 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000651
652 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
653
654 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000655 for (indx = 0; indx < 100;indx++) {
656 closed = htmlStartCloseIndex[indx];
657 if (closed == NULL) return(0);
658 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000659 }
660
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000661 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000662 i++;
663 while (htmlStartClose[i] != NULL) {
664 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
665 return(1);
666 }
667 i++;
668 }
669 return(0);
670}
671
672/**
673 * htmlAutoCloseOnClose:
674 * @ctxt: an HTML parser context
675 * @newtag: The new tag name
676 *
677 * The HTmL DtD allows an ending tag to implicitely close other tags.
678 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000679static void
Owen Taylor3473f882001-02-23 17:55:21 +0000680htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
681 htmlElemDescPtr info;
682 xmlChar *oldname;
683 int i;
684
685#ifdef DEBUG
686 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
687 for (i = 0;i < ctxt->nameNr;i++)
688 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
689#endif
690
691 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
692 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
693 }
694 if (i < 0) return;
695
696 while (!xmlStrEqual(newtag, ctxt->name)) {
697 info = htmlTagLookup(ctxt->name);
698 if ((info == NULL) || (info->endTag == 1)) {
699#ifdef DEBUG
700 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
701#endif
702 } else {
703 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
704 ctxt->sax->error(ctxt->userData,
705 "Opening and ending tag mismatch: %s and %s\n",
706 newtag, ctxt->name);
707 ctxt->wellFormed = 0;
708 }
709 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
710 ctxt->sax->endElement(ctxt->userData, ctxt->name);
711 oldname = htmlnamePop(ctxt);
712 if (oldname != NULL) {
713#ifdef DEBUG
714 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
715#endif
716 xmlFree(oldname);
717 }
718 }
719}
720
721/**
722 * htmlAutoClose:
723 * @ctxt: an HTML parser context
724 * @newtag: The new tag name or NULL
725 *
726 * The HTmL DtD allows a tag to implicitely close other tags.
727 * The list is kept in htmlStartClose array. This function is
728 * called when a new tag has been detected and generates the
729 * appropriates closes if possible/needed.
730 * If newtag is NULL this mean we are at the end of the resource
731 * and we should check
732 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000733static void
Owen Taylor3473f882001-02-23 17:55:21 +0000734htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
735 xmlChar *oldname;
736 while ((newtag != NULL) && (ctxt->name != NULL) &&
737 (htmlCheckAutoClose(newtag, ctxt->name))) {
738#ifdef DEBUG
739 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
740#endif
741 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
742 ctxt->sax->endElement(ctxt->userData, ctxt->name);
743 oldname = htmlnamePop(ctxt);
744 if (oldname != NULL) {
745#ifdef DEBUG
746 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
747#endif
748 xmlFree(oldname);
749 }
750 }
751 if (newtag == NULL) {
752 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
753 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
754 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
755 }
756 while ((newtag == NULL) && (ctxt->name != NULL) &&
757 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
758 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
759 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
762#endif
763 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
764 ctxt->sax->endElement(ctxt->userData, ctxt->name);
765 oldname = htmlnamePop(ctxt);
766 if (oldname != NULL) {
767#ifdef DEBUG
768 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
769#endif
770 xmlFree(oldname);
771 }
772 }
773
774}
775
776/**
777 * htmlAutoCloseTag:
778 * @doc: the HTML document
779 * @name: The tag name
780 * @elem: the HTML element
781 *
782 * The HTmL DtD allows a tag to implicitely close other tags.
783 * The list is kept in htmlStartClose array. This function checks
784 * if the element or one of it's children would autoclose the
785 * given tag.
786 *
787 * Returns 1 if autoclose, 0 otherwise
788 */
789int
790htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
791 htmlNodePtr child;
792
793 if (elem == NULL) return(1);
794 if (xmlStrEqual(name, elem->name)) return(0);
795 if (htmlCheckAutoClose(elem->name, name)) return(1);
796 child = elem->children;
797 while (child != NULL) {
798 if (htmlAutoCloseTag(doc, name, child)) return(1);
799 child = child->next;
800 }
801 return(0);
802}
803
804/**
805 * htmlIsAutoClosed:
806 * @doc: the HTML document
807 * @elem: the HTML element
808 *
809 * The HTmL DtD allows a tag to implicitely close other tags.
810 * The list is kept in htmlStartClose array. This function checks
811 * if a tag is autoclosed by one of it's child
812 *
813 * Returns 1 if autoclosed, 0 otherwise
814 */
815int
816htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
817 htmlNodePtr child;
818
819 if (elem == NULL) return(1);
820 child = elem->children;
821 while (child != NULL) {
822 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
823 child = child->next;
824 }
825 return(0);
826}
827
828/**
829 * htmlCheckImplied:
830 * @ctxt: an HTML parser context
831 * @newtag: The new tag name
832 *
833 * The HTML DtD allows a tag to exists only implicitely
834 * called when a new tag has been detected and generates the
835 * appropriates implicit tags if missing
836 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000837static void
Owen Taylor3473f882001-02-23 17:55:21 +0000838htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
839 if (!htmlOmittedDefaultValue)
840 return;
841 if (xmlStrEqual(newtag, BAD_CAST"html"))
842 return;
843 if (ctxt->nameNr <= 0) {
844#ifdef DEBUG
845 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
846#endif
847 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
848 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
849 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
850 }
851 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
852 return;
853 if ((ctxt->nameNr <= 1) &&
854 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
855 (xmlStrEqual(newtag, BAD_CAST"style")) ||
856 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
857 (xmlStrEqual(newtag, BAD_CAST"link")) ||
858 (xmlStrEqual(newtag, BAD_CAST"title")) ||
859 (xmlStrEqual(newtag, BAD_CAST"base")))) {
860 /*
861 * dropped OBJECT ... i you put it first BODY will be
862 * assumed !
863 */
864#ifdef DEBUG
865 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
866#endif
867 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
868 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
869 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
870 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
871 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
872 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
873 int i;
874 for (i = 0;i < ctxt->nameNr;i++) {
875 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
876 return;
877 }
878 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
879 return;
880 }
881 }
882
883#ifdef DEBUG
884 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
885#endif
886 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
887 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
888 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
889 }
890}
891
892/**
893 * htmlCheckParagraph
894 * @ctxt: an HTML parser context
895 *
896 * Check whether a p element need to be implied before inserting
897 * characters in the current element.
898 *
899 * Returns 1 if a paragraph has been inserted, 0 if not and -1
900 * in case of error.
901 */
902
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000903static int
Owen Taylor3473f882001-02-23 17:55:21 +0000904htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
905 const xmlChar *tag;
906 int i;
907
908 if (ctxt == NULL)
909 return(-1);
910 tag = ctxt->name;
911 if (tag == NULL) {
912 htmlAutoClose(ctxt, BAD_CAST"p");
913 htmlCheckImplied(ctxt, BAD_CAST"p");
914 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
915 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
916 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
917 return(1);
918 }
919 if (!htmlOmittedDefaultValue)
920 return(0);
921 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
922 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
923#ifdef DEBUG
924 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
925#endif
926 htmlAutoClose(ctxt, BAD_CAST"p");
927 htmlCheckImplied(ctxt, BAD_CAST"p");
928 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
929 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
930 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
931 return(1);
932 }
933 }
934 return(0);
935}
936
937/**
938 * htmlIsScriptAttribute:
939 * @name: an attribute name
940 *
941 * Check if an attribute is of content type Script
942 *
943 * Returns 1 is the attribute is a script 0 otherwise
944 */
945int
946htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000947 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000948
949 if (name == NULL)
950 return(0);
951 /*
952 * all script attributes start with 'on'
953 */
954 if ((name[0] != 'o') || (name[1] != 'n'))
955 return(0);
956 for (i = 0;
957 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
958 i++) {
959 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
960 return(1);
961 }
962 return(0);
963}
964
965/************************************************************************
966 * *
967 * The list of HTML predefined entities *
968 * *
969 ************************************************************************/
970
971
/*
 * html40EntitiesTable:
 *
 * Table of the predefined HTML entities. Each row is an htmlEntityDesc
 * initialised with, in order: the numeric value of the character, the
 * entity name (without the surrounding '&' and ';'), and a free-text
 * description.
 *
 * htmlEntityLookup() scans this table linearly comparing names.
 * htmlEntityValueLookup() scans it in order and gives up as soon as it
 * meets a row whose value is greater than the one searched for, so the
 * rows MUST be kept sorted by strictly increasing value; a row inserted
 * out of order would never be found by value.
 */
972htmlEntityDesc html40EntitiesTable[] = {
973/*
974 * the 4 absolute ones, plus apostrophe.
975 */
976{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
977{ 38, "amp", "ampersand, U+0026 ISOnum" },
978{ 39, "apos", "single quote" },
979{ 60, "lt", "less-than sign, U+003C ISOnum" },
980{ 62, "gt", "greater-than sign, U+003E ISOnum" },
981
982/*
983 * A bunch still in the 128-255 range
984 * Replacing them depend really on the charset used.
985 */
986{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
987{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
988{ 162, "cent", "cent sign, U+00A2 ISOnum" },
989{ 163, "pound","pound sign, U+00A3 ISOnum" },
990{ 164, "curren","currency sign, U+00A4 ISOnum" },
991{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
992{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
993{ 167, "sect", "section sign, U+00A7 ISOnum" },
994{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
995{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
996{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
997{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
998{ 172, "not", "not sign, U+00AC ISOnum" },
999{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1000{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1001{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1002{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1003{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1004{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1005{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1006{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1007{ 181, "micro","micro sign, U+00B5 ISOnum" },
1008{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1009{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1010{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1011{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1012{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1013{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1014{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1015{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1016{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1017{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1018{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1019{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1020{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1021{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1022{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1023{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1024{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1025{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1026{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1027{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1028{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1029{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1030{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1031{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1032{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1033{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1034{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1035{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1036{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1037{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1038{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1039{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1040{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1041{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1042{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1043{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1044{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1045{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1046{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1047{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1048{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1049{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1050{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1051{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1052{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1053{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1054{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1055{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1056{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1057{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1058{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1059{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1060{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1061{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1062{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1063{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1064{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1065{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1066{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1067{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1068{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1069{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1070{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1071{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1072{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1073{ 247, "divide","division sign, U+00F7 ISOnum" },
1074{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1075{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1076{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1077{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1078{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1079{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1080{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1081{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1082
1083{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1084{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1085{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1086{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1087{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1088
1089/*
1090 * Anything below should really be kept as entities references
1091 */
1092{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1093
1094{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1095{ 732, "tilde","small tilde, U+02DC ISOdia" },
1096
1097{ 913, "Alpha","greek capital letter alpha, U+0391" },
1098{ 914, "Beta", "greek capital letter beta, U+0392" },
1099{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1100{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1101{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1102{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1103{ 919, "Eta", "greek capital letter eta, U+0397" },
1104{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1105{ 921, "Iota", "greek capital letter iota, U+0399" },
1106{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001107{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001108{ 924, "Mu", "greek capital letter mu, U+039C" },
1109{ 925, "Nu", "greek capital letter nu, U+039D" },
1110{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1111{ 927, "Omicron","greek capital letter omicron, U+039F" },
1112{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1113{ 929, "Rho", "greek capital letter rho, U+03A1" },
1114{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1115{ 932, "Tau", "greek capital letter tau, U+03A4" },
1116{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1117{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1118{ 935, "Chi", "greek capital letter chi, U+03A7" },
1119{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1120{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1121
1122{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1123{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1124{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1125{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1126{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1127{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1128{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1129{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1130{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1131{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1132{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1133{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1134{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1135{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1136{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1137{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1138{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1139{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1140{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1141{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1142{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1143{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1144{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1145{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1146{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1147{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1148{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1149{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1150
1151{ 8194, "ensp", "en space, U+2002 ISOpub" },
1152{ 8195, "emsp", "em space, U+2003 ISOpub" },
1153{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1154{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1155{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1156{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1157{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1158{ 8211, "ndash","en dash, U+2013 ISOpub" },
1159{ 8212, "mdash","em dash, U+2014 ISOpub" },
1160{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1161{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1162{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1163{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1164{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1165{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1166{ 8224, "dagger","dagger, U+2020 ISOpub" },
1167{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1168
1169{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1170{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1171
1172{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1173
1174{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1175{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1176
1177{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1178{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1179
1180{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1181{ 8260, "frasl","fraction slash, U+2044 NEW" },
1182
1183{ 8364, "euro", "euro sign, U+20AC NEW" },
1184
1185{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1186{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1187{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1188{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1189{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1190{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1191{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1192{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1193{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1194{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1195{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1196{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1197{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1198{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1199{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1200{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1201
1202{ 8704, "forall","for all, U+2200 ISOtech" },
1203{ 8706, "part", "partial differential, U+2202 ISOtech" },
1204{ 8707, "exist","there exists, U+2203 ISOtech" },
1205{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1206{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1207{ 8712, "isin", "element of, U+2208 ISOtech" },
1208{ 8713, "notin","not an element of, U+2209 ISOtech" },
1209{ 8715, "ni", "contains as member, U+220B ISOtech" },
1210{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1211{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1212{ 8722, "minus","minus sign, U+2212 ISOtech" },
1213{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1214{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1215{ 8733, "prop", "proportional to, U+221D ISOtech" },
1216{ 8734, "infin","infinity, U+221E ISOtech" },
1217{ 8736, "ang", "angle, U+2220 ISOamso" },
1218{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1219{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1220{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1221{ 8746, "cup", "union = cup, U+222A ISOtech" },
1222{ 8747, "int", "integral, U+222B ISOtech" },
1223{ 8756, "there4","therefore, U+2234 ISOtech" },
1224{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1225{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1226{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1227{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1228{ 8801, "equiv","identical to, U+2261 ISOtech" },
1229{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1230{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1231{ 8834, "sub", "subset of, U+2282 ISOtech" },
1232{ 8835, "sup", "superset of, U+2283 ISOtech" },
1233{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1234{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1235{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1236{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1237{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1238{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1239{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1240{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1241{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1242{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1243{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1244{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1245{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1246{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1247
1248{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1249{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1250{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1251{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1252
1253};
1254
1255/************************************************************************
1256 * *
1257 * Commodity functions to handle entities *
1258 * *
1259 ************************************************************************/
1260
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates buffer to hold that many xmlChar.
 * Requirements on the expansion site:
 *   - a variable named <buffer>_size must be in scope;
 *   - the enclosing function must return a pointer, because the macro
 *     executes return(NULL) when the reallocation fails.
 *
 * The result of xmlRealloc() is first stored in a temporary: assigning it
 * straight to buffer would lose the only reference to the old allocation
 * when xmlRealloc() fails, leaking it. On failure the old block is freed
 * and buffer is reset to NULL before returning.
 *
 * The expansion is kept as a plain braced block (rather than
 * do { } while (0)) so that existing expansion sites keep compiling
 * unchanged.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                 buffer##_size * sizeof(xmlChar));	\
    if (growBuffer_tmp == NULL) {					\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        buffer = NULL;							\
        return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1272
1273/**
1274 * htmlEntityLookup:
1275 * @name: the entity name
1276 *
1277 * Lookup the given entity in EntitiesTable
1278 *
1279 * TODO: the linear scan is really ugly, an hash table is really needed.
1280 *
1281 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1282 */
1283htmlEntityDescPtr
1284htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001285 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001286
1287 for (i = 0;i < (sizeof(html40EntitiesTable)/
1288 sizeof(html40EntitiesTable[0]));i++) {
1289 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1290#ifdef DEBUG
1291 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1292#endif
1293 return(&html40EntitiesTable[i]);
1294 }
1295 }
1296 return(NULL);
1297}
1298
1299/**
1300 * htmlEntityValueLookup:
1301 * @value: the entity's unicode value
1302 *
1303 * Lookup the given entity in EntitiesTable
1304 *
1305 * TODO: the linear scan is really ugly, an hash table is really needed.
1306 *
1307 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1308 */
1309htmlEntityDescPtr
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001310htmlEntityValueLookup(unsigned int value) {
1311 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001312#ifdef DEBUG
1313 int lv = 0;
1314#endif
1315
1316 for (i = 0;i < (sizeof(html40EntitiesTable)/
1317 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001318 if (html40EntitiesTable[i].value >= value) {
1319 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001320 break;
1321#ifdef DEBUG
1322 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1323#endif
1324 return(&html40EntitiesTable[i]);
1325 }
1326#ifdef DEBUG
1327 if (lv > html40EntitiesTable[i].value) {
1328 xmlGenericError(xmlGenericErrorContext,
1329 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1330 lv, html40EntitiesTable[i].value);
1331 }
1332 lv = html40EntitiesTable[i].value;
1333#endif
1334 }
1335 return(NULL);
1336}
1337
1338/**
1339 * UTF8ToHtml:
1340 * @out: a pointer to an array of bytes to store the result
1341 * @outlen: the length of @out
1342 * @in: a pointer to an array of UTF-8 chars
1343 * @inlen: the length of @in
1344 *
1345 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1346 * plus HTML entities block of chars out.
1347 *
1348 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1349 * The value of @inlen after return is the number of octets consumed
1350 * as the return value is positive, else unpredictiable.
1351 * The value of @outlen after return is the number of octets consumed.
1352 */
1353int
1354UTF8ToHtml(unsigned char* out, int *outlen,
1355 const unsigned char* in, int *inlen) {
1356 const unsigned char* processed = in;
1357 const unsigned char* outend;
1358 const unsigned char* outstart = out;
1359 const unsigned char* instart = in;
1360 const unsigned char* inend;
1361 unsigned int c, d;
1362 int trailing;
1363
1364 if (in == NULL) {
1365 /*
1366 * initialization nothing to do
1367 */
1368 *outlen = 0;
1369 *inlen = 0;
1370 return(0);
1371 }
1372 inend = in + (*inlen);
1373 outend = out + (*outlen);
1374 while (in < inend) {
1375 d = *in++;
1376 if (d < 0x80) { c= d; trailing= 0; }
1377 else if (d < 0xC0) {
1378 /* trailing byte in leading position */
1379 *outlen = out - outstart;
1380 *inlen = processed - instart;
1381 return(-2);
1382 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1383 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1384 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1385 else {
1386 /* no chance for this in Ascii */
1387 *outlen = out - outstart;
1388 *inlen = processed - instart;
1389 return(-2);
1390 }
1391
1392 if (inend - in < trailing) {
1393 break;
1394 }
1395
1396 for ( ; trailing; trailing--) {
1397 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1398 break;
1399 c <<= 6;
1400 c |= d & 0x3F;
1401 }
1402
1403 /* assertion: c is a single UTF-4 value */
1404 if (c < 0x80) {
1405 if (out + 1 >= outend)
1406 break;
1407 *out++ = c;
1408 } else {
1409 int len;
1410 htmlEntityDescPtr ent;
1411
1412 /*
1413 * Try to lookup a predefined HTML entity for it
1414 */
1415
1416 ent = htmlEntityValueLookup(c);
1417 if (ent == NULL) {
1418 /* no chance for this in Ascii */
1419 *outlen = out - outstart;
1420 *inlen = processed - instart;
1421 return(-2);
1422 }
1423 len = strlen(ent->name);
1424 if (out + 2 + len >= outend)
1425 break;
1426 *out++ = '&';
1427 memcpy(out, ent->name, len);
1428 out += len;
1429 *out++ = ';';
1430 }
1431 processed = in;
1432 }
1433 *outlen = out - outstart;
1434 *inlen = processed - instart;
1435 return(0);
1436}
1437
1438/**
1439 * htmlEncodeEntities:
1440 * @out: a pointer to an array of bytes to store the result
1441 * @outlen: the length of @out
1442 * @in: a pointer to an array of UTF-8 chars
1443 * @inlen: the length of @in
1444 * @quoteChar: the quote character to escape (' or ") or zero.
1445 *
1446 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1447 * plus HTML entities block of chars out.
1448 *
1449 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1450 * The value of @inlen after return is the number of octets consumed
1451 * as the return value is positive, else unpredictiable.
1452 * The value of @outlen after return is the number of octets consumed.
1453 */
1454int
1455htmlEncodeEntities(unsigned char* out, int *outlen,
1456 const unsigned char* in, int *inlen, int quoteChar) {
1457 const unsigned char* processed = in;
1458 const unsigned char* outend = out + (*outlen);
1459 const unsigned char* outstart = out;
1460 const unsigned char* instart = in;
1461 const unsigned char* inend = in + (*inlen);
1462 unsigned int c, d;
1463 int trailing;
1464
1465 while (in < inend) {
1466 d = *in++;
1467 if (d < 0x80) { c= d; trailing= 0; }
1468 else if (d < 0xC0) {
1469 /* trailing byte in leading position */
1470 *outlen = out - outstart;
1471 *inlen = processed - instart;
1472 return(-2);
1473 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1474 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1475 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1476 else {
1477 /* no chance for this in Ascii */
1478 *outlen = out - outstart;
1479 *inlen = processed - instart;
1480 return(-2);
1481 }
1482
1483 if (inend - in < trailing)
1484 break;
1485
1486 while (trailing--) {
1487 if (((d= *in++) & 0xC0) != 0x80) {
1488 *outlen = out - outstart;
1489 *inlen = processed - instart;
1490 return(-2);
1491 }
1492 c <<= 6;
1493 c |= d & 0x3F;
1494 }
1495
1496 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001497 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1498 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001499 if (out >= outend)
1500 break;
1501 *out++ = c;
1502 } else {
1503 htmlEntityDescPtr ent;
1504 const char *cp;
1505 char nbuf[16];
1506 int len;
1507
1508 /*
1509 * Try to lookup a predefined HTML entity for it
1510 */
1511 ent = htmlEntityValueLookup(c);
1512 if (ent == NULL) {
1513 sprintf(nbuf, "#%u", c);
1514 cp = nbuf;
1515 }
1516 else
1517 cp = ent->name;
1518 len = strlen(cp);
1519 if (out + 2 + len > outend)
1520 break;
1521 *out++ = '&';
1522 memcpy(out, cp, len);
1523 out += len;
1524 *out++ = ';';
1525 }
1526 processed = in;
1527 }
1528 *outlen = out - outstart;
1529 *inlen = processed - instart;
1530 return(0);
1531}
1532
1533/**
1534 * htmlDecodeEntities:
1535 * @ctxt: the parser context
1536 * @len: the len to decode (in bytes !), -1 for no size limit
1537 * @end: an end marker xmlChar, 0 if none
1538 * @end2: an end marker xmlChar, 0 if none
1539 * @end3: an end marker xmlChar, 0 if none
1540 *
1541 * Subtitute the HTML entities by their value
1542 *
1543 * DEPRECATED !!!!
1544 *
1545 * Returns A newly allocated string with the substitution done. The caller
1546 * must deallocate it !
1547 */
1548xmlChar *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001549htmlDecodeEntities(htmlParserCtxtPtr ctxt UNUSED, int len UNUSED,
1550 xmlChar end UNUSED, xmlChar end2 UNUSED, xmlChar end3 UNUSED) {
1551 static int deprecated = 0;
1552 if (!deprecated) {
1553 xmlGenericError(xmlGenericErrorContext,
1554 "htmlDecodeEntities() deprecated function reached\n");
1555 deprecated = 1;
1556 }
1557 return(NULL);
1558#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001559 xmlChar *name = NULL;
1560 xmlChar *buffer = NULL;
1561 unsigned int buffer_size = 0;
1562 unsigned int nbchars = 0;
1563 htmlEntityDescPtr ent;
1564 unsigned int max = (unsigned int) len;
1565 int c,l;
1566
1567 if (ctxt->depth > 40) {
1568 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1569 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1570 ctxt->sax->error(ctxt->userData,
1571 "Detected entity reference loop\n");
1572 ctxt->wellFormed = 0;
1573 ctxt->disableSAX = 1;
1574 return(NULL);
1575 }
1576
1577 /*
1578 * allocate a translation buffer.
1579 */
1580 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1581 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1582 if (buffer == NULL) {
1583 perror("xmlDecodeEntities: malloc failed");
1584 return(NULL);
1585 }
1586
1587 /*
1588 * Ok loop until we reach one of the ending char or a size limit.
1589 */
1590 c = CUR_CHAR(l);
1591 while ((nbchars < max) && (c != end) &&
1592 (c != end2) && (c != end3)) {
1593
1594 if (c == 0) break;
1595 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1596 int val = htmlParseCharRef(ctxt);
1597 COPY_BUF(0,buffer,nbchars,val);
1598 NEXTL(l);
1599 } else if ((c == '&') && (ctxt->token != '&')) {
1600 ent = htmlParseEntityRef(ctxt, &name);
1601 if (name != NULL) {
1602 if (ent != NULL) {
1603 int val = ent->value;
1604 COPY_BUF(0,buffer,nbchars,val);
1605 NEXTL(l);
1606 } else {
1607 const xmlChar *cur = name;
1608
1609 buffer[nbchars++] = '&';
1610 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1611 growBuffer(buffer);
1612 }
1613 while (*cur != 0) {
1614 buffer[nbchars++] = *cur++;
1615 }
1616 buffer[nbchars++] = ';';
1617 }
1618 }
1619 } else {
1620 COPY_BUF(l,buffer,nbchars,c);
1621 NEXTL(l);
1622 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1623 growBuffer(buffer);
1624 }
1625 }
1626 c = CUR_CHAR(l);
1627 }
1628 buffer[nbchars++] = 0;
1629 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001630#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001631}
1632
1633/************************************************************************
1634 * *
1635 * Commodity functions to handle streams *
1636 * *
1637 ************************************************************************/
1638
1639/**
Owen Taylor3473f882001-02-23 17:55:21 +00001640 * htmlNewInputStream:
1641 * @ctxt: an HTML parser context
1642 *
1643 * Create a new input stream structure
1644 * Returns the new input stream or NULL
1645 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001646static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001647htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1648 htmlParserInputPtr input;
1649
1650 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1651 if (input == NULL) {
1652 ctxt->errNo = XML_ERR_NO_MEMORY;
1653 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1654 ctxt->sax->error(ctxt->userData,
1655 "malloc: couldn't allocate a new input stream\n");
1656 return(NULL);
1657 }
1658 memset(input, 0, sizeof(htmlParserInput));
1659 input->filename = NULL;
1660 input->directory = NULL;
1661 input->base = NULL;
1662 input->cur = NULL;
1663 input->buf = NULL;
1664 input->line = 1;
1665 input->col = 1;
1666 input->buf = NULL;
1667 input->free = NULL;
1668 input->version = NULL;
1669 input->consumed = 0;
1670 input->length = 0;
1671 return(input);
1672}
1673
1674
1675/************************************************************************
1676 * *
1677 * Commodity functions, cleanup needed ? *
1678 * *
1679 ************************************************************************/
1680
1681/**
1682 * areBlanks:
1683 * @ctxt: an HTML parser context
1684 * @str: a xmlChar *
1685 * @len: the size of @str
1686 *
1687 * Is this a sequence of blank chars that one can ignore ?
1688 *
1689 * Returns 1 if ignorable 0 otherwise.
1690 */
1691
1692static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1693 int i;
1694 xmlNodePtr lastChild;
1695
1696 for (i = 0;i < len;i++)
1697 if (!(IS_BLANK(str[i]))) return(0);
1698
1699 if (CUR == 0) return(1);
1700 if (CUR != '<') return(0);
1701 if (ctxt->name == NULL)
1702 return(1);
1703 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1704 return(1);
1705 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1706 return(1);
1707 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1708 return(1);
1709 if (ctxt->node == NULL) return(0);
1710 lastChild = xmlGetLastChild(ctxt->node);
1711 if (lastChild == NULL) {
1712 if (ctxt->node->content != NULL) return(0);
1713 } else if (xmlNodeIsText(lastChild)) {
1714 return(0);
1715 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1716 return(0);
1717 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1718 return(0);
1719 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1720 return(0);
1721 }
1722 return(1);
1723}
1724
1725/**
Owen Taylor3473f882001-02-23 17:55:21 +00001726 * htmlNewDocNoDtD:
1727 * @URI: URI for the dtd, or NULL
1728 * @ExternalID: the external ID of the DTD, or NULL
1729 *
1730 * Returns a new document, do not intialize the DTD if not provided
1731 */
1732htmlDocPtr
1733htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1734 xmlDocPtr cur;
1735
1736 /*
1737 * Allocate a new document and fill the fields.
1738 */
1739 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1740 if (cur == NULL) {
1741 xmlGenericError(xmlGenericErrorContext,
1742 "xmlNewDoc : malloc failed\n");
1743 return(NULL);
1744 }
1745 memset(cur, 0, sizeof(xmlDoc));
1746
1747 cur->type = XML_HTML_DOCUMENT_NODE;
1748 cur->version = NULL;
1749 cur->intSubset = NULL;
1750 if ((ExternalID != NULL) ||
1751 (URI != NULL))
1752 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1753 cur->doc = cur;
1754 cur->name = NULL;
1755 cur->children = NULL;
1756 cur->extSubset = NULL;
1757 cur->oldNs = NULL;
1758 cur->encoding = NULL;
1759 cur->standalone = 1;
1760 cur->compression = 0;
1761 cur->ids = NULL;
1762 cur->refs = NULL;
1763#ifndef XML_WITHOUT_CORBA
1764 cur->_private = NULL;
1765#endif
1766 return(cur);
1767}
1768
1769/**
1770 * htmlNewDoc:
1771 * @URI: URI for the dtd, or NULL
1772 * @ExternalID: the external ID of the DTD, or NULL
1773 *
1774 * Returns a new document
1775 */
1776htmlDocPtr
1777htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1778 if ((URI == NULL) && (ExternalID == NULL))
1779 return(htmlNewDocNoDtD(
1780 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1781 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1782
1783 return(htmlNewDocNoDtD(URI, ExternalID));
1784}
1785
1786
1787/************************************************************************
1788 * *
1789 * The parser itself *
1790 * Relates to http://www.w3.org/TR/html40 *
1791 * *
1792 ************************************************************************/
1793
1794/************************************************************************
1795 * *
1796 * The parser itself *
1797 * *
1798 ************************************************************************/
1799
1800/**
1801 * htmlParseHTMLName:
1802 * @ctxt: an HTML parser context
1803 *
1804 * parse an HTML tag or attribute name, note that we convert it to lowercase
1805 * since HTML names are not case-sensitive.
1806 *
1807 * Returns the Tag Name parsed or NULL
1808 */
1809
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001810static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001811htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1812 xmlChar *ret = NULL;
1813 int i = 0;
1814 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1815
1816 if (!IS_LETTER(CUR) && (CUR != '_') &&
1817 (CUR != ':')) return(NULL);
1818
1819 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1820 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1821 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1822 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1823 else loc[i] = CUR;
1824 i++;
1825
1826 NEXT;
1827 }
1828
1829 ret = xmlStrndup(loc, i);
1830
1831 return(ret);
1832}
1833
1834/**
1835 * htmlParseName:
1836 * @ctxt: an HTML parser context
1837 *
1838 * parse an HTML name, this routine is case sensistive.
1839 *
1840 * Returns the Name parsed or NULL
1841 */
1842
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001843static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001844htmlParseName(htmlParserCtxtPtr ctxt) {
1845 xmlChar buf[HTML_MAX_NAMELEN];
1846 int len = 0;
1847
1848 GROW;
1849 if (!IS_LETTER(CUR) && (CUR != '_')) {
1850 return(NULL);
1851 }
1852
1853 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1854 (CUR == '.') || (CUR == '-') ||
1855 (CUR == '_') || (CUR == ':') ||
1856 (IS_COMBINING(CUR)) ||
1857 (IS_EXTENDER(CUR))) {
1858 buf[len++] = CUR;
1859 NEXT;
1860 if (len >= HTML_MAX_NAMELEN) {
1861 xmlGenericError(xmlGenericErrorContext,
1862 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1863 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1864 (CUR == '.') || (CUR == '-') ||
1865 (CUR == '_') || (CUR == ':') ||
1866 (IS_COMBINING(CUR)) ||
1867 (IS_EXTENDER(CUR)))
1868 NEXT;
1869 break;
1870 }
1871 }
1872 return(xmlStrndup(buf, len));
1873}
1874
1875/**
1876 * htmlParseHTMLAttribute:
1877 * @ctxt: an HTML parser context
1878 * @stop: a char stop value
1879 *
1880 * parse an HTML attribute value till the stop (quote), if
1881 * stop is 0 then it stops at the first space
1882 *
1883 * Returns the attribute parsed or NULL
1884 */
1885
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001886static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001887htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1888 xmlChar *buffer = NULL;
1889 int buffer_size = 0;
1890 xmlChar *out = NULL;
1891 xmlChar *name = NULL;
1892
1893 xmlChar *cur = NULL;
1894 htmlEntityDescPtr ent;
1895
1896 /*
1897 * allocate a translation buffer.
1898 */
1899 buffer_size = HTML_PARSER_BUFFER_SIZE;
1900 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1901 if (buffer == NULL) {
1902 perror("htmlParseHTMLAttribute: malloc failed");
1903 return(NULL);
1904 }
1905 out = buffer;
1906
1907 /*
1908 * Ok loop until we reach one of the ending chars
1909 */
1910 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1911 if ((stop == 0) && (IS_BLANK(CUR))) break;
1912 if (CUR == '&') {
1913 if (NXT(1) == '#') {
1914 unsigned int c;
1915 int bits;
1916
1917 c = htmlParseCharRef(ctxt);
1918 if (c < 0x80)
1919 { *out++ = c; bits= -6; }
1920 else if (c < 0x800)
1921 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1922 else if (c < 0x10000)
1923 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1924 else
1925 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1926
1927 for ( ; bits >= 0; bits-= 6) {
1928 *out++ = ((c >> bits) & 0x3F) | 0x80;
1929 }
1930 } else {
1931 ent = htmlParseEntityRef(ctxt, &name);
1932 if (name == NULL) {
1933 *out++ = '&';
1934 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001935 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001936
1937 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001938 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001939 }
1940 } else if (ent == NULL) {
1941 *out++ = '&';
1942 cur = name;
1943 while (*cur != 0) {
1944 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001945 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001946
1947 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001948 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001949 }
1950 *out++ = *cur++;
1951 }
1952 xmlFree(name);
1953 } else {
1954 unsigned int c;
1955 int bits;
1956
1957 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001958 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001959
1960 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001961 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001962 }
1963 c = (xmlChar)ent->value;
1964 if (c < 0x80)
1965 { *out++ = c; bits= -6; }
1966 else if (c < 0x800)
1967 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1968 else if (c < 0x10000)
1969 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1970 else
1971 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1972
1973 for ( ; bits >= 0; bits-= 6) {
1974 *out++ = ((c >> bits) & 0x3F) | 0x80;
1975 }
1976 xmlFree(name);
1977 }
1978 }
1979 } else {
1980 unsigned int c;
1981 int bits, l;
1982
1983 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001985
1986 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001987 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001988 }
1989 c = CUR_CHAR(l);
1990 if (c < 0x80)
1991 { *out++ = c; bits= -6; }
1992 else if (c < 0x800)
1993 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1994 else if (c < 0x10000)
1995 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1996 else
1997 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1998
1999 for ( ; bits >= 0; bits-= 6) {
2000 *out++ = ((c >> bits) & 0x3F) | 0x80;
2001 }
2002 NEXT;
2003 }
2004 }
2005 *out++ = 0;
2006 return(buffer);
2007}
2008
2009/**
Owen Taylor3473f882001-02-23 17:55:21 +00002010 * htmlParseEntityRef:
2011 * @ctxt: an HTML parser context
2012 * @str: location to store the entity name
2013 *
2014 * parse an HTML ENTITY references
2015 *
2016 * [68] EntityRef ::= '&' Name ';'
2017 *
2018 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2019 * if non-NULL *str will have to be freed by the caller.
2020 */
2021htmlEntityDescPtr
2022htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2023 xmlChar *name;
2024 htmlEntityDescPtr ent = NULL;
2025 *str = NULL;
2026
2027 if (CUR == '&') {
2028 NEXT;
2029 name = htmlParseName(ctxt);
2030 if (name == NULL) {
2031 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2032 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2033 ctxt->wellFormed = 0;
2034 } else {
2035 GROW;
2036 if (CUR == ';') {
2037 *str = name;
2038
2039 /*
2040 * Lookup the entity in the table.
2041 */
2042 ent = htmlEntityLookup(name);
2043 if (ent != NULL) /* OK that's ugly !!! */
2044 NEXT;
2045 } else {
2046 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2047 ctxt->sax->error(ctxt->userData,
2048 "htmlParseEntityRef: expecting ';'\n");
2049 *str = name;
2050 }
2051 }
2052 }
2053 return(ent);
2054}
2055
2056/**
2057 * htmlParseAttValue:
2058 * @ctxt: an HTML parser context
2059 *
2060 * parse a value for an attribute
2061 * Note: the parser won't do substitution of entities here, this
2062 * will be handled later in xmlStringGetNodeList, unless it was
2063 * asked for ctxt->replaceEntities != 0
2064 *
2065 * Returns the AttValue parsed or NULL.
2066 */
2067
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002068static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002069htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2070 xmlChar *ret = NULL;
2071
2072 if (CUR == '"') {
2073 NEXT;
2074 ret = htmlParseHTMLAttribute(ctxt, '"');
2075 if (CUR != '"') {
2076 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2077 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2078 ctxt->wellFormed = 0;
2079 } else
2080 NEXT;
2081 } else if (CUR == '\'') {
2082 NEXT;
2083 ret = htmlParseHTMLAttribute(ctxt, '\'');
2084 if (CUR != '\'') {
2085 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2086 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2087 ctxt->wellFormed = 0;
2088 } else
2089 NEXT;
2090 } else {
2091 /*
2092 * That's an HTMLism, the attribute value may not be quoted
2093 */
2094 ret = htmlParseHTMLAttribute(ctxt, 0);
2095 if (ret == NULL) {
2096 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2097 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2098 ctxt->wellFormed = 0;
2099 }
2100 }
2101 return(ret);
2102}
2103
2104/**
2105 * htmlParseSystemLiteral:
2106 * @ctxt: an HTML parser context
2107 *
2108 * parse an HTML Literal
2109 *
2110 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2111 *
2112 * Returns the SystemLiteral parsed or NULL
2113 */
2114
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002115static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002116htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2117 const xmlChar *q;
2118 xmlChar *ret = NULL;
2119
2120 if (CUR == '"') {
2121 NEXT;
2122 q = CUR_PTR;
2123 while ((IS_CHAR(CUR)) && (CUR != '"'))
2124 NEXT;
2125 if (!IS_CHAR(CUR)) {
2126 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2127 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2128 ctxt->wellFormed = 0;
2129 } else {
2130 ret = xmlStrndup(q, CUR_PTR - q);
2131 NEXT;
2132 }
2133 } else if (CUR == '\'') {
2134 NEXT;
2135 q = CUR_PTR;
2136 while ((IS_CHAR(CUR)) && (CUR != '\''))
2137 NEXT;
2138 if (!IS_CHAR(CUR)) {
2139 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2140 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2141 ctxt->wellFormed = 0;
2142 } else {
2143 ret = xmlStrndup(q, CUR_PTR - q);
2144 NEXT;
2145 }
2146 } else {
2147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2148 ctxt->sax->error(ctxt->userData,
2149 "SystemLiteral \" or ' expected\n");
2150 ctxt->wellFormed = 0;
2151 }
2152
2153 return(ret);
2154}
2155
2156/**
2157 * htmlParsePubidLiteral:
2158 * @ctxt: an HTML parser context
2159 *
2160 * parse an HTML public literal
2161 *
2162 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2163 *
2164 * Returns the PubidLiteral parsed or NULL.
2165 */
2166
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002167static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002168htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2169 const xmlChar *q;
2170 xmlChar *ret = NULL;
2171 /*
2172 * Name ::= (Letter | '_') (NameChar)*
2173 */
2174 if (CUR == '"') {
2175 NEXT;
2176 q = CUR_PTR;
2177 while (IS_PUBIDCHAR(CUR)) NEXT;
2178 if (CUR != '"') {
2179 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2180 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2181 ctxt->wellFormed = 0;
2182 } else {
2183 ret = xmlStrndup(q, CUR_PTR - q);
2184 NEXT;
2185 }
2186 } else if (CUR == '\'') {
2187 NEXT;
2188 q = CUR_PTR;
2189 while ((IS_LETTER(CUR)) && (CUR != '\''))
2190 NEXT;
2191 if (!IS_LETTER(CUR)) {
2192 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2193 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2194 ctxt->wellFormed = 0;
2195 } else {
2196 ret = xmlStrndup(q, CUR_PTR - q);
2197 NEXT;
2198 }
2199 } else {
2200 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2201 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2202 ctxt->wellFormed = 0;
2203 }
2204
2205 return(ret);
2206}
2207
2208/**
2209 * htmlParseScript:
2210 * @ctxt: an HTML parser context
2211 *
2212 * parse the content of an HTML SCRIPT or STYLE element
2213 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2214 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2215 * http://www.w3.org/TR/html4/types.html#type-script
2216 * http://www.w3.org/TR/html4/types.html#h-6.15
2217 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2218 *
2219 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2220 * element and the value of intrinsic event attributes. User agents must
2221 * not evaluate script data as HTML markup but instead must pass it on as
2222 * data to a script engine.
2223 * NOTES:
2224 * - The content is passed like CDATA
2225 * - the attributes for style and scripting "onXXX" are also described
2226 * as CDATA but SGML allows entities references in attributes so their
2227 * processing is identical as other attributes
2228 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002229static void
Owen Taylor3473f882001-02-23 17:55:21 +00002230htmlParseScript(htmlParserCtxtPtr ctxt) {
2231 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2232 int nbchar = 0;
2233 xmlChar cur;
2234
2235 SHRINK;
2236 cur = CUR;
2237 while (IS_CHAR(cur)) {
2238 if ((cur == '<') && (NXT(1) == '/')) {
2239 /*
2240 * One should break here, the specification is clear:
2241 * Authors should therefore escape "</" within the content.
2242 * Escape mechanisms are specific to each scripting or
2243 * style sheet language.
2244 */
2245 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2246 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2247 break; /* while */
2248 }
2249 buf[nbchar++] = cur;
2250 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2251 if (ctxt->sax->cdataBlock!= NULL) {
2252 /*
2253 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2254 */
2255 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2256 }
2257 nbchar = 0;
2258 }
2259 NEXT;
2260 cur = CUR;
2261 }
2262 if (!(IS_CHAR(cur))) {
2263 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2264 ctxt->sax->error(ctxt->userData,
2265 "Invalid char in CDATA 0x%X\n", cur);
2266 ctxt->wellFormed = 0;
2267 NEXT;
2268 }
2269
2270 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2271 if (ctxt->sax->cdataBlock!= NULL) {
2272 /*
2273 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2274 */
2275 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2276 }
2277 }
2278}
2279
2280
2281/**
2282 * htmlParseCharData:
2283 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002284 *
2285 * parse a CharData section.
2286 * if we are within a CDATA section ']]>' marks an end of section.
2287 *
2288 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2289 */
2290
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002291static void
2292htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002293 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2294 int nbchar = 0;
2295 int cur, l;
2296
2297 SHRINK;
2298 cur = CUR_CHAR(l);
2299 while (((cur != '<') || (ctxt->token == '<')) &&
2300 ((cur != '&') || (ctxt->token == '&')) &&
2301 (IS_CHAR(cur))) {
2302 COPY_BUF(l,buf,nbchar,cur);
2303 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2304 /*
2305 * Ok the segment is to be consumed as chars.
2306 */
2307 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2308 if (areBlanks(ctxt, buf, nbchar)) {
2309 if (ctxt->sax->ignorableWhitespace != NULL)
2310 ctxt->sax->ignorableWhitespace(ctxt->userData,
2311 buf, nbchar);
2312 } else {
2313 htmlCheckParagraph(ctxt);
2314 if (ctxt->sax->characters != NULL)
2315 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2316 }
2317 }
2318 nbchar = 0;
2319 }
2320 NEXTL(l);
2321 cur = CUR_CHAR(l);
2322 }
2323 if (nbchar != 0) {
2324 /*
2325 * Ok the segment is to be consumed as chars.
2326 */
2327 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2328 if (areBlanks(ctxt, buf, nbchar)) {
2329 if (ctxt->sax->ignorableWhitespace != NULL)
2330 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2331 } else {
2332 htmlCheckParagraph(ctxt);
2333 if (ctxt->sax->characters != NULL)
2334 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2335 }
2336 }
2337 }
2338}
2339
2340/**
2341 * htmlParseExternalID:
2342 * @ctxt: an HTML parser context
2343 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002344 *
2345 * Parse an External ID or a Public ID
2346 *
Owen Taylor3473f882001-02-23 17:55:21 +00002347 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2348 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2349 *
2350 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2351 *
2352 * Returns the function returns SystemLiteral and in the second
2353 * case publicID receives PubidLiteral, is strict is off
2354 * it is possible to return NULL and have publicID set.
2355 */
2356
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002357static xmlChar *
2358htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002359 xmlChar *URI = NULL;
2360
2361 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2362 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2363 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2364 SKIP(6);
2365 if (!IS_BLANK(CUR)) {
2366 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2367 ctxt->sax->error(ctxt->userData,
2368 "Space required after 'SYSTEM'\n");
2369 ctxt->wellFormed = 0;
2370 }
2371 SKIP_BLANKS;
2372 URI = htmlParseSystemLiteral(ctxt);
2373 if (URI == NULL) {
2374 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2375 ctxt->sax->error(ctxt->userData,
2376 "htmlParseExternalID: SYSTEM, no URI\n");
2377 ctxt->wellFormed = 0;
2378 }
2379 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2380 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2381 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2382 SKIP(6);
2383 if (!IS_BLANK(CUR)) {
2384 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2385 ctxt->sax->error(ctxt->userData,
2386 "Space required after 'PUBLIC'\n");
2387 ctxt->wellFormed = 0;
2388 }
2389 SKIP_BLANKS;
2390 *publicID = htmlParsePubidLiteral(ctxt);
2391 if (*publicID == NULL) {
2392 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2393 ctxt->sax->error(ctxt->userData,
2394 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2395 ctxt->wellFormed = 0;
2396 }
2397 SKIP_BLANKS;
2398 if ((CUR == '"') || (CUR == '\'')) {
2399 URI = htmlParseSystemLiteral(ctxt);
2400 }
2401 }
2402 return(URI);
2403}
2404
2405/**
2406 * htmlParseComment:
2407 * @ctxt: an HTML parser context
2408 *
2409 * Parse an XML (SGML) comment <!-- .... -->
2410 *
2411 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2412 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002413static void
Owen Taylor3473f882001-02-23 17:55:21 +00002414htmlParseComment(htmlParserCtxtPtr ctxt) {
2415 xmlChar *buf = NULL;
2416 int len;
2417 int size = HTML_PARSER_BUFFER_SIZE;
2418 int q, ql;
2419 int r, rl;
2420 int cur, l;
2421 xmlParserInputState state;
2422
2423 /*
2424 * Check that there is a comment right here.
2425 */
2426 if ((RAW != '<') || (NXT(1) != '!') ||
2427 (NXT(2) != '-') || (NXT(3) != '-')) return;
2428
2429 state = ctxt->instate;
2430 ctxt->instate = XML_PARSER_COMMENT;
2431 SHRINK;
2432 SKIP(4);
2433 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2434 if (buf == NULL) {
2435 xmlGenericError(xmlGenericErrorContext,
2436 "malloc of %d byte failed\n", size);
2437 ctxt->instate = state;
2438 return;
2439 }
2440 q = CUR_CHAR(ql);
2441 NEXTL(ql);
2442 r = CUR_CHAR(rl);
2443 NEXTL(rl);
2444 cur = CUR_CHAR(l);
2445 len = 0;
2446 while (IS_CHAR(cur) &&
2447 ((cur != '>') ||
2448 (r != '-') || (q != '-'))) {
2449 if (len + 5 >= size) {
2450 size *= 2;
2451 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2452 if (buf == NULL) {
2453 xmlGenericError(xmlGenericErrorContext,
2454 "realloc of %d byte failed\n", size);
2455 ctxt->instate = state;
2456 return;
2457 }
2458 }
2459 COPY_BUF(ql,buf,len,q);
2460 q = r;
2461 ql = rl;
2462 r = cur;
2463 rl = l;
2464 NEXTL(l);
2465 cur = CUR_CHAR(l);
2466 if (cur == 0) {
2467 SHRINK;
2468 GROW;
2469 cur = CUR_CHAR(l);
2470 }
2471 }
2472 buf[len] = 0;
2473 if (!IS_CHAR(cur)) {
2474 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2475 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2476 ctxt->sax->error(ctxt->userData,
2477 "Comment not terminated \n<!--%.50s\n", buf);
2478 ctxt->wellFormed = 0;
2479 xmlFree(buf);
2480 } else {
2481 NEXT;
2482 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2483 (!ctxt->disableSAX))
2484 ctxt->sax->comment(ctxt->userData, buf);
2485 xmlFree(buf);
2486 }
2487 ctxt->instate = state;
2488}
2489
2490/**
2491 * htmlParseCharRef:
2492 * @ctxt: an HTML parser context
2493 *
2494 * parse Reference declarations
2495 *
2496 * [66] CharRef ::= '&#' [0-9]+ ';' |
2497 * '&#x' [0-9a-fA-F]+ ';'
2498 *
2499 * Returns the value parsed (as an int)
2500 */
2501int
2502htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2503 int val = 0;
2504
2505 if ((CUR == '&') && (NXT(1) == '#') &&
2506 (NXT(2) == 'x')) {
2507 SKIP(3);
2508 while (CUR != ';') {
2509 if ((CUR >= '0') && (CUR <= '9'))
2510 val = val * 16 + (CUR - '0');
2511 else if ((CUR >= 'a') && (CUR <= 'f'))
2512 val = val * 16 + (CUR - 'a') + 10;
2513 else if ((CUR >= 'A') && (CUR <= 'F'))
2514 val = val * 16 + (CUR - 'A') + 10;
2515 else {
2516 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2517 ctxt->sax->error(ctxt->userData,
2518 "htmlParseCharRef: invalid hexadecimal value\n");
2519 ctxt->wellFormed = 0;
2520 return(0);
2521 }
2522 NEXT;
2523 }
2524 if (CUR == ';')
2525 NEXT;
2526 } else if ((CUR == '&') && (NXT(1) == '#')) {
2527 SKIP(2);
2528 while (CUR != ';') {
2529 if ((CUR >= '0') && (CUR <= '9'))
2530 val = val * 10 + (CUR - '0');
2531 else {
2532 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2533 ctxt->sax->error(ctxt->userData,
2534 "htmlParseCharRef: invalid decimal value\n");
2535 ctxt->wellFormed = 0;
2536 return(0);
2537 }
2538 NEXT;
2539 }
2540 if (CUR == ';')
2541 NEXT;
2542 } else {
2543 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2544 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2545 ctxt->wellFormed = 0;
2546 }
2547 /*
2548 * Check the value IS_CHAR ...
2549 */
2550 if (IS_CHAR(val)) {
2551 return(val);
2552 } else {
2553 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2554 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2555 val);
2556 ctxt->wellFormed = 0;
2557 }
2558 return(0);
2559}
2560
2561
2562/**
2563 * htmlParseDocTypeDecl :
2564 * @ctxt: an HTML parser context
2565 *
2566 * parse a DOCTYPE declaration
2567 *
2568 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2569 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2570 */
2571
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002572static void
Owen Taylor3473f882001-02-23 17:55:21 +00002573htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2574 xmlChar *name;
2575 xmlChar *ExternalID = NULL;
2576 xmlChar *URI = NULL;
2577
2578 /*
2579 * We know that '<!DOCTYPE' has been detected.
2580 */
2581 SKIP(9);
2582
2583 SKIP_BLANKS;
2584
2585 /*
2586 * Parse the DOCTYPE name.
2587 */
2588 name = htmlParseName(ctxt);
2589 if (name == NULL) {
2590 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2591 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2592 ctxt->wellFormed = 0;
2593 }
2594 /*
2595 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2596 */
2597
2598 SKIP_BLANKS;
2599
2600 /*
2601 * Check for SystemID and ExternalID
2602 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002603 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002604 SKIP_BLANKS;
2605
2606 /*
2607 * We should be at the end of the DOCTYPE declaration.
2608 */
2609 if (CUR != '>') {
2610 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2611 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2612 ctxt->wellFormed = 0;
2613 /* We shouldn't try to resynchronize ... */
2614 }
2615 NEXT;
2616
2617 /*
2618 * Create or update the document accordingly to the DOCTYPE
2619 */
2620 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2621 (!ctxt->disableSAX))
2622 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2623
2624 /*
2625 * Cleanup, since we don't use all those identifiers
2626 */
2627 if (URI != NULL) xmlFree(URI);
2628 if (ExternalID != NULL) xmlFree(ExternalID);
2629 if (name != NULL) xmlFree(name);
2630}
2631
2632/**
2633 * htmlParseAttribute:
2634 * @ctxt: an HTML parser context
2635 * @value: a xmlChar ** used to store the value of the attribute
2636 *
2637 * parse an attribute
2638 *
2639 * [41] Attribute ::= Name Eq AttValue
2640 *
2641 * [25] Eq ::= S? '=' S?
2642 *
2643 * With namespace:
2644 *
2645 * [NS 11] Attribute ::= QName Eq AttValue
2646 *
2647 * Also the case QName == xmlns:??? is handled independently as a namespace
2648 * definition.
2649 *
2650 * Returns the attribute name, and the value in *value.
2651 */
2652
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002653static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002654htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2655 xmlChar *name, *val = NULL;
2656
2657 *value = NULL;
2658 name = htmlParseHTMLName(ctxt);
2659 if (name == NULL) {
2660 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2661 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2662 ctxt->wellFormed = 0;
2663 return(NULL);
2664 }
2665
2666 /*
2667 * read the value
2668 */
2669 SKIP_BLANKS;
2670 if (CUR == '=') {
2671 NEXT;
2672 SKIP_BLANKS;
2673 val = htmlParseAttValue(ctxt);
2674 /******
2675 } else {
2676 * TODO : some attribute must have values, some may not
2677 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2678 ctxt->sax->warning(ctxt->userData,
2679 "No value for attribute %s\n", name); */
2680 }
2681
2682 *value = val;
2683 return(name);
2684}
2685
2686/**
2687 * htmlCheckEncoding:
2688 * @ctxt: an HTML parser context
2689 * @attvalue: the attribute value
2690 *
2691 * Checks an http-equiv attribute from a Meta tag to detect
2692 * the encoding
2693 * If a new encoding is detected the parser is switched to decode
2694 * it and pass UTF8
2695 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002696static void
Owen Taylor3473f882001-02-23 17:55:21 +00002697htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2698 const xmlChar *encoding;
2699
2700 if ((ctxt == NULL) || (attvalue == NULL))
2701 return;
2702
2703 /* do not change encoding */
2704 if (ctxt->input->encoding != NULL)
2705 return;
2706
2707 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2708 if (encoding != NULL) {
2709 encoding += 8;
2710 } else {
2711 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2712 if (encoding != NULL)
2713 encoding += 9;
2714 }
2715 if (encoding != NULL) {
2716 xmlCharEncoding enc;
2717 xmlCharEncodingHandlerPtr handler;
2718
2719 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2720
2721 if (ctxt->input->encoding != NULL)
2722 xmlFree((xmlChar *) ctxt->input->encoding);
2723 ctxt->input->encoding = xmlStrdup(encoding);
2724
2725 enc = xmlParseCharEncoding((const char *) encoding);
2726 /*
2727 * registered set of known encodings
2728 */
2729 if (enc != XML_CHAR_ENCODING_ERROR) {
2730 xmlSwitchEncoding(ctxt, enc);
2731 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2732 } else {
2733 /*
2734 * fallback for unknown encodings
2735 */
2736 handler = xmlFindCharEncodingHandler((const char *) encoding);
2737 if (handler != NULL) {
2738 xmlSwitchToEncoding(ctxt, handler);
2739 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2740 } else {
2741 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2742 }
2743 }
2744
2745 if ((ctxt->input->buf != NULL) &&
2746 (ctxt->input->buf->encoder != NULL) &&
2747 (ctxt->input->buf->raw != NULL) &&
2748 (ctxt->input->buf->buffer != NULL)) {
2749 int nbchars;
2750 int processed;
2751
2752 /*
2753 * convert as much as possible to the parser reading buffer.
2754 */
2755 processed = ctxt->input->cur - ctxt->input->base;
2756 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2757 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2758 ctxt->input->buf->buffer,
2759 ctxt->input->buf->raw);
2760 if (nbchars < 0) {
2761 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2762 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2763 ctxt->sax->error(ctxt->userData,
2764 "htmlCheckEncoding: encoder error\n");
2765 }
2766 ctxt->input->base =
2767 ctxt->input->cur = ctxt->input->buf->buffer->content;
2768 }
2769 }
2770}
2771
2772/**
2773 * htmlCheckMeta:
2774 * @ctxt: an HTML parser context
2775 * @atts: the attributes values
2776 *
2777 * Checks an attributes from a Meta tag
2778 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002779static void
Owen Taylor3473f882001-02-23 17:55:21 +00002780htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2781 int i;
2782 const xmlChar *att, *value;
2783 int http = 0;
2784 const xmlChar *content = NULL;
2785
2786 if ((ctxt == NULL) || (atts == NULL))
2787 return;
2788
2789 i = 0;
2790 att = atts[i++];
2791 while (att != NULL) {
2792 value = atts[i++];
2793 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2794 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2795 http = 1;
2796 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2797 content = value;
2798 att = atts[i++];
2799 }
2800 if ((http) && (content != NULL))
2801 htmlCheckEncoding(ctxt, content);
2802
2803}
2804
2805/**
2806 * htmlParseStartTag:
2807 * @ctxt: an HTML parser context
2808 *
2809 * parse a start of tag either for rule element or
2810 * EmptyElement. In both case we don't parse the tag closing chars.
2811 *
2812 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2813 *
2814 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2815 *
2816 * With namespace:
2817 *
2818 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2819 *
2820 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2821 *
2822 */
2823
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002824static void
Owen Taylor3473f882001-02-23 17:55:21 +00002825htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2826 xmlChar *name;
2827 xmlChar *attname;
2828 xmlChar *attvalue;
2829 const xmlChar **atts = NULL;
2830 int nbatts = 0;
2831 int maxatts = 0;
2832 int meta = 0;
2833 int i;
2834
2835 if (CUR != '<') return;
2836 NEXT;
2837
2838 GROW;
2839 name = htmlParseHTMLName(ctxt);
2840 if (name == NULL) {
2841 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2842 ctxt->sax->error(ctxt->userData,
2843 "htmlParseStartTag: invalid element name\n");
2844 ctxt->wellFormed = 0;
2845 /* Dump the bogus tag like browsers do */
2846 while ((IS_CHAR(CUR)) && (CUR != '>'))
2847 NEXT;
2848 return;
2849 }
2850 if (xmlStrEqual(name, BAD_CAST"meta"))
2851 meta = 1;
2852
2853 /*
2854 * Check for auto-closure of HTML elements.
2855 */
2856 htmlAutoClose(ctxt, name);
2857
2858 /*
2859 * Check for implied HTML elements.
2860 */
2861 htmlCheckImplied(ctxt, name);
2862
2863 /*
2864 * Avoid html at any level > 0, head at any level != 1
2865 * or any attempt to recurse body
2866 */
2867 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2868 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2869 ctxt->sax->error(ctxt->userData,
2870 "htmlParseStartTag: misplaced <html> tag\n");
2871 ctxt->wellFormed = 0;
2872 xmlFree(name);
2873 return;
2874 }
2875 if ((ctxt->nameNr != 1) &&
2876 (xmlStrEqual(name, BAD_CAST"head"))) {
2877 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2878 ctxt->sax->error(ctxt->userData,
2879 "htmlParseStartTag: misplaced <head> tag\n");
2880 ctxt->wellFormed = 0;
2881 xmlFree(name);
2882 return;
2883 }
2884 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002885 int indx;
2886 for (indx = 0;indx < ctxt->nameNr;indx++) {
2887 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002888 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2889 ctxt->sax->error(ctxt->userData,
2890 "htmlParseStartTag: misplaced <body> tag\n");
2891 ctxt->wellFormed = 0;
2892 xmlFree(name);
2893 return;
2894 }
2895 }
2896 }
2897
2898 /*
2899 * Now parse the attributes, it ends up with the ending
2900 *
2901 * (S Attribute)* S?
2902 */
2903 SKIP_BLANKS;
2904 while ((IS_CHAR(CUR)) &&
2905 (CUR != '>') &&
2906 ((CUR != '/') || (NXT(1) != '>'))) {
2907 long cons = ctxt->nbChars;
2908
2909 GROW;
2910 attname = htmlParseAttribute(ctxt, &attvalue);
2911 if (attname != NULL) {
2912
2913 /*
2914 * Well formedness requires at most one declaration of an attribute
2915 */
2916 for (i = 0; i < nbatts;i += 2) {
2917 if (xmlStrEqual(atts[i], attname)) {
2918 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2919 ctxt->sax->error(ctxt->userData,
2920 "Attribute %s redefined\n",
2921 attname);
2922 ctxt->wellFormed = 0;
2923 xmlFree(attname);
2924 if (attvalue != NULL)
2925 xmlFree(attvalue);
2926 goto failed;
2927 }
2928 }
2929
2930 /*
2931 * Add the pair to atts
2932 */
2933 if (atts == NULL) {
2934 maxatts = 10;
2935 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2936 if (atts == NULL) {
2937 xmlGenericError(xmlGenericErrorContext,
2938 "malloc of %ld byte failed\n",
2939 maxatts * (long)sizeof(xmlChar *));
2940 if (name != NULL) xmlFree(name);
2941 return;
2942 }
2943 } else if (nbatts + 4 > maxatts) {
2944 maxatts *= 2;
2945 atts = (const xmlChar **) xmlRealloc((void *) atts,
2946 maxatts * sizeof(xmlChar *));
2947 if (atts == NULL) {
2948 xmlGenericError(xmlGenericErrorContext,
2949 "realloc of %ld byte failed\n",
2950 maxatts * (long)sizeof(xmlChar *));
2951 if (name != NULL) xmlFree(name);
2952 return;
2953 }
2954 }
2955 atts[nbatts++] = attname;
2956 atts[nbatts++] = attvalue;
2957 atts[nbatts] = NULL;
2958 atts[nbatts + 1] = NULL;
2959 }
2960 else {
2961 /* Dump the bogus attribute string up to the next blank or
2962 * the end of the tag. */
2963 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
2964 && ((CUR != '/') || (NXT(1) != '>')))
2965 NEXT;
2966 }
2967
2968failed:
2969 SKIP_BLANKS;
2970 if (cons == ctxt->nbChars) {
2971 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2972 ctxt->sax->error(ctxt->userData,
2973 "htmlParseStartTag: problem parsing attributes\n");
2974 ctxt->wellFormed = 0;
2975 break;
2976 }
2977 }
2978
2979 /*
2980 * Handle specific association to the META tag
2981 */
2982 if (meta)
2983 htmlCheckMeta(ctxt, atts);
2984
2985 /*
2986 * SAX: Start of Element !
2987 */
2988 htmlnamePush(ctxt, xmlStrdup(name));
2989#ifdef DEBUG
2990 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
2991#endif
2992 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2993 ctxt->sax->startElement(ctxt->userData, name, atts);
2994
2995 if (atts != NULL) {
2996 for (i = 0;i < nbatts;i++) {
2997 if (atts[i] != NULL)
2998 xmlFree((xmlChar *) atts[i]);
2999 }
3000 xmlFree((void *) atts);
3001 }
3002 if (name != NULL) xmlFree(name);
3003}
3004
3005/**
3006 * htmlParseEndTag:
3007 * @ctxt: an HTML parser context
3008 *
3009 * parse an end of tag
3010 *
3011 * [42] ETag ::= '</' Name S? '>'
3012 *
3013 * With namespace
3014 *
3015 * [NS 9] ETag ::= '</' QName S? '>'
3016 */
3017
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003018static void
Owen Taylor3473f882001-02-23 17:55:21 +00003019htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3020 xmlChar *name;
3021 xmlChar *oldname;
3022 int i;
3023
3024 if ((CUR != '<') || (NXT(1) != '/')) {
3025 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3026 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3027 ctxt->wellFormed = 0;
3028 return;
3029 }
3030 SKIP(2);
3031
3032 name = htmlParseHTMLName(ctxt);
3033 if (name == NULL) return;
3034
3035 /*
3036 * We should definitely be at the ending "S? '>'" part
3037 */
3038 SKIP_BLANKS;
3039 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3040 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3041 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3042 ctxt->wellFormed = 0;
3043 } else
3044 NEXT;
3045
3046 /*
3047 * If the name read is not one of the element in the parsing stack
3048 * then return, it's just an error.
3049 */
3050 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3051 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3052 }
3053 if (i < 0) {
3054 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3055 ctxt->sax->error(ctxt->userData,
3056 "Unexpected end tag : %s\n", name);
3057 xmlFree(name);
3058 ctxt->wellFormed = 0;
3059 return;
3060 }
3061
3062
3063 /*
3064 * Check for auto-closure of HTML elements.
3065 */
3066
3067 htmlAutoCloseOnClose(ctxt, name);
3068
3069 /*
3070 * Well formedness constraints, opening and closing must match.
3071 * With the exception that the autoclose may have popped stuff out
3072 * of the stack.
3073 */
3074 if (!xmlStrEqual(name, ctxt->name)) {
3075#ifdef DEBUG
3076 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3077#endif
3078 if ((ctxt->name != NULL) &&
3079 (!xmlStrEqual(ctxt->name, name))) {
3080 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3081 ctxt->sax->error(ctxt->userData,
3082 "Opening and ending tag mismatch: %s and %s\n",
3083 name, ctxt->name);
3084 ctxt->wellFormed = 0;
3085 }
3086 }
3087
3088 /*
3089 * SAX: End of Tag
3090 */
3091 oldname = ctxt->name;
3092 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3093 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3094 ctxt->sax->endElement(ctxt->userData, name);
3095 oldname = htmlnamePop(ctxt);
3096 if (oldname != NULL) {
3097#ifdef DEBUG
3098 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3099#endif
3100 xmlFree(oldname);
3101#ifdef DEBUG
3102 } else {
3103 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3104#endif
3105 }
3106 }
3107
3108 if (name != NULL)
3109 xmlFree(name);
3110
3111 return;
3112}
3113
3114
3115/**
3116 * htmlParseReference:
3117 * @ctxt: an HTML parser context
3118 *
3119 * parse and handle entity references in content,
3120 * this will end-up in a call to character() since this is either a
3121 * CharRef, or a predefined entity.
3122 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003123static void
Owen Taylor3473f882001-02-23 17:55:21 +00003124htmlParseReference(htmlParserCtxtPtr ctxt) {
3125 htmlEntityDescPtr ent;
3126 xmlChar out[6];
3127 xmlChar *name;
3128 if (CUR != '&') return;
3129
3130 if (NXT(1) == '#') {
3131 unsigned int c;
3132 int bits, i = 0;
3133
3134 c = htmlParseCharRef(ctxt);
3135 if (c == 0)
3136 return;
3137
3138 if (c < 0x80) { out[i++]= c; bits= -6; }
3139 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3140 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3141 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3142
3143 for ( ; bits >= 0; bits-= 6) {
3144 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3145 }
3146 out[i] = 0;
3147
3148 htmlCheckParagraph(ctxt);
3149 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3150 ctxt->sax->characters(ctxt->userData, out, i);
3151 } else {
3152 ent = htmlParseEntityRef(ctxt, &name);
3153 if (name == NULL) {
3154 htmlCheckParagraph(ctxt);
3155 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3156 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3157 return;
3158 }
3159 if ((ent == NULL) || (ent->value <= 0)) {
3160 htmlCheckParagraph(ctxt);
3161 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3162 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3163 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3164 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3165 }
3166 } else {
3167 unsigned int c;
3168 int bits, i = 0;
3169
3170 c = ent->value;
3171 if (c < 0x80)
3172 { out[i++]= c; bits= -6; }
3173 else if (c < 0x800)
3174 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3175 else if (c < 0x10000)
3176 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3177 else
3178 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3179
3180 for ( ; bits >= 0; bits-= 6) {
3181 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3182 }
3183 out[i] = 0;
3184
3185 htmlCheckParagraph(ctxt);
3186 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3187 ctxt->sax->characters(ctxt->userData, out, i);
3188 }
3189 xmlFree(name);
3190 }
3191}
3192
3193/**
3194 * htmlParseContent:
3195 * @ctxt: an HTML parser context
3196 * @name: the node name
3197 *
3198 * Parse a content: comment, sub-element, reference or text.
3199 *
3200 */
3201
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003202static void
Owen Taylor3473f882001-02-23 17:55:21 +00003203htmlParseContent(htmlParserCtxtPtr ctxt) {
3204 xmlChar *currentNode;
3205 int depth;
3206
3207 currentNode = xmlStrdup(ctxt->name);
3208 depth = ctxt->nameNr;
3209 while (1) {
3210 long cons = ctxt->nbChars;
3211
3212 GROW;
3213 /*
3214 * Our tag or one of it's parent or children is ending.
3215 */
3216 if ((CUR == '<') && (NXT(1) == '/')) {
3217 htmlParseEndTag(ctxt);
3218 if (currentNode != NULL) xmlFree(currentNode);
3219 return;
3220 }
3221
3222 /*
3223 * Has this node been popped out during parsing of
3224 * the next element
3225 */
3226 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
3227 (depth >= ctxt->nameNr)) {
3228 if (currentNode != NULL) xmlFree(currentNode);
3229 return;
3230 }
3231
Daniel Veillardf9533d12001-03-03 10:04:57 +00003232 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3233 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003234 /*
3235 * Handle SCRIPT/STYLE separately
3236 */
3237 htmlParseScript(ctxt);
3238 } else {
3239 /*
3240 * Sometimes DOCTYPE arrives in the middle of the document
3241 */
3242 if ((CUR == '<') && (NXT(1) == '!') &&
3243 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3244 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3245 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3246 (UPP(8) == 'E')) {
3247 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3248 ctxt->sax->error(ctxt->userData,
3249 "Misplaced DOCTYPE declaration\n");
3250 ctxt->wellFormed = 0;
3251 htmlParseDocTypeDecl(ctxt);
3252 }
3253
3254 /*
3255 * First case : a comment
3256 */
3257 if ((CUR == '<') && (NXT(1) == '!') &&
3258 (NXT(2) == '-') && (NXT(3) == '-')) {
3259 htmlParseComment(ctxt);
3260 }
3261
3262 /*
3263 * Second case : a sub-element.
3264 */
3265 else if (CUR == '<') {
3266 htmlParseElement(ctxt);
3267 }
3268
3269 /*
3270 * Third case : a reference. If if has not been resolved,
3271 * parsing returns it's Name, create the node
3272 */
3273 else if (CUR == '&') {
3274 htmlParseReference(ctxt);
3275 }
3276
3277 /*
3278 * Fourth : end of the resource
3279 */
3280 else if (CUR == 0) {
Daniel Veillardf9533d12001-03-03 10:04:57 +00003281 int level = ctxt->nodeNr;
Owen Taylor3473f882001-02-23 17:55:21 +00003282 htmlAutoClose(ctxt, NULL);
Daniel Veillardf9533d12001-03-03 10:04:57 +00003283 if (level == ctxt->nodeNr)
3284 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003285 }
3286
3287 /*
3288 * Last case, text. Note that References are handled directly.
3289 */
3290 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003291 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003292 }
3293
3294 if (cons == ctxt->nbChars) {
3295 if (ctxt->node != NULL) {
3296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3297 ctxt->sax->error(ctxt->userData,
3298 "detected an error in element content\n");
3299 ctxt->wellFormed = 0;
3300 }
3301 break;
3302 }
3303 }
3304 GROW;
3305 }
3306 if (currentNode != NULL) xmlFree(currentNode);
3307}
3308
3309/**
3310 * htmlParseElement:
3311 * @ctxt: an HTML parser context
3312 *
3313 * parse an HTML element, this is highly recursive
3314 *
3315 * [39] element ::= EmptyElemTag | STag content ETag
3316 *
3317 * [41] Attribute ::= Name Eq AttValue
3318 */
3319
3320void
3321htmlParseElement(htmlParserCtxtPtr ctxt) {
3322 xmlChar *name;
3323 xmlChar *currentNode = NULL;
3324 htmlElemDescPtr info;
3325 htmlParserNodeInfo node_info;
3326 xmlChar *oldname;
3327 int depth = ctxt->nameNr;
3328
3329 /* Capture start position */
3330 if (ctxt->record_info) {
3331 node_info.begin_pos = ctxt->input->consumed +
3332 (CUR_PTR - ctxt->input->base);
3333 node_info.begin_line = ctxt->input->line;
3334 }
3335
3336 oldname = xmlStrdup(ctxt->name);
3337 htmlParseStartTag(ctxt);
3338 name = ctxt->name;
3339#ifdef DEBUG
3340 if (oldname == NULL)
3341 xmlGenericError(xmlGenericErrorContext,
3342 "Start of element %s\n", name);
3343 else if (name == NULL)
3344 xmlGenericError(xmlGenericErrorContext,
3345 "Start of element failed, was %s\n", oldname);
3346 else
3347 xmlGenericError(xmlGenericErrorContext,
3348 "Start of element %s, was %s\n", name, oldname);
3349#endif
3350 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3351 (name == NULL)) {
3352 if (CUR == '>')
3353 NEXT;
3354 if (oldname != NULL)
3355 xmlFree(oldname);
3356 return;
3357 }
3358 if (oldname != NULL)
3359 xmlFree(oldname);
3360
3361 /*
3362 * Lookup the info for that element.
3363 */
3364 info = htmlTagLookup(name);
3365 if (info == NULL) {
3366 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3367 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3368 name);
3369 ctxt->wellFormed = 0;
3370 } else if (info->depr) {
3371/***************************
3372 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3373 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3374 name);
3375 ***************************/
3376 }
3377
3378 /*
3379 * Check for an Empty Element labelled the XML/SGML way
3380 */
3381 if ((CUR == '/') && (NXT(1) == '>')) {
3382 SKIP(2);
3383 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3384 ctxt->sax->endElement(ctxt->userData, name);
3385 oldname = htmlnamePop(ctxt);
3386#ifdef DEBUG
3387 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3388#endif
3389 if (oldname != NULL)
3390 xmlFree(oldname);
3391 return;
3392 }
3393
3394 if (CUR == '>') {
3395 NEXT;
3396 } else {
3397 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3398 ctxt->sax->error(ctxt->userData,
3399 "Couldn't find end of Start Tag %s\n",
3400 name);
3401 ctxt->wellFormed = 0;
3402
3403 /*
3404 * end of parsing of this node.
3405 */
3406 if (xmlStrEqual(name, ctxt->name)) {
3407 nodePop(ctxt);
3408 oldname = htmlnamePop(ctxt);
3409#ifdef DEBUG
3410 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3411#endif
3412 if (oldname != NULL)
3413 xmlFree(oldname);
3414 }
3415
3416 /*
3417 * Capture end position and add node
3418 */
3419 if ( currentNode != NULL && ctxt->record_info ) {
3420 node_info.end_pos = ctxt->input->consumed +
3421 (CUR_PTR - ctxt->input->base);
3422 node_info.end_line = ctxt->input->line;
3423 node_info.node = ctxt->node;
3424 xmlParserAddNodeInfo(ctxt, &node_info);
3425 }
3426 return;
3427 }
3428
3429 /*
3430 * Check for an Empty Element from DTD definition
3431 */
3432 if ((info != NULL) && (info->empty)) {
3433 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3434 ctxt->sax->endElement(ctxt->userData, name);
3435 oldname = htmlnamePop(ctxt);
3436#ifdef DEBUG
3437 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3438#endif
3439 if (oldname != NULL)
3440 xmlFree(oldname);
3441 return;
3442 }
3443
3444 /*
3445 * Parse the content of the element:
3446 */
3447 currentNode = xmlStrdup(ctxt->name);
3448 depth = ctxt->nameNr;
3449 while (IS_CHAR(CUR)) {
3450 htmlParseContent(ctxt);
3451 if (ctxt->nameNr < depth) break;
3452 }
3453
3454 if (!IS_CHAR(CUR)) {
3455 /************
3456 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3457 ctxt->sax->error(ctxt->userData,
3458 "Premature end of data in tag %s\n", currentNode);
3459 ctxt->wellFormed = 0;
3460 *************/
3461
3462 /*
3463 * end of parsing of this node.
3464 */
3465 nodePop(ctxt);
3466 oldname = htmlnamePop(ctxt);
3467#ifdef DEBUG
3468 xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
3469#endif
3470 if (oldname != NULL)
3471 xmlFree(oldname);
3472 if (currentNode != NULL)
3473 xmlFree(currentNode);
3474 return;
3475 }
3476
3477 /*
3478 * Capture end position and add node
3479 */
3480 if ( currentNode != NULL && ctxt->record_info ) {
3481 node_info.end_pos = ctxt->input->consumed +
3482 (CUR_PTR - ctxt->input->base);
3483 node_info.end_line = ctxt->input->line;
3484 node_info.node = ctxt->node;
3485 xmlParserAddNodeInfo(ctxt, &node_info);
3486 }
3487 if (currentNode != NULL)
3488 xmlFree(currentNode);
3489}
3490
3491/**
3492 * htmlParseDocument :
3493 * @ctxt: an HTML parser context
3494 *
3495 * parse an HTML document (and build a tree if using the standard SAX
3496 * interface).
3497 *
3498 * Returns 0, -1 in case of error. the parser context is augmented
3499 * as a result of the parsing.
3500 */
3501
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003502static int
Owen Taylor3473f882001-02-23 17:55:21 +00003503htmlParseDocument(htmlParserCtxtPtr ctxt) {
3504 xmlDtdPtr dtd;
3505
3506 htmlDefaultSAXHandlerInit();
3507 ctxt->html = 1;
3508
3509 GROW;
3510 /*
3511 * SAX: beginning of the document processing.
3512 */
3513 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3514 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3515
3516 /*
3517 * Wipe out everything which is before the first '<'
3518 */
3519 SKIP_BLANKS;
3520 if (CUR == 0) {
3521 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3522 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3523 ctxt->wellFormed = 0;
3524 }
3525
3526 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3527 ctxt->sax->startDocument(ctxt->userData);
3528
3529
3530 /*
3531 * Parse possible comments before any content
3532 */
3533 while ((CUR == '<') && (NXT(1) == '!') &&
3534 (NXT(2) == '-') && (NXT(3) == '-')) {
3535 htmlParseComment(ctxt);
3536 SKIP_BLANKS;
3537 }
3538
3539
3540 /*
3541 * Then possibly doc type declaration(s) and more Misc
3542 * (doctypedecl Misc*)?
3543 */
3544 if ((CUR == '<') && (NXT(1) == '!') &&
3545 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3546 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3547 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3548 (UPP(8) == 'E')) {
3549 htmlParseDocTypeDecl(ctxt);
3550 }
3551 SKIP_BLANKS;
3552
3553 /*
3554 * Parse possible comments before any content
3555 */
3556 while ((CUR == '<') && (NXT(1) == '!') &&
3557 (NXT(2) == '-') && (NXT(3) == '-')) {
3558 htmlParseComment(ctxt);
3559 SKIP_BLANKS;
3560 }
3561
3562 /*
3563 * Time to start parsing the tree itself
3564 */
3565 htmlParseContent(ctxt);
3566
3567 /*
3568 * autoclose
3569 */
3570 if (CUR == 0)
3571 htmlAutoClose(ctxt, NULL);
3572
3573
3574 /*
3575 * SAX: end of the document processing.
3576 */
3577 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3578 ctxt->sax->endDocument(ctxt->userData);
3579
3580 if (ctxt->myDoc != NULL) {
3581 dtd = xmlGetIntSubset(ctxt->myDoc);
3582 if (dtd == NULL)
3583 ctxt->myDoc->intSubset =
3584 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3585 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3586 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3587 }
3588 if (! ctxt->wellFormed) return(-1);
3589 return(0);
3590}
3591
3592
3593/************************************************************************
3594 * *
3595 * Parser contexts handling *
3596 * *
3597 ************************************************************************/
3598
3599/**
3600 * xmlInitParserCtxt:
3601 * @ctxt: an HTML parser context
3602 *
3603 * Initialize a parser context
3604 */
3605
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003606static void
Owen Taylor3473f882001-02-23 17:55:21 +00003607htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3608{
3609 htmlSAXHandler *sax;
3610
3611 if (ctxt == NULL) return;
3612 memset(ctxt, 0, sizeof(htmlParserCtxt));
3613
3614 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3615 if (sax == NULL) {
3616 xmlGenericError(xmlGenericErrorContext,
3617 "htmlInitParserCtxt: out of memory\n");
3618 }
3619 else
3620 memset(sax, 0, sizeof(htmlSAXHandler));
3621
3622 /* Allocate the Input stack */
3623 ctxt->inputTab = (htmlParserInputPtr *)
3624 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3625 if (ctxt->inputTab == NULL) {
3626 xmlGenericError(xmlGenericErrorContext,
3627 "htmlInitParserCtxt: out of memory\n");
3628 ctxt->inputNr = 0;
3629 ctxt->inputMax = 0;
3630 ctxt->input = NULL;
3631 return;
3632 }
3633 ctxt->inputNr = 0;
3634 ctxt->inputMax = 5;
3635 ctxt->input = NULL;
3636 ctxt->version = NULL;
3637 ctxt->encoding = NULL;
3638 ctxt->standalone = -1;
3639 ctxt->instate = XML_PARSER_START;
3640
3641 /* Allocate the Node stack */
3642 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3643 if (ctxt->nodeTab == NULL) {
3644 xmlGenericError(xmlGenericErrorContext,
3645 "htmlInitParserCtxt: out of memory\n");
3646 ctxt->nodeNr = 0;
3647 ctxt->nodeMax = 0;
3648 ctxt->node = NULL;
3649 ctxt->inputNr = 0;
3650 ctxt->inputMax = 0;
3651 ctxt->input = NULL;
3652 return;
3653 }
3654 ctxt->nodeNr = 0;
3655 ctxt->nodeMax = 10;
3656 ctxt->node = NULL;
3657
3658 /* Allocate the Name stack */
3659 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3660 if (ctxt->nameTab == NULL) {
3661 xmlGenericError(xmlGenericErrorContext,
3662 "htmlInitParserCtxt: out of memory\n");
3663 ctxt->nameNr = 0;
3664 ctxt->nameMax = 10;
3665 ctxt->name = NULL;
3666 ctxt->nodeNr = 0;
3667 ctxt->nodeMax = 0;
3668 ctxt->node = NULL;
3669 ctxt->inputNr = 0;
3670 ctxt->inputMax = 0;
3671 ctxt->input = NULL;
3672 return;
3673 }
3674 ctxt->nameNr = 0;
3675 ctxt->nameMax = 10;
3676 ctxt->name = NULL;
3677
3678 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3679 else {
3680 ctxt->sax = sax;
3681 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3682 }
3683 ctxt->userData = ctxt;
3684 ctxt->myDoc = NULL;
3685 ctxt->wellFormed = 1;
3686 ctxt->replaceEntities = 0;
3687 ctxt->html = 1;
3688 ctxt->record_info = 0;
3689 ctxt->validate = 0;
3690 ctxt->nbChars = 0;
3691 ctxt->checkIndex = 0;
3692 xmlInitNodeInfoSeq(&ctxt->node_seq);
3693}
3694
3695/**
3696 * htmlFreeParserCtxt:
3697 * @ctxt: an HTML parser context
3698 *
3699 * Free all the memory used by a parser context. However the parsed
3700 * document in ctxt->myDoc is not freed.
3701 */
3702
3703void
3704htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3705{
3706 xmlFreeParserCtxt(ctxt);
3707}
3708
3709/**
3710 * htmlCreateDocParserCtxt :
3711 * @cur: a pointer to an array of xmlChar
3712 * @encoding: a free form C string describing the HTML document encoding, or NULL
3713 *
3714 * Create a parser context for an HTML document.
3715 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003716 * TODO: check the need to add encoding handling there
3717 *
Owen Taylor3473f882001-02-23 17:55:21 +00003718 * Returns the new parser context or NULL
3719 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003720static htmlParserCtxtPtr
3721htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003722 htmlParserCtxtPtr ctxt;
3723 htmlParserInputPtr input;
3724 /* htmlCharEncoding enc; */
3725
3726 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3727 if (ctxt == NULL) {
3728 perror("malloc");
3729 return(NULL);
3730 }
3731 htmlInitParserCtxt(ctxt);
3732 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3733 if (input == NULL) {
3734 perror("malloc");
3735 xmlFree(ctxt);
3736 return(NULL);
3737 }
3738 memset(input, 0, sizeof(htmlParserInput));
3739
3740 input->line = 1;
3741 input->col = 1;
3742 input->base = cur;
3743 input->cur = cur;
3744
3745 inputPush(ctxt, input);
3746 return(ctxt);
3747}
3748
3749/************************************************************************
3750 * *
3751 * Progressive parsing interfaces *
3752 * *
3753 ************************************************************************/
3754
3755/**
3756 * htmlParseLookupSequence:
3757 * @ctxt: an HTML parser context
3758 * @first: the first char to lookup
3759 * @next: the next char to lookup or zero
3760 * @third: the next char to lookup or zero
3761 *
3762 * Try to find if a sequence (first, next, third) or just (first next) or
3763 * (first) is available in the input stream.
3764 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3765 * to avoid rescanning sequences of bytes, it DOES change the state of the
3766 * parser, do not use liberally.
3767 * This is basically similar to xmlParseLookupSequence()
3768 *
3769 * Returns the index to the current parsing point if the full sequence
3770 * is available, -1 otherwise.
3771 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003772static int
Owen Taylor3473f882001-02-23 17:55:21 +00003773htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3774 xmlChar next, xmlChar third) {
3775 int base, len;
3776 htmlParserInputPtr in;
3777 const xmlChar *buf;
3778
3779 in = ctxt->input;
3780 if (in == NULL) return(-1);
3781 base = in->cur - in->base;
3782 if (base < 0) return(-1);
3783 if (ctxt->checkIndex > base)
3784 base = ctxt->checkIndex;
3785 if (in->buf == NULL) {
3786 buf = in->base;
3787 len = in->length;
3788 } else {
3789 buf = in->buf->buffer->content;
3790 len = in->buf->buffer->use;
3791 }
3792 /* take into account the sequence length */
3793 if (third) len -= 2;
3794 else if (next) len --;
3795 for (;base < len;base++) {
3796 if (buf[base] == first) {
3797 if (third != 0) {
3798 if ((buf[base + 1] != next) ||
3799 (buf[base + 2] != third)) continue;
3800 } else if (next != 0) {
3801 if (buf[base + 1] != next) continue;
3802 }
3803 ctxt->checkIndex = 0;
3804#ifdef DEBUG_PUSH
3805 if (next == 0)
3806 xmlGenericError(xmlGenericErrorContext,
3807 "HPP: lookup '%c' found at %d\n",
3808 first, base);
3809 else if (third == 0)
3810 xmlGenericError(xmlGenericErrorContext,
3811 "HPP: lookup '%c%c' found at %d\n",
3812 first, next, base);
3813 else
3814 xmlGenericError(xmlGenericErrorContext,
3815 "HPP: lookup '%c%c%c' found at %d\n",
3816 first, next, third, base);
3817#endif
3818 return(base - (in->cur - in->base));
3819 }
3820 }
3821 ctxt->checkIndex = base;
3822#ifdef DEBUG_PUSH
3823 if (next == 0)
3824 xmlGenericError(xmlGenericErrorContext,
3825 "HPP: lookup '%c' failed\n", first);
3826 else if (third == 0)
3827 xmlGenericError(xmlGenericErrorContext,
3828 "HPP: lookup '%c%c' failed\n", first, next);
3829 else
3830 xmlGenericError(xmlGenericErrorContext,
3831 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3832#endif
3833 return(-1);
3834}
3835
3836/**
3837 * htmlParseTryOrFinish:
3838 * @ctxt: an HTML parser context
3839 * @terminate: last chunk indicator
3840 *
3841 * Try to progress on parsing
3842 *
3843 * Returns zero if no parsing was possible
3844 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003845static int
Owen Taylor3473f882001-02-23 17:55:21 +00003846htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3847 int ret = 0;
3848 htmlParserInputPtr in;
3849 int avail = 0;
3850 xmlChar cur, next;
3851
3852#ifdef DEBUG_PUSH
3853 switch (ctxt->instate) {
3854 case XML_PARSER_EOF:
3855 xmlGenericError(xmlGenericErrorContext,
3856 "HPP: try EOF\n"); break;
3857 case XML_PARSER_START:
3858 xmlGenericError(xmlGenericErrorContext,
3859 "HPP: try START\n"); break;
3860 case XML_PARSER_MISC:
3861 xmlGenericError(xmlGenericErrorContext,
3862 "HPP: try MISC\n");break;
3863 case XML_PARSER_COMMENT:
3864 xmlGenericError(xmlGenericErrorContext,
3865 "HPP: try COMMENT\n");break;
3866 case XML_PARSER_PROLOG:
3867 xmlGenericError(xmlGenericErrorContext,
3868 "HPP: try PROLOG\n");break;
3869 case XML_PARSER_START_TAG:
3870 xmlGenericError(xmlGenericErrorContext,
3871 "HPP: try START_TAG\n");break;
3872 case XML_PARSER_CONTENT:
3873 xmlGenericError(xmlGenericErrorContext,
3874 "HPP: try CONTENT\n");break;
3875 case XML_PARSER_CDATA_SECTION:
3876 xmlGenericError(xmlGenericErrorContext,
3877 "HPP: try CDATA_SECTION\n");break;
3878 case XML_PARSER_END_TAG:
3879 xmlGenericError(xmlGenericErrorContext,
3880 "HPP: try END_TAG\n");break;
3881 case XML_PARSER_ENTITY_DECL:
3882 xmlGenericError(xmlGenericErrorContext,
3883 "HPP: try ENTITY_DECL\n");break;
3884 case XML_PARSER_ENTITY_VALUE:
3885 xmlGenericError(xmlGenericErrorContext,
3886 "HPP: try ENTITY_VALUE\n");break;
3887 case XML_PARSER_ATTRIBUTE_VALUE:
3888 xmlGenericError(xmlGenericErrorContext,
3889 "HPP: try ATTRIBUTE_VALUE\n");break;
3890 case XML_PARSER_DTD:
3891 xmlGenericError(xmlGenericErrorContext,
3892 "HPP: try DTD\n");break;
3893 case XML_PARSER_EPILOG:
3894 xmlGenericError(xmlGenericErrorContext,
3895 "HPP: try EPILOG\n");break;
3896 case XML_PARSER_PI:
3897 xmlGenericError(xmlGenericErrorContext,
3898 "HPP: try PI\n");break;
3899 case XML_PARSER_SYSTEM_LITERAL:
3900 xmlGenericError(xmlGenericErrorContext,
3901 "HPP: try SYSTEM_LITERAL\n");break;
3902 }
3903#endif
3904
3905 while (1) {
3906
3907 in = ctxt->input;
3908 if (in == NULL) break;
3909 if (in->buf == NULL)
3910 avail = in->length - (in->cur - in->base);
3911 else
3912 avail = in->buf->buffer->use - (in->cur - in->base);
3913 if ((avail == 0) && (terminate)) {
3914 htmlAutoClose(ctxt, NULL);
3915 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3916 /*
3917 * SAX: end of the document processing.
3918 */
3919 ctxt->instate = XML_PARSER_EOF;
3920 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3921 ctxt->sax->endDocument(ctxt->userData);
3922 }
3923 }
3924 if (avail < 1)
3925 goto done;
3926 switch (ctxt->instate) {
3927 case XML_PARSER_EOF:
3928 /*
3929 * Document parsing is done !
3930 */
3931 goto done;
3932 case XML_PARSER_START:
3933 /*
3934 * Very first chars read from the document flow.
3935 */
3936 cur = in->cur[0];
3937 if (IS_BLANK(cur)) {
3938 SKIP_BLANKS;
3939 if (in->buf == NULL)
3940 avail = in->length - (in->cur - in->base);
3941 else
3942 avail = in->buf->buffer->use - (in->cur - in->base);
3943 }
3944 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3945 ctxt->sax->setDocumentLocator(ctxt->userData,
3946 &xmlDefaultSAXLocator);
3947 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3948 (!ctxt->disableSAX))
3949 ctxt->sax->startDocument(ctxt->userData);
3950
3951 cur = in->cur[0];
3952 next = in->cur[1];
3953 if ((cur == '<') && (next == '!') &&
3954 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3955 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3956 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3957 (UPP(8) == 'E')) {
3958 if ((!terminate) &&
3959 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
3960 goto done;
3961#ifdef DEBUG_PUSH
3962 xmlGenericError(xmlGenericErrorContext,
3963 "HPP: Parsing internal subset\n");
3964#endif
3965 htmlParseDocTypeDecl(ctxt);
3966 ctxt->instate = XML_PARSER_PROLOG;
3967#ifdef DEBUG_PUSH
3968 xmlGenericError(xmlGenericErrorContext,
3969 "HPP: entering PROLOG\n");
3970#endif
3971 } else {
3972 ctxt->instate = XML_PARSER_MISC;
3973 }
3974#ifdef DEBUG_PUSH
3975 xmlGenericError(xmlGenericErrorContext,
3976 "HPP: entering MISC\n");
3977#endif
3978 break;
3979 case XML_PARSER_MISC:
3980 SKIP_BLANKS;
3981 if (in->buf == NULL)
3982 avail = in->length - (in->cur - in->base);
3983 else
3984 avail = in->buf->buffer->use - (in->cur - in->base);
3985 if (avail < 2)
3986 goto done;
3987 cur = in->cur[0];
3988 next = in->cur[1];
3989 if ((cur == '<') && (next == '!') &&
3990 (in->cur[2] == '-') && (in->cur[3] == '-')) {
3991 if ((!terminate) &&
3992 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
3993 goto done;
3994#ifdef DEBUG_PUSH
3995 xmlGenericError(xmlGenericErrorContext,
3996 "HPP: Parsing Comment\n");
3997#endif
3998 htmlParseComment(ctxt);
3999 ctxt->instate = XML_PARSER_MISC;
4000 } else if ((cur == '<') && (next == '!') &&
4001 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4002 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4003 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4004 (UPP(8) == 'E')) {
4005 if ((!terminate) &&
4006 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4007 goto done;
4008#ifdef DEBUG_PUSH
4009 xmlGenericError(xmlGenericErrorContext,
4010 "HPP: Parsing internal subset\n");
4011#endif
4012 htmlParseDocTypeDecl(ctxt);
4013 ctxt->instate = XML_PARSER_PROLOG;
4014#ifdef DEBUG_PUSH
4015 xmlGenericError(xmlGenericErrorContext,
4016 "HPP: entering PROLOG\n");
4017#endif
4018 } else if ((cur == '<') && (next == '!') &&
4019 (avail < 9)) {
4020 goto done;
4021 } else {
4022 ctxt->instate = XML_PARSER_START_TAG;
4023#ifdef DEBUG_PUSH
4024 xmlGenericError(xmlGenericErrorContext,
4025 "HPP: entering START_TAG\n");
4026#endif
4027 }
4028 break;
4029 case XML_PARSER_PROLOG:
4030 SKIP_BLANKS;
4031 if (in->buf == NULL)
4032 avail = in->length - (in->cur - in->base);
4033 else
4034 avail = in->buf->buffer->use - (in->cur - in->base);
4035 if (avail < 2)
4036 goto done;
4037 cur = in->cur[0];
4038 next = in->cur[1];
4039 if ((cur == '<') && (next == '!') &&
4040 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4041 if ((!terminate) &&
4042 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4043 goto done;
4044#ifdef DEBUG_PUSH
4045 xmlGenericError(xmlGenericErrorContext,
4046 "HPP: Parsing Comment\n");
4047#endif
4048 htmlParseComment(ctxt);
4049 ctxt->instate = XML_PARSER_PROLOG;
4050 } else if ((cur == '<') && (next == '!') &&
4051 (avail < 4)) {
4052 goto done;
4053 } else {
4054 ctxt->instate = XML_PARSER_START_TAG;
4055#ifdef DEBUG_PUSH
4056 xmlGenericError(xmlGenericErrorContext,
4057 "HPP: entering START_TAG\n");
4058#endif
4059 }
4060 break;
4061 case XML_PARSER_EPILOG:
4062 if (in->buf == NULL)
4063 avail = in->length - (in->cur - in->base);
4064 else
4065 avail = in->buf->buffer->use - (in->cur - in->base);
4066 if (avail < 1)
4067 goto done;
4068 cur = in->cur[0];
4069 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004070 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004071 goto done;
4072 }
4073 if (avail < 2)
4074 goto done;
4075 next = in->cur[1];
4076 if ((cur == '<') && (next == '!') &&
4077 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4078 if ((!terminate) &&
4079 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4080 goto done;
4081#ifdef DEBUG_PUSH
4082 xmlGenericError(xmlGenericErrorContext,
4083 "HPP: Parsing Comment\n");
4084#endif
4085 htmlParseComment(ctxt);
4086 ctxt->instate = XML_PARSER_EPILOG;
4087 } else if ((cur == '<') && (next == '!') &&
4088 (avail < 4)) {
4089 goto done;
4090 } else {
4091 ctxt->errNo = XML_ERR_DOCUMENT_END;
4092 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4093 ctxt->sax->error(ctxt->userData,
4094 "Extra content at the end of the document\n");
4095 ctxt->wellFormed = 0;
4096 ctxt->instate = XML_PARSER_EOF;
4097#ifdef DEBUG_PUSH
4098 xmlGenericError(xmlGenericErrorContext,
4099 "HPP: entering EOF\n");
4100#endif
4101 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4102 ctxt->sax->endDocument(ctxt->userData);
4103 goto done;
4104 }
4105 break;
4106 case XML_PARSER_START_TAG: {
4107 xmlChar *name, *oldname;
4108 int depth = ctxt->nameNr;
4109 htmlElemDescPtr info;
4110
4111 if (avail < 2)
4112 goto done;
4113 cur = in->cur[0];
4114 if (cur != '<') {
4115 ctxt->instate = XML_PARSER_CONTENT;
4116#ifdef DEBUG_PUSH
4117 xmlGenericError(xmlGenericErrorContext,
4118 "HPP: entering CONTENT\n");
4119#endif
4120 break;
4121 }
4122 if ((!terminate) &&
4123 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4124 goto done;
4125
4126 oldname = xmlStrdup(ctxt->name);
4127 htmlParseStartTag(ctxt);
4128 name = ctxt->name;
4129#ifdef DEBUG
4130 if (oldname == NULL)
4131 xmlGenericError(xmlGenericErrorContext,
4132 "Start of element %s\n", name);
4133 else if (name == NULL)
4134 xmlGenericError(xmlGenericErrorContext,
4135 "Start of element failed, was %s\n",
4136 oldname);
4137 else
4138 xmlGenericError(xmlGenericErrorContext,
4139 "Start of element %s, was %s\n",
4140 name, oldname);
4141#endif
4142 if (((depth == ctxt->nameNr) &&
4143 (xmlStrEqual(oldname, ctxt->name))) ||
4144 (name == NULL)) {
4145 if (CUR == '>')
4146 NEXT;
4147 if (oldname != NULL)
4148 xmlFree(oldname);
4149 break;
4150 }
4151 if (oldname != NULL)
4152 xmlFree(oldname);
4153
4154 /*
4155 * Lookup the info for that element.
4156 */
4157 info = htmlTagLookup(name);
4158 if (info == NULL) {
4159 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4160 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4161 name);
4162 ctxt->wellFormed = 0;
4163 } else if (info->depr) {
4164 /***************************
4165 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4166 ctxt->sax->warning(ctxt->userData,
4167 "Tag %s is deprecated\n",
4168 name);
4169 ***************************/
4170 }
4171
4172 /*
4173 * Check for an Empty Element labelled the XML/SGML way
4174 */
4175 if ((CUR == '/') && (NXT(1) == '>')) {
4176 SKIP(2);
4177 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4178 ctxt->sax->endElement(ctxt->userData, name);
4179 oldname = htmlnamePop(ctxt);
4180#ifdef DEBUG
4181 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4182 oldname);
4183#endif
4184 if (oldname != NULL)
4185 xmlFree(oldname);
4186 ctxt->instate = XML_PARSER_CONTENT;
4187#ifdef DEBUG_PUSH
4188 xmlGenericError(xmlGenericErrorContext,
4189 "HPP: entering CONTENT\n");
4190#endif
4191 break;
4192 }
4193
4194 if (CUR == '>') {
4195 NEXT;
4196 } else {
4197 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4198 ctxt->sax->error(ctxt->userData,
4199 "Couldn't find end of Start Tag %s\n",
4200 name);
4201 ctxt->wellFormed = 0;
4202
4203 /*
4204 * end of parsing of this node.
4205 */
4206 if (xmlStrEqual(name, ctxt->name)) {
4207 nodePop(ctxt);
4208 oldname = htmlnamePop(ctxt);
4209#ifdef DEBUG
4210 xmlGenericError(xmlGenericErrorContext,
4211 "End of start tag problem: popping out %s\n", oldname);
4212#endif
4213 if (oldname != NULL)
4214 xmlFree(oldname);
4215 }
4216
4217 ctxt->instate = XML_PARSER_CONTENT;
4218#ifdef DEBUG_PUSH
4219 xmlGenericError(xmlGenericErrorContext,
4220 "HPP: entering CONTENT\n");
4221#endif
4222 break;
4223 }
4224
4225 /*
4226 * Check for an Empty Element from DTD definition
4227 */
4228 if ((info != NULL) && (info->empty)) {
4229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4230 ctxt->sax->endElement(ctxt->userData, name);
4231 oldname = htmlnamePop(ctxt);
4232#ifdef DEBUG
4233 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4234#endif
4235 if (oldname != NULL)
4236 xmlFree(oldname);
4237 }
4238 ctxt->instate = XML_PARSER_CONTENT;
4239#ifdef DEBUG_PUSH
4240 xmlGenericError(xmlGenericErrorContext,
4241 "HPP: entering CONTENT\n");
4242#endif
4243 break;
4244 }
4245 case XML_PARSER_CONTENT: {
4246 long cons;
4247 /*
4248 * Handle preparsed entities and charRef
4249 */
4250 if (ctxt->token != 0) {
4251 xmlChar chr[2] = { 0 , 0 } ;
4252
4253 chr[0] = (xmlChar) ctxt->token;
4254 htmlCheckParagraph(ctxt);
4255 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4256 ctxt->sax->characters(ctxt->userData, chr, 1);
4257 ctxt->token = 0;
4258 ctxt->checkIndex = 0;
4259 }
4260 if ((avail == 1) && (terminate)) {
4261 cur = in->cur[0];
4262 if ((cur != '<') && (cur != '&')) {
4263 if (ctxt->sax != NULL) {
4264 if (IS_BLANK(cur)) {
4265 if (ctxt->sax->ignorableWhitespace != NULL)
4266 ctxt->sax->ignorableWhitespace(
4267 ctxt->userData, &cur, 1);
4268 } else {
4269 htmlCheckParagraph(ctxt);
4270 if (ctxt->sax->characters != NULL)
4271 ctxt->sax->characters(
4272 ctxt->userData, &cur, 1);
4273 }
4274 }
4275 ctxt->token = 0;
4276 ctxt->checkIndex = 0;
4277 NEXT;
4278 }
4279 break;
4280 }
4281 if (avail < 2)
4282 goto done;
4283 cur = in->cur[0];
4284 next = in->cur[1];
4285 cons = ctxt->nbChars;
4286 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4287 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4288 /*
4289 * Handle SCRIPT/STYLE separately
4290 */
4291 if ((!terminate) &&
4292 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4293 goto done;
4294 htmlParseScript(ctxt);
4295 if ((cur == '<') && (next == '/')) {
4296 ctxt->instate = XML_PARSER_END_TAG;
4297 ctxt->checkIndex = 0;
4298#ifdef DEBUG_PUSH
4299 xmlGenericError(xmlGenericErrorContext,
4300 "HPP: entering END_TAG\n");
4301#endif
4302 break;
4303 }
4304 } else {
4305 /*
4306 * Sometimes DOCTYPE arrives in the middle of the document
4307 */
4308 if ((cur == '<') && (next == '!') &&
4309 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4310 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4311 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4312 (UPP(8) == 'E')) {
4313 if ((!terminate) &&
4314 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4315 goto done;
4316 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4317 ctxt->sax->error(ctxt->userData,
4318 "Misplaced DOCTYPE declaration\n");
4319 ctxt->wellFormed = 0;
4320 htmlParseDocTypeDecl(ctxt);
4321 } else if ((cur == '<') && (next == '!') &&
4322 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4323 if ((!terminate) &&
4324 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4325 goto done;
4326#ifdef DEBUG_PUSH
4327 xmlGenericError(xmlGenericErrorContext,
4328 "HPP: Parsing Comment\n");
4329#endif
4330 htmlParseComment(ctxt);
4331 ctxt->instate = XML_PARSER_CONTENT;
4332 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4333 goto done;
4334 } else if ((cur == '<') && (next == '/')) {
4335 ctxt->instate = XML_PARSER_END_TAG;
4336 ctxt->checkIndex = 0;
4337#ifdef DEBUG_PUSH
4338 xmlGenericError(xmlGenericErrorContext,
4339 "HPP: entering END_TAG\n");
4340#endif
4341 break;
4342 } else if (cur == '<') {
4343 ctxt->instate = XML_PARSER_START_TAG;
4344 ctxt->checkIndex = 0;
4345#ifdef DEBUG_PUSH
4346 xmlGenericError(xmlGenericErrorContext,
4347 "HPP: entering START_TAG\n");
4348#endif
4349 break;
4350 } else if (cur == '&') {
4351 if ((!terminate) &&
4352 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4353 goto done;
4354#ifdef DEBUG_PUSH
4355 xmlGenericError(xmlGenericErrorContext,
4356 "HPP: Parsing Reference\n");
4357#endif
4358 /* TODO: check generation of subtrees if noent !!! */
4359 htmlParseReference(ctxt);
4360 } else {
4361 /* TODO Avoid the extra copy, handle directly !!!!!! */
4362 /*
4363 * Goal of the following test is :
4364 * - minimize calls to the SAX 'character' callback
4365 * when they are mergeable
4366 */
4367 if ((ctxt->inputNr == 1) &&
4368 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4369 if ((!terminate) &&
4370 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4371 goto done;
4372 }
4373 ctxt->checkIndex = 0;
4374#ifdef DEBUG_PUSH
4375 xmlGenericError(xmlGenericErrorContext,
4376 "HPP: Parsing char data\n");
4377#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004378 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004379 }
4380 }
4381 if (cons == ctxt->nbChars) {
4382 if (ctxt->node != NULL) {
4383 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4384 ctxt->sax->error(ctxt->userData,
4385 "detected an error in element content\n");
4386 ctxt->wellFormed = 0;
4387 }
4388 NEXT;
4389 break;
4390 }
4391
4392 break;
4393 }
4394 case XML_PARSER_END_TAG:
4395 if (avail < 2)
4396 goto done;
4397 if ((!terminate) &&
4398 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4399 goto done;
4400 htmlParseEndTag(ctxt);
4401 if (ctxt->nameNr == 0) {
4402 ctxt->instate = XML_PARSER_EPILOG;
4403 } else {
4404 ctxt->instate = XML_PARSER_CONTENT;
4405 }
4406 ctxt->checkIndex = 0;
4407#ifdef DEBUG_PUSH
4408 xmlGenericError(xmlGenericErrorContext,
4409 "HPP: entering CONTENT\n");
4410#endif
4411 break;
4412 case XML_PARSER_CDATA_SECTION:
4413 xmlGenericError(xmlGenericErrorContext,
4414 "HPP: internal error, state == CDATA\n");
4415 ctxt->instate = XML_PARSER_CONTENT;
4416 ctxt->checkIndex = 0;
4417#ifdef DEBUG_PUSH
4418 xmlGenericError(xmlGenericErrorContext,
4419 "HPP: entering CONTENT\n");
4420#endif
4421 break;
4422 case XML_PARSER_DTD:
4423 xmlGenericError(xmlGenericErrorContext,
4424 "HPP: internal error, state == DTD\n");
4425 ctxt->instate = XML_PARSER_CONTENT;
4426 ctxt->checkIndex = 0;
4427#ifdef DEBUG_PUSH
4428 xmlGenericError(xmlGenericErrorContext,
4429 "HPP: entering CONTENT\n");
4430#endif
4431 break;
4432 case XML_PARSER_COMMENT:
4433 xmlGenericError(xmlGenericErrorContext,
4434 "HPP: internal error, state == COMMENT\n");
4435 ctxt->instate = XML_PARSER_CONTENT;
4436 ctxt->checkIndex = 0;
4437#ifdef DEBUG_PUSH
4438 xmlGenericError(xmlGenericErrorContext,
4439 "HPP: entering CONTENT\n");
4440#endif
4441 break;
4442 case XML_PARSER_PI:
4443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: internal error, state == PI\n");
4445 ctxt->instate = XML_PARSER_CONTENT;
4446 ctxt->checkIndex = 0;
4447#ifdef DEBUG_PUSH
4448 xmlGenericError(xmlGenericErrorContext,
4449 "HPP: entering CONTENT\n");
4450#endif
4451 break;
4452 case XML_PARSER_ENTITY_DECL:
4453 xmlGenericError(xmlGenericErrorContext,
4454 "HPP: internal error, state == ENTITY_DECL\n");
4455 ctxt->instate = XML_PARSER_CONTENT;
4456 ctxt->checkIndex = 0;
4457#ifdef DEBUG_PUSH
4458 xmlGenericError(xmlGenericErrorContext,
4459 "HPP: entering CONTENT\n");
4460#endif
4461 break;
4462 case XML_PARSER_ENTITY_VALUE:
4463 xmlGenericError(xmlGenericErrorContext,
4464 "HPP: internal error, state == ENTITY_VALUE\n");
4465 ctxt->instate = XML_PARSER_CONTENT;
4466 ctxt->checkIndex = 0;
4467#ifdef DEBUG_PUSH
4468 xmlGenericError(xmlGenericErrorContext,
4469 "HPP: entering DTD\n");
4470#endif
4471 break;
4472 case XML_PARSER_ATTRIBUTE_VALUE:
4473 xmlGenericError(xmlGenericErrorContext,
4474 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4475 ctxt->instate = XML_PARSER_START_TAG;
4476 ctxt->checkIndex = 0;
4477#ifdef DEBUG_PUSH
4478 xmlGenericError(xmlGenericErrorContext,
4479 "HPP: entering START_TAG\n");
4480#endif
4481 break;
4482 case XML_PARSER_SYSTEM_LITERAL:
4483 xmlGenericError(xmlGenericErrorContext,
4484 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4485 ctxt->instate = XML_PARSER_CONTENT;
4486 ctxt->checkIndex = 0;
4487#ifdef DEBUG_PUSH
4488 xmlGenericError(xmlGenericErrorContext,
4489 "HPP: entering CONTENT\n");
4490#endif
4491 break;
4492 case XML_PARSER_IGNORE:
4493 xmlGenericError(xmlGenericErrorContext,
4494 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4495 ctxt->instate = XML_PARSER_CONTENT;
4496 ctxt->checkIndex = 0;
4497#ifdef DEBUG_PUSH
4498 xmlGenericError(xmlGenericErrorContext,
4499 "HPP: entering CONTENT\n");
4500#endif
4501 break;
4502 }
4503 }
4504done:
4505 if ((avail == 0) && (terminate)) {
4506 htmlAutoClose(ctxt, NULL);
4507 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4508 /*
4509 * SAX: end of the document processing.
4510 */
4511 ctxt->instate = XML_PARSER_EOF;
4512 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4513 ctxt->sax->endDocument(ctxt->userData);
4514 }
4515 }
4516 if ((ctxt->myDoc != NULL) &&
4517 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4518 (ctxt->instate == XML_PARSER_EPILOG))) {
4519 xmlDtdPtr dtd;
4520 dtd = xmlGetIntSubset(ctxt->myDoc);
4521 if (dtd == NULL)
4522 ctxt->myDoc->intSubset =
4523 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4524 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4525 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4526 }
4527#ifdef DEBUG_PUSH
4528 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4529#endif
4530 return(ret);
4531}
4532
4533/**
Owen Taylor3473f882001-02-23 17:55:21 +00004534 * htmlParseChunk:
4535 * @ctxt: an XML parser context
4536 * @chunk: an char array
4537 * @size: the size in byte of the chunk
4538 * @terminate: last chunk indicator
4539 *
4540 * Parse a Chunk of memory
4541 *
4542 * Returns zero if no error, the xmlParserErrors otherwise.
4543 */
4544int
4545htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4546 int terminate) {
4547 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4548 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4549 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4550 int cur = ctxt->input->cur - ctxt->input->base;
4551
4552 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4553 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4554 ctxt->input->cur = ctxt->input->base + cur;
4555#ifdef DEBUG_PUSH
4556 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4557#endif
4558
4559 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4560 htmlParseTryOrFinish(ctxt, terminate);
4561 } else if (ctxt->instate != XML_PARSER_EOF) {
4562 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4563 htmlParseTryOrFinish(ctxt, terminate);
4564 }
4565 if (terminate) {
4566 if ((ctxt->instate != XML_PARSER_EOF) &&
4567 (ctxt->instate != XML_PARSER_EPILOG) &&
4568 (ctxt->instate != XML_PARSER_MISC)) {
4569 ctxt->errNo = XML_ERR_DOCUMENT_END;
4570 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4571 ctxt->sax->error(ctxt->userData,
4572 "Extra content at the end of the document\n");
4573 ctxt->wellFormed = 0;
4574 }
4575 if (ctxt->instate != XML_PARSER_EOF) {
4576 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4577 ctxt->sax->endDocument(ctxt->userData);
4578 }
4579 ctxt->instate = XML_PARSER_EOF;
4580 }
4581 return((xmlParserErrors) ctxt->errNo);
4582}
4583
4584/************************************************************************
4585 * *
4586 * User entry points *
4587 * *
4588 ************************************************************************/
4589
4590/**
4591 * htmlCreatePushParserCtxt :
4592 * @sax: a SAX handler
4593 * @user_data: The user data returned on SAX callbacks
4594 * @chunk: a pointer to an array of chars
4595 * @size: number of chars in the array
4596 * @filename: an optional file name or URI
4597 * @enc: an optional encoding
4598 *
4599 * Create a parser context for using the HTML parser in push mode
4600 * To allow content encoding detection, @size should be >= 4
4601 * The value of @filename is used for fetching external entities
4602 * and error/warning reports.
4603 *
4604 * Returns the new parser context or NULL
4605 */
4606htmlParserCtxtPtr
4607htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4608 const char *chunk, int size, const char *filename,
4609 xmlCharEncoding enc) {
4610 htmlParserCtxtPtr ctxt;
4611 htmlParserInputPtr inputStream;
4612 xmlParserInputBufferPtr buf;
4613
4614 buf = xmlAllocParserInputBuffer(enc);
4615 if (buf == NULL) return(NULL);
4616
4617 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4618 if (ctxt == NULL) {
4619 xmlFree(buf);
4620 return(NULL);
4621 }
4622 memset(ctxt, 0, sizeof(htmlParserCtxt));
4623 htmlInitParserCtxt(ctxt);
4624 if (sax != NULL) {
4625 if (ctxt->sax != &htmlDefaultSAXHandler)
4626 xmlFree(ctxt->sax);
4627 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4628 if (ctxt->sax == NULL) {
4629 xmlFree(buf);
4630 xmlFree(ctxt);
4631 return(NULL);
4632 }
4633 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4634 if (user_data != NULL)
4635 ctxt->userData = user_data;
4636 }
4637 if (filename == NULL) {
4638 ctxt->directory = NULL;
4639 } else {
4640 ctxt->directory = xmlParserGetDirectory(filename);
4641 }
4642
4643 inputStream = htmlNewInputStream(ctxt);
4644 if (inputStream == NULL) {
4645 xmlFreeParserCtxt(ctxt);
4646 return(NULL);
4647 }
4648
4649 if (filename == NULL)
4650 inputStream->filename = NULL;
4651 else
4652 inputStream->filename = xmlMemStrdup(filename);
4653 inputStream->buf = buf;
4654 inputStream->base = inputStream->buf->buffer->content;
4655 inputStream->cur = inputStream->buf->buffer->content;
4656
4657 inputPush(ctxt, inputStream);
4658
4659 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4660 (ctxt->input->buf != NULL)) {
4661 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4662#ifdef DEBUG_PUSH
4663 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4664#endif
4665 }
4666
4667 return(ctxt);
4668}
4669
4670/**
4671 * htmlSAXParseDoc :
4672 * @cur: a pointer to an array of xmlChar
4673 * @encoding: a free form C string describing the HTML document encoding, or NULL
4674 * @sax: the SAX handler block
4675 * @userData: if using SAX, this pointer will be provided on callbacks.
4676 *
4677 * parse an HTML in-memory document and build a tree.
4678 * It use the given SAX function block to handle the parsing callback.
4679 * If sax is NULL, fallback to the default DOM tree building routines.
4680 *
4681 * Returns the resulting document tree
4682 */
4683
4684htmlDocPtr
4685htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4686 htmlDocPtr ret;
4687 htmlParserCtxtPtr ctxt;
4688
4689 if (cur == NULL) return(NULL);
4690
4691
4692 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4693 if (ctxt == NULL) return(NULL);
4694 if (sax != NULL) {
4695 ctxt->sax = sax;
4696 ctxt->userData = userData;
4697 }
4698
4699 htmlParseDocument(ctxt);
4700 ret = ctxt->myDoc;
4701 if (sax != NULL) {
4702 ctxt->sax = NULL;
4703 ctxt->userData = NULL;
4704 }
4705 htmlFreeParserCtxt(ctxt);
4706
4707 return(ret);
4708}
4709
4710/**
4711 * htmlParseDoc :
4712 * @cur: a pointer to an array of xmlChar
4713 * @encoding: a free form C string describing the HTML document encoding, or NULL
4714 *
4715 * parse an HTML in-memory document and build a tree.
4716 *
4717 * Returns the resulting document tree
4718 */
4719
4720htmlDocPtr
4721htmlParseDoc(xmlChar *cur, const char *encoding) {
4722 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4723}
4724
4725
4726/**
4727 * htmlCreateFileParserCtxt :
4728 * @filename: the filename
4729 * @encoding: a free form C string describing the HTML document encoding, or NULL
4730 *
4731 * Create a parser context for a file content.
4732 * Automatic support for ZLIB/Compress compressed document is provided
4733 * by default if found at compile-time.
4734 *
4735 * Returns the new parser context or NULL
4736 */
4737htmlParserCtxtPtr
4738htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4739{
4740 htmlParserCtxtPtr ctxt;
4741 htmlParserInputPtr inputStream;
4742 xmlParserInputBufferPtr buf;
4743 /* htmlCharEncoding enc; */
4744 xmlChar *content, *content_line = (xmlChar *) "charset=";
4745
4746 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4747 if (buf == NULL) return(NULL);
4748
4749 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4750 if (ctxt == NULL) {
4751 perror("malloc");
4752 return(NULL);
4753 }
4754 memset(ctxt, 0, sizeof(htmlParserCtxt));
4755 htmlInitParserCtxt(ctxt);
4756 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4757 if (inputStream == NULL) {
4758 perror("malloc");
4759 xmlFree(ctxt);
4760 return(NULL);
4761 }
4762 memset(inputStream, 0, sizeof(htmlParserInput));
4763
4764 inputStream->filename = xmlMemStrdup(filename);
4765 inputStream->line = 1;
4766 inputStream->col = 1;
4767 inputStream->buf = buf;
4768 inputStream->directory = NULL;
4769
4770 inputStream->base = inputStream->buf->buffer->content;
4771 inputStream->cur = inputStream->buf->buffer->content;
4772 inputStream->free = NULL;
4773
4774 inputPush(ctxt, inputStream);
4775
4776 /* set encoding */
4777 if (encoding) {
4778 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4779 if (content) {
4780 strcpy ((char *)content, (char *)content_line);
4781 strcat ((char *)content, (char *)encoding);
4782 htmlCheckEncoding (ctxt, content);
4783 xmlFree (content);
4784 }
4785 }
4786
4787 return(ctxt);
4788}
4789
4790/**
4791 * htmlSAXParseFile :
4792 * @filename: the filename
4793 * @encoding: a free form C string describing the HTML document encoding, or NULL
4794 * @sax: the SAX handler block
4795 * @userData: if using SAX, this pointer will be provided on callbacks.
4796 *
4797 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4798 * compressed document is provided by default if found at compile-time.
4799 * It use the given SAX function block to handle the parsing callback.
4800 * If sax is NULL, fallback to the default DOM tree building routines.
4801 *
4802 * Returns the resulting document tree
4803 */
4804
4805htmlDocPtr
4806htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4807 void *userData) {
4808 htmlDocPtr ret;
4809 htmlParserCtxtPtr ctxt;
4810 htmlSAXHandlerPtr oldsax = NULL;
4811
4812 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4813 if (ctxt == NULL) return(NULL);
4814 if (sax != NULL) {
4815 oldsax = ctxt->sax;
4816 ctxt->sax = sax;
4817 ctxt->userData = userData;
4818 }
4819
4820 htmlParseDocument(ctxt);
4821
4822 ret = ctxt->myDoc;
4823 if (sax != NULL) {
4824 ctxt->sax = oldsax;
4825 ctxt->userData = NULL;
4826 }
4827 htmlFreeParserCtxt(ctxt);
4828
4829 return(ret);
4830}
4831
4832/**
4833 * htmlParseFile :
4834 * @filename: the filename
4835 * @encoding: a free form C string describing the HTML document encoding, or NULL
4836 *
4837 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4838 * compressed document is provided by default if found at compile-time.
4839 *
4840 * Returns the resulting document tree
4841 */
4842
4843htmlDocPtr
4844htmlParseFile(const char *filename, const char *encoding) {
4845 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4846}
4847
4848/**
4849 * htmlHandleOmittedElem:
4850 * @val: int 0 or 1
4851 *
4852 * Set and return the previous value for handling HTML omitted tags.
4853 *
4854 * Returns the last value for 0 for no handling, 1 for auto insertion.
4855 */
4856
4857int
4858htmlHandleOmittedElem(int val) {
4859 int old = htmlOmittedDefaultValue;
4860
4861 htmlOmittedDefaultValue = val;
4862 return(old);
4863}
4864
4865#endif /* LIBXML_HTML_ENABLED */