blob: 4b3bac867d6ddb0788e5f9bc8569fa34313763d7 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#include "win32config.h"
11#else
12#include "config.h"
13#endif
14
15#include <libxml/xmlversion.h>
16#ifdef LIBXML_HTML_ENABLED
17#include <stdio.h>
18#include <string.h>
19#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
23#include <stdlib.h>
24#endif
25#ifdef HAVE_SYS_STAT_H
26#include <sys/stat.h>
27#endif
28#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
38#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
40#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
42#include <libxml/xmlerror.h>
43#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000044#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000045#include <libxml/entities.h>
46#include <libxml/encoding.h>
47#include <libxml/valid.h>
48#include <libxml/xmlIO.h>
49
50#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
57int htmlOmittedDefaultValue = 1;
58
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61
62/************************************************************************
63 * *
Owen Taylor3473f882001-02-23 17:55:21 +000064 * Parser stacks related functions and macros *
65 * *
66 ************************************************************************/
67
/*
 * Generic functions for accessing stacks in the Parser Context.
 *
 * PUSH_AND_POP(scope, type, name) expands to two functions working on the
 * array ctxt->name##Tab (ctxt->name##Nr used slots, ctxt->name##Max
 * allocated slots) and on ctxt->name, which mirrors the top of the stack:
 *
 *   html<name>Push(ctxt, value)
 *       Stores value on top of the stack, growing the array when it is
 *       full. Returns the index the value was stored at, or 0 when the
 *       array could not be grown (note that 0 is also the index returned
 *       by the very first successful push).
 *   html<name>Pop(ctxt)
 *       Removes and returns the top of the stack, or returns 0 when the
 *       stack is empty. ctxt->name is set to the new top, or to NULL once
 *       the stack is empty.
 *
 * The grown table is kept in a temporary until xmlRealloc() has succeeded,
 * so on failure the existing table, its capacity and its content are left
 * untouched. A capacity that is not positive is replaced by a small default
 * instead of being doubled. Popping an empty stack leaves the element count
 * at zero, so a later push cannot index below the start of the table.
 */

#define PUSH_AND_POP(scope, type, name)					\
scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) {	\
    if (ctxt->name##Nr >= ctxt->name##Max) {				\
        int newMax = (ctxt->name##Max > 0) ?				\
                     ctxt->name##Max * 2 : 10;				\
        type *newTab = (type *) xmlRealloc(ctxt->name##Tab,		\
                     newMax * sizeof(ctxt->name##Tab[0]));		\
        if (newTab == NULL) {						\
            xmlGenericError(xmlGenericErrorContext,			\
                    "realloc failed !\n");				\
            return(0);							\
        }								\
        ctxt->name##Tab = newTab;					\
        ctxt->name##Max = newMax;					\
    }									\
    ctxt->name##Tab[ctxt->name##Nr] = value;				\
    ctxt->name = value;							\
    return(ctxt->name##Nr++);						\
}									\
scope type html##name##Pop(htmlParserCtxtPtr ctxt) {			\
    type ret;								\
    if (ctxt->name##Nr <= 0) return(0);					\
    ctxt->name##Nr--;							\
    if (ctxt->name##Nr > 0)						\
        ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1];		\
    else								\
        ctxt->name = NULL;						\
    ret = ctxt->name##Tab[ctxt->name##Nr];				\
    ctxt->name##Tab[ctxt->name##Nr] = 0;				\
    return(ret);							\
}
101
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000102/* PUSH_AND_POP(static, xmlNodePtr, node) */
103PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +0000104
/*
 * Macros for accessing the content. They should be used only by the parser
 * and are not exported. Every macro expects a variable named ctxt, pointing
 * to the current parser context, to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to
 * use them; they look at raw xmlChar values and should only be used to
 * compare against, or skip over, ASCII content:
 *
 *   CUR_PTR    the current pointer to the xmlChar to be parsed.
 *   CUR        the current xmlChar value, as an int.
 *   CURRENT    same expansion as CUR.
 *   RAW        the current xmlChar value, or -1 when ctxt->token is set.
 *   NXT(n)     the n'th next xmlChar.
 *   UPPER      the current xmlChar converted to uppercase.
 *   UPP(n)     the n'th next xmlChar converted to uppercase.
 *   SKIP(n)    skip n xmlChar, adding n to ctxt->nbChars.
 *
 * Helpers built on parser functions:
 *
 *   SHRINK         calls xmlParserInputShrink() on the current input.
 *   GROW           calls xmlParserInputGrow() with INPUT_CHUNK.
 *   SKIP_BLANKS    calls htmlSkipBlankChars().
 *   NEXT           calls xmlNextChar() and counts one more char.
 *   NEXTL(l)       advance by l xmlChar, maintaining the line and column
 *                  of the input, clearing ctxt->token and counting one
 *                  more char.
 *   CUR_CHAR(l)    htmlCurrentChar() at the current position; the length
 *                  of the char is stored in l.
 *   CUR_SCHAR(s, l) xmlStringCurrentChar() on the string s; the length of
 *                  the char is stored in l.
 *   COPY_BUF(l,b,i,v)  append the char v of length l to buffer b at index
 *                  i, advancing i by the number of xmlChar written.
 *
 * All expression macros are fully parenthesized and COPY_BUF/NEXTL are
 * wrapped in do { } while (0), so each of them behaves as a single
 * expression or statement wherever it is used (e.g. in an if/else).
 */

#define CUR_PTR (ctxt->input->cur)

#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
181
182/**
183 * htmlCurrentChar:
184 * @ctxt: the HTML parser context
185 * @len: pointer to the length of the char read
186 *
187 * The current char value, if using UTF-8 this may actaully span multiple
188 * bytes in the input buffer. Implement the end of line normalization:
189 * 2.11 End-of-Line Handling
190 * If the encoding is unspecified, in the case we find an ISO-Latin-1
191 * char, then the encoding converter is plugged in automatically.
192 *
193 * Returns the current char value and its lenght
194 */
195
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000196static int
Owen Taylor3473f882001-02-23 17:55:21 +0000197htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
198 if (ctxt->instate == XML_PARSER_EOF)
199 return(0);
200
201 if (ctxt->token != 0) {
202 *len = 0;
203 return(ctxt->token);
204 }
205 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
206 /*
207 * We are supposed to handle UTF8, check it's valid
208 * From rfc2044: encoding of the Unicode values on UTF-8:
209 *
210 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
211 * 0000 0000-0000 007F 0xxxxxxx
212 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
213 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
214 *
215 * Check for the 0x110000 limit too
216 */
217 const unsigned char *cur = ctxt->input->cur;
218 unsigned char c;
219 unsigned int val;
220
221 c = *cur;
222 if (c & 0x80) {
223 if (cur[1] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[1] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xe0) == 0xe0) {
228
229 if (cur[2] == 0)
230 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
231 if ((cur[2] & 0xc0) != 0x80)
232 goto encoding_error;
233 if ((c & 0xf0) == 0xf0) {
234 if (cur[3] == 0)
235 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
236 if (((c & 0xf8) != 0xf0) ||
237 ((cur[3] & 0xc0) != 0x80))
238 goto encoding_error;
239 /* 4-byte code */
240 *len = 4;
241 val = (cur[0] & 0x7) << 18;
242 val |= (cur[1] & 0x3f) << 12;
243 val |= (cur[2] & 0x3f) << 6;
244 val |= cur[3] & 0x3f;
245 } else {
246 /* 3-byte code */
247 *len = 3;
248 val = (cur[0] & 0xf) << 12;
249 val |= (cur[1] & 0x3f) << 6;
250 val |= cur[2] & 0x3f;
251 }
252 } else {
253 /* 2-byte code */
254 *len = 2;
255 val = (cur[0] & 0x1f) << 6;
256 val |= cur[1] & 0x3f;
257 }
258 if (!IS_CHAR(val)) {
259 ctxt->errNo = XML_ERR_INVALID_ENCODING;
260 if ((ctxt->sax != NULL) &&
261 (ctxt->sax->error != NULL))
262 ctxt->sax->error(ctxt->userData,
263 "Char 0x%X out of allowed range\n", val);
264 ctxt->wellFormed = 0;
265 ctxt->disableSAX = 1;
266 }
267 return(val);
268 } else {
269 /* 1-byte code */
270 *len = 1;
271 return((int) *ctxt->input->cur);
272 }
273 }
274 /*
275 * Assume it's a fixed lenght encoding (1) with
276 * a compatibke encoding for the ASCII set, since
277 * XML constructs only use < 128 chars
278 */
279 *len = 1;
280 if ((int) *ctxt->input->cur < 0x80)
281 return((int) *ctxt->input->cur);
282
283 /*
284 * Humm this is bad, do an automatic flow conversion
285 */
286 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
287 ctxt->charset = XML_CHAR_ENCODING_UTF8;
288 return(xmlCurrentChar(ctxt, len));
289
290encoding_error:
291 /*
292 * If we detect an UTF8 error that probably mean that the
293 * input encoding didn't get properly advertized in the
294 * declaration header. Report the error and switch the encoding
295 * to ISO-Latin-1 (if you don't like this policy, just declare the
296 * encoding !)
297 */
298 ctxt->errNo = XML_ERR_INVALID_ENCODING;
299 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
300 ctxt->sax->error(ctxt->userData,
301 "Input is not proper UTF-8, indicate encoding !\n");
302 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
303 ctxt->input->cur[0], ctxt->input->cur[1],
304 ctxt->input->cur[2], ctxt->input->cur[3]);
305 }
306
307 ctxt->charset = XML_CHAR_ENCODING_8859_1;
308 *len = 1;
309 return((int) *ctxt->input->cur);
310}
311
312/**
Owen Taylor3473f882001-02-23 17:55:21 +0000313 * htmlSkipBlankChars:
314 * @ctxt: the HTML parser context
315 *
316 * skip all blanks character found at that point in the input streams.
317 *
318 * Returns the number of space chars skipped
319 */
320
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000321static int
Owen Taylor3473f882001-02-23 17:55:21 +0000322htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
323 int res = 0;
324
325 while (IS_BLANK(*(ctxt->input->cur))) {
326 if ((*ctxt->input->cur == 0) &&
327 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
328 xmlPopInput(ctxt);
329 } else {
330 if (*(ctxt->input->cur) == '\n') {
331 ctxt->input->line++; ctxt->input->col = 1;
332 } else ctxt->input->col++;
333 ctxt->input->cur++;
334 ctxt->nbChars++;
335 if (*ctxt->input->cur == 0)
336 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
337 }
338 res++;
339 }
340 return(res);
341}
342
343
344
345/************************************************************************
346 * *
347 * The list of HTML elements and their properties *
348 * *
349 ************************************************************************/
350
351/*
352 * Start Tag: 1 means the start tag can be ommited
353 * End Tag: 1 means the end tag can be ommited
354 * 2 means it's forbidden (empty elements)
355 * Depr: this element is deprecated
356 * DTD: 1 means that this element is valid only in the Loose DTD
357 * 2 means that this element is valid only in the Frameset DTD
358 *
359 * Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
360 */
361htmlElemDesc html40ElementTable[] = {
362{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
363{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
364{ "acronym", 0, 0, 0, 0, 0, 0, "" },
365{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
366{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
367{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
368{ "b", 0, 0, 0, 0, 0, 0, "bold text style" },
369{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
370{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
371{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
372{ "big", 0, 0, 0, 0, 0, 0, "large text style" },
373{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
374{ "body", 1, 1, 0, 0, 0, 0, "document body " },
375{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
376{ "button", 0, 0, 0, 0, 0, 0, "push button " },
377{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
378{ "center", 0, 0, 0, 0, 1, 1, "shorthand for div align=center " },
379{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
380{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
381{ "col", 0, 2, 2, 1, 0, 0, "table column " },
382{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
383{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
384{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
385{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
386{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
387{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
388{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
389{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
390{ "em", 0, 0, 0, 0, 0, 0, "emphasis" },
391{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
392{ "font", 0, 0, 0, 0, 1, 1, "local change to font " },
393{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
394{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
395{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
396{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
397{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
398{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
399{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
400{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
401{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
402{ "head", 1, 1, 0, 0, 0, 0, "document head " },
403{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
404{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
405{ "i", 0, 0, 0, 0, 0, 0, "italic text style" },
406{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
407{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
408{ "input", 0, 2, 2, 1, 0, 0, "form control " },
409{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
410{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
411{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
412{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
413{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
414{ "li", 0, 1, 1, 0, 0, 0, "list item " },
415{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
416{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
417{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
418{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
419{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
420{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
421{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
422{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
423{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
424{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
425{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
426{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
427{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
428{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
429{ "s", 0, 0, 0, 0, 1, 1, "strike-through text style" },
430{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
431{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
432{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
433{ "small", 0, 0, 0, 0, 0, 0, "small text style" },
434{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
435{ "strike", 0, 0, 0, 0, 1, 1, "strike-through text" },
436{ "strong", 0, 0, 0, 0, 0, 0, "strong emphasis" },
437{ "style", 0, 0, 0, 0, 0, 0, "style info " },
438{ "sub", 0, 0, 0, 0, 0, 0, "subscript" },
439{ "sup", 0, 0, 0, 0, 0, 0, "superscript " },
440{ "table", 0, 0, 0, 0, 0, 0, "&#160;" },
441{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
442{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
443{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
444{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
445{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
446{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
447{ "title", 0, 0, 0, 0, 0, 0, "document title " },
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000448{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
Owen Taylor3473f882001-02-23 17:55:21 +0000449{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
450{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
451{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
452{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
453};
454
/*
 * Start tags that imply the end of a current element.
 * The table is a sequence of NULL terminated groups, itself terminated by
 * an extra NULL: any tag of a group implies the end of the current element
 * if the type of that element is in the same group.
 *
 * According to the HTML DTD, HR should be added to the second group, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
const char *htmlEquEnd[] = {
    /* list-like items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of table */
    NULL
};
471
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL terminated groups, itself terminated by
 * an extra NULL. The first name of a group is the tag being opened, the
 * following names are the elements that this start tag closes.
 */
const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head",
        NULL,
    "head",
        "p",
        NULL,
    "title",
        "p",
        NULL,
    "body",
        "head", "style", "link", "title", "p",
        NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li",
        NULL,
    "hr",
        "p", "head",
        NULL,
    "h1",
        "p", "head",
        NULL,
    "h2",
        "p", "head",
        NULL,
    "h3",
        "p", "head",
        NULL,
    "h4",
        "p", "head",
        NULL,
    "h5",
        "p", "head",
        NULL,
    "h6",
        "p", "head",
        NULL,
    "dir",
        "p", "head",
        NULL,
    "address",
        "p", "head", "ul",
        NULL,
    "pre",
        "p", "head", "ul",
        NULL,
    "listing",
        "p", "head",
        NULL,
    "xmp",
        "p", "head",
        NULL,
    "blockquote",
        "p", "head",
        NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head",
        NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd",
        NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt",
        NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp",
        NULL,
    "ol",
        "p", "head", "ul",
        NULL,
    "menu",
        "p", "head", "ul",
        NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        NULL,
    "div",
        "p", "head",
        NULL,
    "noscript",
        "p", "head",
        NULL,
    "center",
        "font", "b", "i", "p", "head",
        NULL,
    "a",
        "a",
        NULL,
    "caption",
        "p",
        NULL,
    "colgroup",
        "caption", "colgroup", "col", "p",
        NULL,
    "col",
        "caption", "col", "p",
        NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a",
        NULL,
    "th",
        "th", "td",
        NULL,
    "td",
        "th", "td", "p",
        NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p",
        NULL,
    "thead",
        "caption", "col", "colgroup",
        NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p",
        NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p",
        NULL,
    "optgroup",
        "option",
        NULL,
    "option",
        "option",
        NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a",
        NULL,
    /* end of table */
    NULL
};
531
/*
 * NULL terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly, and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
545
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event attributes of HTML 4).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", which matched nothing */
    "onchange",
    "onselect"
};
571
572
/*
 * Lookup index over htmlStartClose: each used slot points to the first
 * name of one group of that table. Filled by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* set to 1 once htmlStartCloseIndex has been filled */
static int htmlStartCloseIndexinitialized = 0;
575
576/************************************************************************
577 * *
578 * functions to handle HTML specific data *
579 * *
580 ************************************************************************/
581
582/**
583 * htmlInitAutoClose:
584 *
585 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
586 * This is not reentrant. Call xmlInitParser() once before processing in
587 * case of use in multithreaded programs.
588 */
589void
590htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000591 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000592
593 if (htmlStartCloseIndexinitialized) return;
594
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000595 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
596 indx = 0;
597 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
598 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000599 while (htmlStartClose[i] != NULL) i++;
600 i++;
601 }
602 htmlStartCloseIndexinitialized = 1;
603}
604
605/**
606 * htmlTagLookup:
607 * @tag: The tag name in lowercase
608 *
609 * Lookup the HTML tag in the ElementTable
610 *
611 * Returns the related htmlElemDescPtr or NULL if not found.
612 */
613htmlElemDescPtr
614htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000615 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000616
617 for (i = 0; i < (sizeof(html40ElementTable) /
618 sizeof(html40ElementTable[0]));i++) {
619 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
620 return(&html40ElementTable[i]);
621 }
622 return(NULL);
623}
624
625/**
626 * htmlCheckAutoClose:
627 * @newtag: The new tag name
628 * @oldtag: The old tag name
629 *
630 * Checks wether the new tag is one of the registered valid tags for closing old.
631 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
632 *
633 * Returns 0 if no, 1 if yes.
634 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000635static int
Owen Taylor3473f882001-02-23 17:55:21 +0000636htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000637 int i, indx;
638 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000639
640 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
641
642 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000643 for (indx = 0; indx < 100;indx++) {
644 closed = htmlStartCloseIndex[indx];
645 if (closed == NULL) return(0);
646 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000647 }
648
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000649 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000650 i++;
651 while (htmlStartClose[i] != NULL) {
652 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
653 return(1);
654 }
655 i++;
656 }
657 return(0);
658}
659
660/**
661 * htmlAutoCloseOnClose:
662 * @ctxt: an HTML parser context
663 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000664 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000665 *
666 * The HTmL DtD allows an ending tag to implicitely close other tags.
667 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000668static void
Owen Taylor3473f882001-02-23 17:55:21 +0000669htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
670 htmlElemDescPtr info;
671 xmlChar *oldname;
672 int i;
673
674#ifdef DEBUG
675 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
676 for (i = 0;i < ctxt->nameNr;i++)
677 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
678#endif
679
680 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
681 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
682 }
683 if (i < 0) return;
684
685 while (!xmlStrEqual(newtag, ctxt->name)) {
686 info = htmlTagLookup(ctxt->name);
687 if ((info == NULL) || (info->endTag == 1)) {
688#ifdef DEBUG
689 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
690#endif
691 } else {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000692 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000693 }
694 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
695 ctxt->sax->endElement(ctxt->userData, ctxt->name);
696 oldname = htmlnamePop(ctxt);
697 if (oldname != NULL) {
698#ifdef DEBUG
699 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
700#endif
701 xmlFree(oldname);
702 }
703 }
704}
705
706/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000707 * htmlAutoCloseOnEnd:
708 * @ctxt: an HTML parser context
709 *
710 * Close all remaining tags at the end of the stream
711 */
712static void
713htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
714 xmlChar *oldname;
715 int i;
716
717 if (ctxt->nameNr == 0)
718 return;
719#ifdef DEBUG
720 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
721#endif
722
723 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
724#ifdef DEBUG
725 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
726#endif
727 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
728 ctxt->sax->endElement(ctxt->userData, ctxt->name);
729 oldname = htmlnamePop(ctxt);
730 if (oldname != NULL) {
731#ifdef DEBUG
732 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
733#endif
734 xmlFree(oldname);
735 }
736 }
737}
738
739/**
Owen Taylor3473f882001-02-23 17:55:21 +0000740 * htmlAutoClose:
741 * @ctxt: an HTML parser context
742 * @newtag: The new tag name or NULL
743 *
744 * The HTmL DtD allows a tag to implicitely close other tags.
745 * The list is kept in htmlStartClose array. This function is
746 * called when a new tag has been detected and generates the
747 * appropriates closes if possible/needed.
748 * If newtag is NULL this mean we are at the end of the resource
749 * and we should check
750 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000751static void
Owen Taylor3473f882001-02-23 17:55:21 +0000752htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
753 xmlChar *oldname;
754 while ((newtag != NULL) && (ctxt->name != NULL) &&
755 (htmlCheckAutoClose(newtag, ctxt->name))) {
756#ifdef DEBUG
757 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
758#endif
759 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
760 ctxt->sax->endElement(ctxt->userData, ctxt->name);
761 oldname = htmlnamePop(ctxt);
762 if (oldname != NULL) {
763#ifdef DEBUG
764 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
765#endif
766 xmlFree(oldname);
767 }
768 }
769 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000770 htmlAutoCloseOnEnd(ctxt);
771 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000772 }
773 while ((newtag == NULL) && (ctxt->name != NULL) &&
774 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
775 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
776 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
777#ifdef DEBUG
778 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
779#endif
780 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
781 ctxt->sax->endElement(ctxt->userData, ctxt->name);
782 oldname = htmlnamePop(ctxt);
783 if (oldname != NULL) {
784#ifdef DEBUG
785 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
786#endif
787 xmlFree(oldname);
788 }
789 }
790
791}
792
793/**
794 * htmlAutoCloseTag:
795 * @doc: the HTML document
796 * @name: The tag name
797 * @elem: the HTML element
798 *
799 * The HTmL DtD allows a tag to implicitely close other tags.
800 * The list is kept in htmlStartClose array. This function checks
801 * if the element or one of it's children would autoclose the
802 * given tag.
803 *
804 * Returns 1 if autoclose, 0 otherwise
805 */
806int
807htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
808 htmlNodePtr child;
809
810 if (elem == NULL) return(1);
811 if (xmlStrEqual(name, elem->name)) return(0);
812 if (htmlCheckAutoClose(elem->name, name)) return(1);
813 child = elem->children;
814 while (child != NULL) {
815 if (htmlAutoCloseTag(doc, name, child)) return(1);
816 child = child->next;
817 }
818 return(0);
819}
820
821/**
822 * htmlIsAutoClosed:
823 * @doc: the HTML document
824 * @elem: the HTML element
825 *
826 * The HTmL DtD allows a tag to implicitely close other tags.
827 * The list is kept in htmlStartClose array. This function checks
828 * if a tag is autoclosed by one of it's child
829 *
830 * Returns 1 if autoclosed, 0 otherwise
831 */
832int
833htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
834 htmlNodePtr child;
835
836 if (elem == NULL) return(1);
837 child = elem->children;
838 while (child != NULL) {
839 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
840 child = child->next;
841 }
842 return(0);
843}
844
845/**
846 * htmlCheckImplied:
847 * @ctxt: an HTML parser context
848 * @newtag: The new tag name
849 *
850 * The HTML DtD allows a tag to exists only implicitely
851 * called when a new tag has been detected and generates the
852 * appropriates implicit tags if missing
853 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000854static void
Owen Taylor3473f882001-02-23 17:55:21 +0000855htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
856 if (!htmlOmittedDefaultValue)
857 return;
858 if (xmlStrEqual(newtag, BAD_CAST"html"))
859 return;
860 if (ctxt->nameNr <= 0) {
861#ifdef DEBUG
862 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
863#endif
864 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
865 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
866 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
867 }
868 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
869 return;
870 if ((ctxt->nameNr <= 1) &&
871 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
872 (xmlStrEqual(newtag, BAD_CAST"style")) ||
873 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
874 (xmlStrEqual(newtag, BAD_CAST"link")) ||
875 (xmlStrEqual(newtag, BAD_CAST"title")) ||
876 (xmlStrEqual(newtag, BAD_CAST"base")))) {
877 /*
878 * dropped OBJECT ... i you put it first BODY will be
879 * assumed !
880 */
881#ifdef DEBUG
882 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
883#endif
884 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
885 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
886 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
887 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
888 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
889 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
890 int i;
891 for (i = 0;i < ctxt->nameNr;i++) {
892 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
893 return;
894 }
895 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
896 return;
897 }
898 }
899
900#ifdef DEBUG
901 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
902#endif
903 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
904 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
905 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
906 }
907}
908
909/**
910 * htmlCheckParagraph
911 * @ctxt: an HTML parser context
912 *
913 * Check whether a p element need to be implied before inserting
914 * characters in the current element.
915 *
916 * Returns 1 if a paragraph has been inserted, 0 if not and -1
917 * in case of error.
918 */
919
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000920static int
Owen Taylor3473f882001-02-23 17:55:21 +0000921htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
922 const xmlChar *tag;
923 int i;
924
925 if (ctxt == NULL)
926 return(-1);
927 tag = ctxt->name;
928 if (tag == NULL) {
929 htmlAutoClose(ctxt, BAD_CAST"p");
930 htmlCheckImplied(ctxt, BAD_CAST"p");
931 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
932 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
933 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
934 return(1);
935 }
936 if (!htmlOmittedDefaultValue)
937 return(0);
938 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
939 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
940#ifdef DEBUG
941 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
942#endif
943 htmlAutoClose(ctxt, BAD_CAST"p");
944 htmlCheckImplied(ctxt, BAD_CAST"p");
945 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
946 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
947 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
948 return(1);
949 }
950 }
951 return(0);
952}
953
954/**
955 * htmlIsScriptAttribute:
956 * @name: an attribute name
957 *
958 * Check if an attribute is of content type Script
959 *
960 * Returns 1 is the attribute is a script 0 otherwise
961 */
962int
963htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000964 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000965
966 if (name == NULL)
967 return(0);
968 /*
969 * all script attributes start with 'on'
970 */
971 if ((name[0] != 'o') || (name[1] != 'n'))
972 return(0);
973 for (i = 0;
974 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
975 i++) {
976 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
977 return(1);
978 }
979 return(0);
980}
981
/************************************************************************
 *									*
 *		The list of HTML predefined entities			*
 *									*
 ************************************************************************/
987
988
989htmlEntityDesc html40EntitiesTable[] = {
990/*
991 * the 4 absolute ones, plus apostrophe.
992 */
993{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
994{ 38, "amp", "ampersand, U+0026 ISOnum" },
995{ 39, "apos", "single quote" },
996{ 60, "lt", "less-than sign, U+003C ISOnum" },
997{ 62, "gt", "greater-than sign, U+003E ISOnum" },
998
999/*
1000 * A bunch still in the 128-255 range
1001 * Replacing them depend really on the charset used.
1002 */
1003{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1004{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1005{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1006{ 163, "pound","pound sign, U+00A3 ISOnum" },
1007{ 164, "curren","currency sign, U+00A4 ISOnum" },
1008{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1009{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1010{ 167, "sect", "section sign, U+00A7 ISOnum" },
1011{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1012{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1013{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1014{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1015{ 172, "not", "not sign, U+00AC ISOnum" },
1016{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1017{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1018{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1019{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1020{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1021{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1022{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1023{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1024{ 181, "micro","micro sign, U+00B5 ISOnum" },
1025{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1026{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1027{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1028{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1029{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1030{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1031{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1032{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1033{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1034{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1035{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1036{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1037{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1038{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1039{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1040{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1041{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1042{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1043{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1044{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1045{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1046{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1047{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1048{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1049{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1050{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1051{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1052{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1053{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1054{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1055{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1056{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1057{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1058{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1059{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1060{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1061{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1062{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1063{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1064{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1065{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1066{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1067{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1068{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1069{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1070{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1071{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1072{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1073{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1074{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1075{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1076{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1077{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1078{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1079{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1080{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1081{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1082{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1083{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1084{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1085{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1086{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1087{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1088{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1089{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1090{ 247, "divide","division sign, U+00F7 ISOnum" },
1091{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1092{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1093{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1094{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1095{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1096{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1097{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1098{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1099
1100{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1101{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1102{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1103{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1104{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1105
1106/*
1107 * Anything below should really be kept as entities references
1108 */
1109{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1110
1111{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1112{ 732, "tilde","small tilde, U+02DC ISOdia" },
1113
1114{ 913, "Alpha","greek capital letter alpha, U+0391" },
1115{ 914, "Beta", "greek capital letter beta, U+0392" },
1116{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1117{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1118{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1119{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1120{ 919, "Eta", "greek capital letter eta, U+0397" },
1121{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1122{ 921, "Iota", "greek capital letter iota, U+0399" },
1123{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001124{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001125{ 924, "Mu", "greek capital letter mu, U+039C" },
1126{ 925, "Nu", "greek capital letter nu, U+039D" },
1127{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1128{ 927, "Omicron","greek capital letter omicron, U+039F" },
1129{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1130{ 929, "Rho", "greek capital letter rho, U+03A1" },
1131{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1132{ 932, "Tau", "greek capital letter tau, U+03A4" },
1133{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1134{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1135{ 935, "Chi", "greek capital letter chi, U+03A7" },
1136{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1137{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1138
1139{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1140{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1141{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1142{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1143{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1144{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1145{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1146{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1147{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1148{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1149{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1150{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1151{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1152{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1153{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1154{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1155{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1156{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1157{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1158{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1159{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1160{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1161{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1162{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1163{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1164{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1165{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1166{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1167
1168{ 8194, "ensp", "en space, U+2002 ISOpub" },
1169{ 8195, "emsp", "em space, U+2003 ISOpub" },
1170{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1171{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1172{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1173{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1174{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1175{ 8211, "ndash","en dash, U+2013 ISOpub" },
1176{ 8212, "mdash","em dash, U+2014 ISOpub" },
1177{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1178{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1179{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1180{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1181{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1182{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1183{ 8224, "dagger","dagger, U+2020 ISOpub" },
1184{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1185
1186{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1187{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1188
1189{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1190
1191{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1192{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1193
1194{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1195{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1196
1197{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1198{ 8260, "frasl","fraction slash, U+2044 NEW" },
1199
1200{ 8364, "euro", "euro sign, U+20AC NEW" },
1201
1202{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1203{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1204{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1205{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1206{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1207{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1208{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1209{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1210{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1211{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1212{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1213{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1214{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1215{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1216{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1217{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1218
1219{ 8704, "forall","for all, U+2200 ISOtech" },
1220{ 8706, "part", "partial differential, U+2202 ISOtech" },
1221{ 8707, "exist","there exists, U+2203 ISOtech" },
1222{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1223{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1224{ 8712, "isin", "element of, U+2208 ISOtech" },
1225{ 8713, "notin","not an element of, U+2209 ISOtech" },
1226{ 8715, "ni", "contains as member, U+220B ISOtech" },
1227{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1228{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1229{ 8722, "minus","minus sign, U+2212 ISOtech" },
1230{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1231{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1232{ 8733, "prop", "proportional to, U+221D ISOtech" },
1233{ 8734, "infin","infinity, U+221E ISOtech" },
1234{ 8736, "ang", "angle, U+2220 ISOamso" },
1235{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1236{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1237{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1238{ 8746, "cup", "union = cup, U+222A ISOtech" },
1239{ 8747, "int", "integral, U+222B ISOtech" },
1240{ 8756, "there4","therefore, U+2234 ISOtech" },
1241{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1242{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1243{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1244{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1245{ 8801, "equiv","identical to, U+2261 ISOtech" },
1246{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1247{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1248{ 8834, "sub", "subset of, U+2282 ISOtech" },
1249{ 8835, "sup", "superset of, U+2283 ISOtech" },
1250{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1251{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1252{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1253{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1254{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1255{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1256{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1257{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1258{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1259{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1260{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1261{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1262{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1263{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1264
1265{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1266{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1267{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1268{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1269
1270};
1271
/************************************************************************
 *									*
 *		Convenience functions to handle entities		*
 *									*
 ************************************************************************/
1277
/*
 * growBuffer:
 * @buffer:  name of an xmlChar * variable; its capacity must be held in
 *           a companion variable named <buffer>_size
 *
 * Macro used to grow the current buffer: the capacity is doubled and the
 * block reallocated with xmlRealloc().
 *
 * The result of xmlRealloc() goes through a temporary so that the
 * original block is not lost on failure: in that case it is released
 * with xmlFree() and the macro makes the *enclosing function*
 * return NULL, so it can only be used in functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                buffer##_size * sizeof(xmlChar));	\
    if (growBuffer_tmp == NULL) {					\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1289
1290/**
1291 * htmlEntityLookup:
1292 * @name: the entity name
1293 *
1294 * Lookup the given entity in EntitiesTable
1295 *
1296 * TODO: the linear scan is really ugly, an hash table is really needed.
1297 *
1298 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1299 */
1300htmlEntityDescPtr
1301htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001302 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001303
1304 for (i = 0;i < (sizeof(html40EntitiesTable)/
1305 sizeof(html40EntitiesTable[0]));i++) {
1306 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1307#ifdef DEBUG
1308 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1309#endif
1310 return(&html40EntitiesTable[i]);
1311 }
1312 }
1313 return(NULL);
1314}
1315
1316/**
1317 * htmlEntityValueLookup:
1318 * @value: the entity's unicode value
1319 *
1320 * Lookup the given entity in EntitiesTable
1321 *
1322 * TODO: the linear scan is really ugly, an hash table is really needed.
1323 *
1324 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1325 */
1326htmlEntityDescPtr
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001327htmlEntityValueLookup(unsigned int value) {
1328 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001329#ifdef DEBUG
1330 int lv = 0;
1331#endif
1332
1333 for (i = 0;i < (sizeof(html40EntitiesTable)/
1334 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001335 if (html40EntitiesTable[i].value >= value) {
1336 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001337 break;
1338#ifdef DEBUG
1339 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1340#endif
1341 return(&html40EntitiesTable[i]);
1342 }
1343#ifdef DEBUG
1344 if (lv > html40EntitiesTable[i].value) {
1345 xmlGenericError(xmlGenericErrorContext,
1346 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1347 lv, html40EntitiesTable[i].value);
1348 }
1349 lv = html40EntitiesTable[i].value;
1350#endif
1351 }
1352 return(NULL);
1353}
1354
1355/**
1356 * UTF8ToHtml:
1357 * @out: a pointer to an array of bytes to store the result
1358 * @outlen: the length of @out
1359 * @in: a pointer to an array of UTF-8 chars
1360 * @inlen: the length of @in
1361 *
1362 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1363 * plus HTML entities block of chars out.
1364 *
1365 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1366 * The value of @inlen after return is the number of octets consumed
1367 * as the return value is positive, else unpredictiable.
1368 * The value of @outlen after return is the number of octets consumed.
1369 */
1370int
1371UTF8ToHtml(unsigned char* out, int *outlen,
1372 const unsigned char* in, int *inlen) {
1373 const unsigned char* processed = in;
1374 const unsigned char* outend;
1375 const unsigned char* outstart = out;
1376 const unsigned char* instart = in;
1377 const unsigned char* inend;
1378 unsigned int c, d;
1379 int trailing;
1380
1381 if (in == NULL) {
1382 /*
1383 * initialization nothing to do
1384 */
1385 *outlen = 0;
1386 *inlen = 0;
1387 return(0);
1388 }
1389 inend = in + (*inlen);
1390 outend = out + (*outlen);
1391 while (in < inend) {
1392 d = *in++;
1393 if (d < 0x80) { c= d; trailing= 0; }
1394 else if (d < 0xC0) {
1395 /* trailing byte in leading position */
1396 *outlen = out - outstart;
1397 *inlen = processed - instart;
1398 return(-2);
1399 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1400 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1401 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1402 else {
1403 /* no chance for this in Ascii */
1404 *outlen = out - outstart;
1405 *inlen = processed - instart;
1406 return(-2);
1407 }
1408
1409 if (inend - in < trailing) {
1410 break;
1411 }
1412
1413 for ( ; trailing; trailing--) {
1414 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1415 break;
1416 c <<= 6;
1417 c |= d & 0x3F;
1418 }
1419
1420 /* assertion: c is a single UTF-4 value */
1421 if (c < 0x80) {
1422 if (out + 1 >= outend)
1423 break;
1424 *out++ = c;
1425 } else {
1426 int len;
1427 htmlEntityDescPtr ent;
1428
1429 /*
1430 * Try to lookup a predefined HTML entity for it
1431 */
1432
1433 ent = htmlEntityValueLookup(c);
1434 if (ent == NULL) {
1435 /* no chance for this in Ascii */
1436 *outlen = out - outstart;
1437 *inlen = processed - instart;
1438 return(-2);
1439 }
1440 len = strlen(ent->name);
1441 if (out + 2 + len >= outend)
1442 break;
1443 *out++ = '&';
1444 memcpy(out, ent->name, len);
1445 out += len;
1446 *out++ = ';';
1447 }
1448 processed = in;
1449 }
1450 *outlen = out - outstart;
1451 *inlen = processed - instart;
1452 return(0);
1453}
1454
1455/**
1456 * htmlEncodeEntities:
1457 * @out: a pointer to an array of bytes to store the result
1458 * @outlen: the length of @out
1459 * @in: a pointer to an array of UTF-8 chars
1460 * @inlen: the length of @in
1461 * @quoteChar: the quote character to escape (' or ") or zero.
1462 *
1463 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1464 * plus HTML entities block of chars out.
1465 *
1466 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1467 * The value of @inlen after return is the number of octets consumed
1468 * as the return value is positive, else unpredictiable.
1469 * The value of @outlen after return is the number of octets consumed.
1470 */
1471int
1472htmlEncodeEntities(unsigned char* out, int *outlen,
1473 const unsigned char* in, int *inlen, int quoteChar) {
1474 const unsigned char* processed = in;
1475 const unsigned char* outend = out + (*outlen);
1476 const unsigned char* outstart = out;
1477 const unsigned char* instart = in;
1478 const unsigned char* inend = in + (*inlen);
1479 unsigned int c, d;
1480 int trailing;
1481
1482 while (in < inend) {
1483 d = *in++;
1484 if (d < 0x80) { c= d; trailing= 0; }
1485 else if (d < 0xC0) {
1486 /* trailing byte in leading position */
1487 *outlen = out - outstart;
1488 *inlen = processed - instart;
1489 return(-2);
1490 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1491 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1492 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1493 else {
1494 /* no chance for this in Ascii */
1495 *outlen = out - outstart;
1496 *inlen = processed - instart;
1497 return(-2);
1498 }
1499
1500 if (inend - in < trailing)
1501 break;
1502
1503 while (trailing--) {
1504 if (((d= *in++) & 0xC0) != 0x80) {
1505 *outlen = out - outstart;
1506 *inlen = processed - instart;
1507 return(-2);
1508 }
1509 c <<= 6;
1510 c |= d & 0x3F;
1511 }
1512
1513 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001514 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1515 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001516 if (out >= outend)
1517 break;
1518 *out++ = c;
1519 } else {
1520 htmlEntityDescPtr ent;
1521 const char *cp;
1522 char nbuf[16];
1523 int len;
1524
1525 /*
1526 * Try to lookup a predefined HTML entity for it
1527 */
1528 ent = htmlEntityValueLookup(c);
1529 if (ent == NULL) {
1530 sprintf(nbuf, "#%u", c);
1531 cp = nbuf;
1532 }
1533 else
1534 cp = ent->name;
1535 len = strlen(cp);
1536 if (out + 2 + len > outend)
1537 break;
1538 *out++ = '&';
1539 memcpy(out, cp, len);
1540 out += len;
1541 *out++ = ';';
1542 }
1543 processed = in;
1544 }
1545 *outlen = out - outstart;
1546 *inlen = processed - instart;
1547 return(0);
1548}
1549
1550/**
1551 * htmlDecodeEntities:
1552 * @ctxt: the parser context
1553 * @len: the len to decode (in bytes !), -1 for no size limit
1554 * @end: an end marker xmlChar, 0 if none
1555 * @end2: an end marker xmlChar, 0 if none
1556 * @end3: an end marker xmlChar, 0 if none
1557 *
1558 * Subtitute the HTML entities by their value
1559 *
1560 * DEPRECATED !!!!
1561 *
1562 * Returns A newly allocated string with the substitution done. The caller
1563 * must deallocate it !
1564 */
1565xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001566htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1567 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001568 static int deprecated = 0;
1569 if (!deprecated) {
1570 xmlGenericError(xmlGenericErrorContext,
1571 "htmlDecodeEntities() deprecated function reached\n");
1572 deprecated = 1;
1573 }
1574 return(NULL);
1575#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001576 xmlChar *name = NULL;
1577 xmlChar *buffer = NULL;
1578 unsigned int buffer_size = 0;
1579 unsigned int nbchars = 0;
1580 htmlEntityDescPtr ent;
1581 unsigned int max = (unsigned int) len;
1582 int c,l;
1583
1584 if (ctxt->depth > 40) {
1585 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1586 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1587 ctxt->sax->error(ctxt->userData,
1588 "Detected entity reference loop\n");
1589 ctxt->wellFormed = 0;
1590 ctxt->disableSAX = 1;
1591 return(NULL);
1592 }
1593
1594 /*
1595 * allocate a translation buffer.
1596 */
1597 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1598 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1599 if (buffer == NULL) {
1600 perror("xmlDecodeEntities: malloc failed");
1601 return(NULL);
1602 }
1603
1604 /*
1605 * Ok loop until we reach one of the ending char or a size limit.
1606 */
1607 c = CUR_CHAR(l);
1608 while ((nbchars < max) && (c != end) &&
1609 (c != end2) && (c != end3)) {
1610
1611 if (c == 0) break;
1612 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1613 int val = htmlParseCharRef(ctxt);
1614 COPY_BUF(0,buffer,nbchars,val);
1615 NEXTL(l);
1616 } else if ((c == '&') && (ctxt->token != '&')) {
1617 ent = htmlParseEntityRef(ctxt, &name);
1618 if (name != NULL) {
1619 if (ent != NULL) {
1620 int val = ent->value;
1621 COPY_BUF(0,buffer,nbchars,val);
1622 NEXTL(l);
1623 } else {
1624 const xmlChar *cur = name;
1625
1626 buffer[nbchars++] = '&';
1627 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1628 growBuffer(buffer);
1629 }
1630 while (*cur != 0) {
1631 buffer[nbchars++] = *cur++;
1632 }
1633 buffer[nbchars++] = ';';
1634 }
1635 }
1636 } else {
1637 COPY_BUF(l,buffer,nbchars,c);
1638 NEXTL(l);
1639 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1640 growBuffer(buffer);
1641 }
1642 }
1643 c = CUR_CHAR(l);
1644 }
1645 buffer[nbchars++] = 0;
1646 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001647#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001648}
1649
/************************************************************************
 *									*
 *		Convenience functions to handle streams			*
 *									*
 ************************************************************************/
1655
1656/**
Owen Taylor3473f882001-02-23 17:55:21 +00001657 * htmlNewInputStream:
1658 * @ctxt: an HTML parser context
1659 *
1660 * Create a new input stream structure
1661 * Returns the new input stream or NULL
1662 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001663static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001664htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1665 htmlParserInputPtr input;
1666
1667 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1668 if (input == NULL) {
1669 ctxt->errNo = XML_ERR_NO_MEMORY;
1670 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1671 ctxt->sax->error(ctxt->userData,
1672 "malloc: couldn't allocate a new input stream\n");
1673 return(NULL);
1674 }
1675 memset(input, 0, sizeof(htmlParserInput));
1676 input->filename = NULL;
1677 input->directory = NULL;
1678 input->base = NULL;
1679 input->cur = NULL;
1680 input->buf = NULL;
1681 input->line = 1;
1682 input->col = 1;
1683 input->buf = NULL;
1684 input->free = NULL;
1685 input->version = NULL;
1686 input->consumed = 0;
1687 input->length = 0;
1688 return(input);
1689}
1690
1691
1692/************************************************************************
1693 * *
1694 * Commodity functions, cleanup needed ? *
1695 * *
1696 ************************************************************************/
1697
1698/**
1699 * areBlanks:
1700 * @ctxt: an HTML parser context
1701 * @str: a xmlChar *
1702 * @len: the size of @str
1703 *
1704 * Is this a sequence of blank chars that one can ignore ?
1705 *
1706 * Returns 1 if ignorable 0 otherwise.
1707 */
1708
1709static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1710 int i;
1711 xmlNodePtr lastChild;
1712
1713 for (i = 0;i < len;i++)
1714 if (!(IS_BLANK(str[i]))) return(0);
1715
1716 if (CUR == 0) return(1);
1717 if (CUR != '<') return(0);
1718 if (ctxt->name == NULL)
1719 return(1);
1720 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1721 return(1);
1722 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1723 return(1);
1724 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1725 return(1);
1726 if (ctxt->node == NULL) return(0);
1727 lastChild = xmlGetLastChild(ctxt->node);
1728 if (lastChild == NULL) {
1729 if (ctxt->node->content != NULL) return(0);
1730 } else if (xmlNodeIsText(lastChild)) {
1731 return(0);
1732 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1733 return(0);
1734 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1735 return(0);
1736 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1737 return(0);
1738 }
1739 return(1);
1740}
1741
1742/**
Owen Taylor3473f882001-02-23 17:55:21 +00001743 * htmlNewDocNoDtD:
1744 * @URI: URI for the dtd, or NULL
1745 * @ExternalID: the external ID of the DTD, or NULL
1746 *
1747 * Returns a new document, do not intialize the DTD if not provided
1748 */
1749htmlDocPtr
1750htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1751 xmlDocPtr cur;
1752
1753 /*
1754 * Allocate a new document and fill the fields.
1755 */
1756 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1757 if (cur == NULL) {
1758 xmlGenericError(xmlGenericErrorContext,
1759 "xmlNewDoc : malloc failed\n");
1760 return(NULL);
1761 }
1762 memset(cur, 0, sizeof(xmlDoc));
1763
1764 cur->type = XML_HTML_DOCUMENT_NODE;
1765 cur->version = NULL;
1766 cur->intSubset = NULL;
1767 if ((ExternalID != NULL) ||
1768 (URI != NULL))
1769 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1770 cur->doc = cur;
1771 cur->name = NULL;
1772 cur->children = NULL;
1773 cur->extSubset = NULL;
1774 cur->oldNs = NULL;
1775 cur->encoding = NULL;
1776 cur->standalone = 1;
1777 cur->compression = 0;
1778 cur->ids = NULL;
1779 cur->refs = NULL;
1780#ifndef XML_WITHOUT_CORBA
1781 cur->_private = NULL;
1782#endif
1783 return(cur);
1784}
1785
1786/**
1787 * htmlNewDoc:
1788 * @URI: URI for the dtd, or NULL
1789 * @ExternalID: the external ID of the DTD, or NULL
1790 *
1791 * Returns a new document
1792 */
1793htmlDocPtr
1794htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1795 if ((URI == NULL) && (ExternalID == NULL))
1796 return(htmlNewDocNoDtD(
1797 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1798 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1799
1800 return(htmlNewDocNoDtD(URI, ExternalID));
1801}
1802
1803
1804/************************************************************************
1805 * *
1806 * The parser itself *
1807 * Relates to http://www.w3.org/TR/html40 *
1808 * *
1809 ************************************************************************/
1810
1811/************************************************************************
1812 * *
1813 * The parser itself *
1814 * *
1815 ************************************************************************/
1816
1817/**
1818 * htmlParseHTMLName:
1819 * @ctxt: an HTML parser context
1820 *
1821 * parse an HTML tag or attribute name, note that we convert it to lowercase
1822 * since HTML names are not case-sensitive.
1823 *
1824 * Returns the Tag Name parsed or NULL
1825 */
1826
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001827static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001828htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1829 xmlChar *ret = NULL;
1830 int i = 0;
1831 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1832
1833 if (!IS_LETTER(CUR) && (CUR != '_') &&
1834 (CUR != ':')) return(NULL);
1835
1836 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1837 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1838 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1839 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1840 else loc[i] = CUR;
1841 i++;
1842
1843 NEXT;
1844 }
1845
1846 ret = xmlStrndup(loc, i);
1847
1848 return(ret);
1849}
1850
1851/**
1852 * htmlParseName:
1853 * @ctxt: an HTML parser context
1854 *
1855 * parse an HTML name, this routine is case sensistive.
1856 *
1857 * Returns the Name parsed or NULL
1858 */
1859
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001860static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001861htmlParseName(htmlParserCtxtPtr ctxt) {
1862 xmlChar buf[HTML_MAX_NAMELEN];
1863 int len = 0;
1864
1865 GROW;
1866 if (!IS_LETTER(CUR) && (CUR != '_')) {
1867 return(NULL);
1868 }
1869
1870 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1871 (CUR == '.') || (CUR == '-') ||
1872 (CUR == '_') || (CUR == ':') ||
1873 (IS_COMBINING(CUR)) ||
1874 (IS_EXTENDER(CUR))) {
1875 buf[len++] = CUR;
1876 NEXT;
1877 if (len >= HTML_MAX_NAMELEN) {
1878 xmlGenericError(xmlGenericErrorContext,
1879 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1880 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1881 (CUR == '.') || (CUR == '-') ||
1882 (CUR == '_') || (CUR == ':') ||
1883 (IS_COMBINING(CUR)) ||
1884 (IS_EXTENDER(CUR)))
1885 NEXT;
1886 break;
1887 }
1888 }
1889 return(xmlStrndup(buf, len));
1890}
1891
1892/**
1893 * htmlParseHTMLAttribute:
1894 * @ctxt: an HTML parser context
1895 * @stop: a char stop value
1896 *
1897 * parse an HTML attribute value till the stop (quote), if
1898 * stop is 0 then it stops at the first space
1899 *
1900 * Returns the attribute parsed or NULL
1901 */
1902
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001903static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001904htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1905 xmlChar *buffer = NULL;
1906 int buffer_size = 0;
1907 xmlChar *out = NULL;
1908 xmlChar *name = NULL;
1909
1910 xmlChar *cur = NULL;
1911 htmlEntityDescPtr ent;
1912
1913 /*
1914 * allocate a translation buffer.
1915 */
1916 buffer_size = HTML_PARSER_BUFFER_SIZE;
1917 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1918 if (buffer == NULL) {
1919 perror("htmlParseHTMLAttribute: malloc failed");
1920 return(NULL);
1921 }
1922 out = buffer;
1923
1924 /*
1925 * Ok loop until we reach one of the ending chars
1926 */
1927 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1928 if ((stop == 0) && (IS_BLANK(CUR))) break;
1929 if (CUR == '&') {
1930 if (NXT(1) == '#') {
1931 unsigned int c;
1932 int bits;
1933
1934 c = htmlParseCharRef(ctxt);
1935 if (c < 0x80)
1936 { *out++ = c; bits= -6; }
1937 else if (c < 0x800)
1938 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1939 else if (c < 0x10000)
1940 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1941 else
1942 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1943
1944 for ( ; bits >= 0; bits-= 6) {
1945 *out++ = ((c >> bits) & 0x3F) | 0x80;
1946 }
1947 } else {
1948 ent = htmlParseEntityRef(ctxt, &name);
1949 if (name == NULL) {
1950 *out++ = '&';
1951 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001952 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001953
1954 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001955 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001956 }
1957 } else if (ent == NULL) {
1958 *out++ = '&';
1959 cur = name;
1960 while (*cur != 0) {
1961 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001962 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001963
1964 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001965 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001966 }
1967 *out++ = *cur++;
1968 }
1969 xmlFree(name);
1970 } else {
1971 unsigned int c;
1972 int bits;
1973
1974 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001975 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001976
1977 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001978 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001979 }
1980 c = (xmlChar)ent->value;
1981 if (c < 0x80)
1982 { *out++ = c; bits= -6; }
1983 else if (c < 0x800)
1984 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1985 else if (c < 0x10000)
1986 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1987 else
1988 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1989
1990 for ( ; bits >= 0; bits-= 6) {
1991 *out++ = ((c >> bits) & 0x3F) | 0x80;
1992 }
1993 xmlFree(name);
1994 }
1995 }
1996 } else {
1997 unsigned int c;
1998 int bits, l;
1999
2000 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002001 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002002
2003 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002004 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002005 }
2006 c = CUR_CHAR(l);
2007 if (c < 0x80)
2008 { *out++ = c; bits= -6; }
2009 else if (c < 0x800)
2010 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2011 else if (c < 0x10000)
2012 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2013 else
2014 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2015
2016 for ( ; bits >= 0; bits-= 6) {
2017 *out++ = ((c >> bits) & 0x3F) | 0x80;
2018 }
2019 NEXT;
2020 }
2021 }
2022 *out++ = 0;
2023 return(buffer);
2024}
2025
2026/**
Owen Taylor3473f882001-02-23 17:55:21 +00002027 * htmlParseEntityRef:
2028 * @ctxt: an HTML parser context
2029 * @str: location to store the entity name
2030 *
2031 * parse an HTML ENTITY references
2032 *
2033 * [68] EntityRef ::= '&' Name ';'
2034 *
2035 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2036 * if non-NULL *str will have to be freed by the caller.
2037 */
2038htmlEntityDescPtr
2039htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2040 xmlChar *name;
2041 htmlEntityDescPtr ent = NULL;
2042 *str = NULL;
2043
2044 if (CUR == '&') {
2045 NEXT;
2046 name = htmlParseName(ctxt);
2047 if (name == NULL) {
2048 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2049 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2050 ctxt->wellFormed = 0;
2051 } else {
2052 GROW;
2053 if (CUR == ';') {
2054 *str = name;
2055
2056 /*
2057 * Lookup the entity in the table.
2058 */
2059 ent = htmlEntityLookup(name);
2060 if (ent != NULL) /* OK that's ugly !!! */
2061 NEXT;
2062 } else {
2063 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2064 ctxt->sax->error(ctxt->userData,
2065 "htmlParseEntityRef: expecting ';'\n");
2066 *str = name;
2067 }
2068 }
2069 }
2070 return(ent);
2071}
2072
2073/**
2074 * htmlParseAttValue:
2075 * @ctxt: an HTML parser context
2076 *
2077 * parse a value for an attribute
2078 * Note: the parser won't do substitution of entities here, this
2079 * will be handled later in xmlStringGetNodeList, unless it was
2080 * asked for ctxt->replaceEntities != 0
2081 *
2082 * Returns the AttValue parsed or NULL.
2083 */
2084
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002085static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002086htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2087 xmlChar *ret = NULL;
2088
2089 if (CUR == '"') {
2090 NEXT;
2091 ret = htmlParseHTMLAttribute(ctxt, '"');
2092 if (CUR != '"') {
2093 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2094 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2095 ctxt->wellFormed = 0;
2096 } else
2097 NEXT;
2098 } else if (CUR == '\'') {
2099 NEXT;
2100 ret = htmlParseHTMLAttribute(ctxt, '\'');
2101 if (CUR != '\'') {
2102 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2103 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2104 ctxt->wellFormed = 0;
2105 } else
2106 NEXT;
2107 } else {
2108 /*
2109 * That's an HTMLism, the attribute value may not be quoted
2110 */
2111 ret = htmlParseHTMLAttribute(ctxt, 0);
2112 if (ret == NULL) {
2113 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2114 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2115 ctxt->wellFormed = 0;
2116 }
2117 }
2118 return(ret);
2119}
2120
2121/**
2122 * htmlParseSystemLiteral:
2123 * @ctxt: an HTML parser context
2124 *
2125 * parse an HTML Literal
2126 *
2127 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2128 *
2129 * Returns the SystemLiteral parsed or NULL
2130 */
2131
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002132static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002133htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2134 const xmlChar *q;
2135 xmlChar *ret = NULL;
2136
2137 if (CUR == '"') {
2138 NEXT;
2139 q = CUR_PTR;
2140 while ((IS_CHAR(CUR)) && (CUR != '"'))
2141 NEXT;
2142 if (!IS_CHAR(CUR)) {
2143 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2144 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2145 ctxt->wellFormed = 0;
2146 } else {
2147 ret = xmlStrndup(q, CUR_PTR - q);
2148 NEXT;
2149 }
2150 } else if (CUR == '\'') {
2151 NEXT;
2152 q = CUR_PTR;
2153 while ((IS_CHAR(CUR)) && (CUR != '\''))
2154 NEXT;
2155 if (!IS_CHAR(CUR)) {
2156 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2157 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2158 ctxt->wellFormed = 0;
2159 } else {
2160 ret = xmlStrndup(q, CUR_PTR - q);
2161 NEXT;
2162 }
2163 } else {
2164 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2165 ctxt->sax->error(ctxt->userData,
2166 "SystemLiteral \" or ' expected\n");
2167 ctxt->wellFormed = 0;
2168 }
2169
2170 return(ret);
2171}
2172
2173/**
2174 * htmlParsePubidLiteral:
2175 * @ctxt: an HTML parser context
2176 *
2177 * parse an HTML public literal
2178 *
2179 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2180 *
2181 * Returns the PubidLiteral parsed or NULL.
2182 */
2183
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002184static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002185htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2186 const xmlChar *q;
2187 xmlChar *ret = NULL;
2188 /*
2189 * Name ::= (Letter | '_') (NameChar)*
2190 */
2191 if (CUR == '"') {
2192 NEXT;
2193 q = CUR_PTR;
2194 while (IS_PUBIDCHAR(CUR)) NEXT;
2195 if (CUR != '"') {
2196 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2197 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2198 ctxt->wellFormed = 0;
2199 } else {
2200 ret = xmlStrndup(q, CUR_PTR - q);
2201 NEXT;
2202 }
2203 } else if (CUR == '\'') {
2204 NEXT;
2205 q = CUR_PTR;
2206 while ((IS_LETTER(CUR)) && (CUR != '\''))
2207 NEXT;
2208 if (!IS_LETTER(CUR)) {
2209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2210 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2211 ctxt->wellFormed = 0;
2212 } else {
2213 ret = xmlStrndup(q, CUR_PTR - q);
2214 NEXT;
2215 }
2216 } else {
2217 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2218 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2219 ctxt->wellFormed = 0;
2220 }
2221
2222 return(ret);
2223}
2224
2225/**
2226 * htmlParseScript:
2227 * @ctxt: an HTML parser context
2228 *
2229 * parse the content of an HTML SCRIPT or STYLE element
2230 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2231 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2232 * http://www.w3.org/TR/html4/types.html#type-script
2233 * http://www.w3.org/TR/html4/types.html#h-6.15
2234 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2235 *
2236 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2237 * element and the value of intrinsic event attributes. User agents must
2238 * not evaluate script data as HTML markup but instead must pass it on as
2239 * data to a script engine.
2240 * NOTES:
2241 * - The content is passed like CDATA
2242 * - the attributes for style and scripting "onXXX" are also described
2243 * as CDATA but SGML allows entities references in attributes so their
2244 * processing is identical as other attributes
2245 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002246static void
Owen Taylor3473f882001-02-23 17:55:21 +00002247htmlParseScript(htmlParserCtxtPtr ctxt) {
2248 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2249 int nbchar = 0;
2250 xmlChar cur;
2251
2252 SHRINK;
2253 cur = CUR;
2254 while (IS_CHAR(cur)) {
2255 if ((cur == '<') && (NXT(1) == '/')) {
2256 /*
2257 * One should break here, the specification is clear:
2258 * Authors should therefore escape "</" within the content.
2259 * Escape mechanisms are specific to each scripting or
2260 * style sheet language.
2261 */
2262 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2263 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2264 break; /* while */
2265 }
2266 buf[nbchar++] = cur;
2267 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2268 if (ctxt->sax->cdataBlock!= NULL) {
2269 /*
2270 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2271 */
2272 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2273 }
2274 nbchar = 0;
2275 }
2276 NEXT;
2277 cur = CUR;
2278 }
2279 if (!(IS_CHAR(cur))) {
2280 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2281 ctxt->sax->error(ctxt->userData,
2282 "Invalid char in CDATA 0x%X\n", cur);
2283 ctxt->wellFormed = 0;
2284 NEXT;
2285 }
2286
2287 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2288 if (ctxt->sax->cdataBlock!= NULL) {
2289 /*
2290 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2291 */
2292 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2293 }
2294 }
2295}
2296
2297
2298/**
2299 * htmlParseCharData:
2300 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002301 *
2302 * parse a CharData section.
2303 * if we are within a CDATA section ']]>' marks an end of section.
2304 *
2305 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2306 */
2307
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002308static void
2309htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002310 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2311 int nbchar = 0;
2312 int cur, l;
2313
2314 SHRINK;
2315 cur = CUR_CHAR(l);
2316 while (((cur != '<') || (ctxt->token == '<')) &&
2317 ((cur != '&') || (ctxt->token == '&')) &&
2318 (IS_CHAR(cur))) {
2319 COPY_BUF(l,buf,nbchar,cur);
2320 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2321 /*
2322 * Ok the segment is to be consumed as chars.
2323 */
2324 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2325 if (areBlanks(ctxt, buf, nbchar)) {
2326 if (ctxt->sax->ignorableWhitespace != NULL)
2327 ctxt->sax->ignorableWhitespace(ctxt->userData,
2328 buf, nbchar);
2329 } else {
2330 htmlCheckParagraph(ctxt);
2331 if (ctxt->sax->characters != NULL)
2332 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2333 }
2334 }
2335 nbchar = 0;
2336 }
2337 NEXTL(l);
2338 cur = CUR_CHAR(l);
2339 }
2340 if (nbchar != 0) {
2341 /*
2342 * Ok the segment is to be consumed as chars.
2343 */
2344 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2345 if (areBlanks(ctxt, buf, nbchar)) {
2346 if (ctxt->sax->ignorableWhitespace != NULL)
2347 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2348 } else {
2349 htmlCheckParagraph(ctxt);
2350 if (ctxt->sax->characters != NULL)
2351 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2352 }
2353 }
2354 }
2355}
2356
2357/**
2358 * htmlParseExternalID:
2359 * @ctxt: an HTML parser context
2360 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002361 *
2362 * Parse an External ID or a Public ID
2363 *
Owen Taylor3473f882001-02-23 17:55:21 +00002364 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2365 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2366 *
2367 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2368 *
2369 * Returns the function returns SystemLiteral and in the second
2370 * case publicID receives PubidLiteral, is strict is off
2371 * it is possible to return NULL and have publicID set.
2372 */
2373
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002374static xmlChar *
2375htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002376 xmlChar *URI = NULL;
2377
2378 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2379 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2380 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2381 SKIP(6);
2382 if (!IS_BLANK(CUR)) {
2383 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2384 ctxt->sax->error(ctxt->userData,
2385 "Space required after 'SYSTEM'\n");
2386 ctxt->wellFormed = 0;
2387 }
2388 SKIP_BLANKS;
2389 URI = htmlParseSystemLiteral(ctxt);
2390 if (URI == NULL) {
2391 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2392 ctxt->sax->error(ctxt->userData,
2393 "htmlParseExternalID: SYSTEM, no URI\n");
2394 ctxt->wellFormed = 0;
2395 }
2396 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2397 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2398 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2399 SKIP(6);
2400 if (!IS_BLANK(CUR)) {
2401 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2402 ctxt->sax->error(ctxt->userData,
2403 "Space required after 'PUBLIC'\n");
2404 ctxt->wellFormed = 0;
2405 }
2406 SKIP_BLANKS;
2407 *publicID = htmlParsePubidLiteral(ctxt);
2408 if (*publicID == NULL) {
2409 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2410 ctxt->sax->error(ctxt->userData,
2411 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2412 ctxt->wellFormed = 0;
2413 }
2414 SKIP_BLANKS;
2415 if ((CUR == '"') || (CUR == '\'')) {
2416 URI = htmlParseSystemLiteral(ctxt);
2417 }
2418 }
2419 return(URI);
2420}
2421
2422/**
2423 * htmlParseComment:
2424 * @ctxt: an HTML parser context
2425 *
2426 * Parse an XML (SGML) comment <!-- .... -->
2427 *
2428 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2429 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002430static void
Owen Taylor3473f882001-02-23 17:55:21 +00002431htmlParseComment(htmlParserCtxtPtr ctxt) {
2432 xmlChar *buf = NULL;
2433 int len;
2434 int size = HTML_PARSER_BUFFER_SIZE;
2435 int q, ql;
2436 int r, rl;
2437 int cur, l;
2438 xmlParserInputState state;
2439
2440 /*
2441 * Check that there is a comment right here.
2442 */
2443 if ((RAW != '<') || (NXT(1) != '!') ||
2444 (NXT(2) != '-') || (NXT(3) != '-')) return;
2445
2446 state = ctxt->instate;
2447 ctxt->instate = XML_PARSER_COMMENT;
2448 SHRINK;
2449 SKIP(4);
2450 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2451 if (buf == NULL) {
2452 xmlGenericError(xmlGenericErrorContext,
2453 "malloc of %d byte failed\n", size);
2454 ctxt->instate = state;
2455 return;
2456 }
2457 q = CUR_CHAR(ql);
2458 NEXTL(ql);
2459 r = CUR_CHAR(rl);
2460 NEXTL(rl);
2461 cur = CUR_CHAR(l);
2462 len = 0;
2463 while (IS_CHAR(cur) &&
2464 ((cur != '>') ||
2465 (r != '-') || (q != '-'))) {
2466 if (len + 5 >= size) {
2467 size *= 2;
2468 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2469 if (buf == NULL) {
2470 xmlGenericError(xmlGenericErrorContext,
2471 "realloc of %d byte failed\n", size);
2472 ctxt->instate = state;
2473 return;
2474 }
2475 }
2476 COPY_BUF(ql,buf,len,q);
2477 q = r;
2478 ql = rl;
2479 r = cur;
2480 rl = l;
2481 NEXTL(l);
2482 cur = CUR_CHAR(l);
2483 if (cur == 0) {
2484 SHRINK;
2485 GROW;
2486 cur = CUR_CHAR(l);
2487 }
2488 }
2489 buf[len] = 0;
2490 if (!IS_CHAR(cur)) {
2491 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2492 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2493 ctxt->sax->error(ctxt->userData,
2494 "Comment not terminated \n<!--%.50s\n", buf);
2495 ctxt->wellFormed = 0;
2496 xmlFree(buf);
2497 } else {
2498 NEXT;
2499 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2500 (!ctxt->disableSAX))
2501 ctxt->sax->comment(ctxt->userData, buf);
2502 xmlFree(buf);
2503 }
2504 ctxt->instate = state;
2505}
2506
2507/**
2508 * htmlParseCharRef:
2509 * @ctxt: an HTML parser context
2510 *
2511 * parse Reference declarations
2512 *
2513 * [66] CharRef ::= '&#' [0-9]+ ';' |
2514 * '&#x' [0-9a-fA-F]+ ';'
2515 *
2516 * Returns the value parsed (as an int)
2517 */
2518int
2519htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2520 int val = 0;
2521
2522 if ((CUR == '&') && (NXT(1) == '#') &&
2523 (NXT(2) == 'x')) {
2524 SKIP(3);
2525 while (CUR != ';') {
2526 if ((CUR >= '0') && (CUR <= '9'))
2527 val = val * 16 + (CUR - '0');
2528 else if ((CUR >= 'a') && (CUR <= 'f'))
2529 val = val * 16 + (CUR - 'a') + 10;
2530 else if ((CUR >= 'A') && (CUR <= 'F'))
2531 val = val * 16 + (CUR - 'A') + 10;
2532 else {
2533 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2534 ctxt->sax->error(ctxt->userData,
2535 "htmlParseCharRef: invalid hexadecimal value\n");
2536 ctxt->wellFormed = 0;
2537 return(0);
2538 }
2539 NEXT;
2540 }
2541 if (CUR == ';')
2542 NEXT;
2543 } else if ((CUR == '&') && (NXT(1) == '#')) {
2544 SKIP(2);
2545 while (CUR != ';') {
2546 if ((CUR >= '0') && (CUR <= '9'))
2547 val = val * 10 + (CUR - '0');
2548 else {
2549 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2550 ctxt->sax->error(ctxt->userData,
2551 "htmlParseCharRef: invalid decimal value\n");
2552 ctxt->wellFormed = 0;
2553 return(0);
2554 }
2555 NEXT;
2556 }
2557 if (CUR == ';')
2558 NEXT;
2559 } else {
2560 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2561 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2562 ctxt->wellFormed = 0;
2563 }
2564 /*
2565 * Check the value IS_CHAR ...
2566 */
2567 if (IS_CHAR(val)) {
2568 return(val);
2569 } else {
2570 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2571 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2572 val);
2573 ctxt->wellFormed = 0;
2574 }
2575 return(0);
2576}
2577
2578
2579/**
2580 * htmlParseDocTypeDecl :
2581 * @ctxt: an HTML parser context
2582 *
2583 * parse a DOCTYPE declaration
2584 *
2585 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2586 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2587 */
2588
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002589static void
Owen Taylor3473f882001-02-23 17:55:21 +00002590htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2591 xmlChar *name;
2592 xmlChar *ExternalID = NULL;
2593 xmlChar *URI = NULL;
2594
2595 /*
2596 * We know that '<!DOCTYPE' has been detected.
2597 */
2598 SKIP(9);
2599
2600 SKIP_BLANKS;
2601
2602 /*
2603 * Parse the DOCTYPE name.
2604 */
2605 name = htmlParseName(ctxt);
2606 if (name == NULL) {
2607 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2608 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2609 ctxt->wellFormed = 0;
2610 }
2611 /*
2612 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2613 */
2614
2615 SKIP_BLANKS;
2616
2617 /*
2618 * Check for SystemID and ExternalID
2619 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002620 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002621 SKIP_BLANKS;
2622
2623 /*
2624 * We should be at the end of the DOCTYPE declaration.
2625 */
2626 if (CUR != '>') {
2627 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2628 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2629 ctxt->wellFormed = 0;
2630 /* We shouldn't try to resynchronize ... */
2631 }
2632 NEXT;
2633
2634 /*
2635 * Create or update the document accordingly to the DOCTYPE
2636 */
2637 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2638 (!ctxt->disableSAX))
2639 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2640
2641 /*
2642 * Cleanup, since we don't use all those identifiers
2643 */
2644 if (URI != NULL) xmlFree(URI);
2645 if (ExternalID != NULL) xmlFree(ExternalID);
2646 if (name != NULL) xmlFree(name);
2647}
2648
2649/**
2650 * htmlParseAttribute:
2651 * @ctxt: an HTML parser context
2652 * @value: a xmlChar ** used to store the value of the attribute
2653 *
2654 * parse an attribute
2655 *
2656 * [41] Attribute ::= Name Eq AttValue
2657 *
2658 * [25] Eq ::= S? '=' S?
2659 *
2660 * With namespace:
2661 *
2662 * [NS 11] Attribute ::= QName Eq AttValue
2663 *
2664 * Also the case QName == xmlns:??? is handled independently as a namespace
2665 * definition.
2666 *
2667 * Returns the attribute name, and the value in *value.
2668 */
2669
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002670static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002671htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2672 xmlChar *name, *val = NULL;
2673
2674 *value = NULL;
2675 name = htmlParseHTMLName(ctxt);
2676 if (name == NULL) {
2677 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2678 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2679 ctxt->wellFormed = 0;
2680 return(NULL);
2681 }
2682
2683 /*
2684 * read the value
2685 */
2686 SKIP_BLANKS;
2687 if (CUR == '=') {
2688 NEXT;
2689 SKIP_BLANKS;
2690 val = htmlParseAttValue(ctxt);
2691 /******
2692 } else {
2693 * TODO : some attribute must have values, some may not
2694 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2695 ctxt->sax->warning(ctxt->userData,
2696 "No value for attribute %s\n", name); */
2697 }
2698
2699 *value = val;
2700 return(name);
2701}
2702
2703/**
2704 * htmlCheckEncoding:
2705 * @ctxt: an HTML parser context
2706 * @attvalue: the attribute value
2707 *
2708 * Checks an http-equiv attribute from a Meta tag to detect
2709 * the encoding
2710 * If a new encoding is detected the parser is switched to decode
2711 * it and pass UTF8
2712 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002713static void
Owen Taylor3473f882001-02-23 17:55:21 +00002714htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2715 const xmlChar *encoding;
2716
2717 if ((ctxt == NULL) || (attvalue == NULL))
2718 return;
2719
2720 /* do not change encoding */
2721 if (ctxt->input->encoding != NULL)
2722 return;
2723
2724 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2725 if (encoding != NULL) {
2726 encoding += 8;
2727 } else {
2728 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2729 if (encoding != NULL)
2730 encoding += 9;
2731 }
2732 if (encoding != NULL) {
2733 xmlCharEncoding enc;
2734 xmlCharEncodingHandlerPtr handler;
2735
2736 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2737
2738 if (ctxt->input->encoding != NULL)
2739 xmlFree((xmlChar *) ctxt->input->encoding);
2740 ctxt->input->encoding = xmlStrdup(encoding);
2741
2742 enc = xmlParseCharEncoding((const char *) encoding);
2743 /*
2744 * registered set of known encodings
2745 */
2746 if (enc != XML_CHAR_ENCODING_ERROR) {
2747 xmlSwitchEncoding(ctxt, enc);
2748 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2749 } else {
2750 /*
2751 * fallback for unknown encodings
2752 */
2753 handler = xmlFindCharEncodingHandler((const char *) encoding);
2754 if (handler != NULL) {
2755 xmlSwitchToEncoding(ctxt, handler);
2756 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2757 } else {
2758 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2759 }
2760 }
2761
2762 if ((ctxt->input->buf != NULL) &&
2763 (ctxt->input->buf->encoder != NULL) &&
2764 (ctxt->input->buf->raw != NULL) &&
2765 (ctxt->input->buf->buffer != NULL)) {
2766 int nbchars;
2767 int processed;
2768
2769 /*
2770 * convert as much as possible to the parser reading buffer.
2771 */
2772 processed = ctxt->input->cur - ctxt->input->base;
2773 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2774 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2775 ctxt->input->buf->buffer,
2776 ctxt->input->buf->raw);
2777 if (nbchars < 0) {
2778 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2779 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2780 ctxt->sax->error(ctxt->userData,
2781 "htmlCheckEncoding: encoder error\n");
2782 }
2783 ctxt->input->base =
2784 ctxt->input->cur = ctxt->input->buf->buffer->content;
2785 }
2786 }
2787}
2788
2789/**
2790 * htmlCheckMeta:
2791 * @ctxt: an HTML parser context
2792 * @atts: the attributes values
2793 *
2794 * Checks an attributes from a Meta tag
2795 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002796static void
Owen Taylor3473f882001-02-23 17:55:21 +00002797htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2798 int i;
2799 const xmlChar *att, *value;
2800 int http = 0;
2801 const xmlChar *content = NULL;
2802
2803 if ((ctxt == NULL) || (atts == NULL))
2804 return;
2805
2806 i = 0;
2807 att = atts[i++];
2808 while (att != NULL) {
2809 value = atts[i++];
2810 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2811 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2812 http = 1;
2813 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2814 content = value;
2815 att = atts[i++];
2816 }
2817 if ((http) && (content != NULL))
2818 htmlCheckEncoding(ctxt, content);
2819
2820}
2821
2822/**
2823 * htmlParseStartTag:
2824 * @ctxt: an HTML parser context
2825 *
2826 * parse a start of tag either for rule element or
2827 * EmptyElement. In both case we don't parse the tag closing chars.
2828 *
2829 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2830 *
2831 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2832 *
2833 * With namespace:
2834 *
2835 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2836 *
2837 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2838 *
2839 */
2840
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002841static void
Owen Taylor3473f882001-02-23 17:55:21 +00002842htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2843 xmlChar *name;
2844 xmlChar *attname;
2845 xmlChar *attvalue;
2846 const xmlChar **atts = NULL;
2847 int nbatts = 0;
2848 int maxatts = 0;
2849 int meta = 0;
2850 int i;
2851
2852 if (CUR != '<') return;
2853 NEXT;
2854
2855 GROW;
2856 name = htmlParseHTMLName(ctxt);
2857 if (name == NULL) {
2858 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2859 ctxt->sax->error(ctxt->userData,
2860 "htmlParseStartTag: invalid element name\n");
2861 ctxt->wellFormed = 0;
2862 /* Dump the bogus tag like browsers do */
2863 while ((IS_CHAR(CUR)) && (CUR != '>'))
2864 NEXT;
2865 return;
2866 }
2867 if (xmlStrEqual(name, BAD_CAST"meta"))
2868 meta = 1;
2869
2870 /*
2871 * Check for auto-closure of HTML elements.
2872 */
2873 htmlAutoClose(ctxt, name);
2874
2875 /*
2876 * Check for implied HTML elements.
2877 */
2878 htmlCheckImplied(ctxt, name);
2879
2880 /*
2881 * Avoid html at any level > 0, head at any level != 1
2882 * or any attempt to recurse body
2883 */
2884 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2885 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2886 ctxt->sax->error(ctxt->userData,
2887 "htmlParseStartTag: misplaced <html> tag\n");
2888 ctxt->wellFormed = 0;
2889 xmlFree(name);
2890 return;
2891 }
2892 if ((ctxt->nameNr != 1) &&
2893 (xmlStrEqual(name, BAD_CAST"head"))) {
2894 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2895 ctxt->sax->error(ctxt->userData,
2896 "htmlParseStartTag: misplaced <head> tag\n");
2897 ctxt->wellFormed = 0;
2898 xmlFree(name);
2899 return;
2900 }
2901 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002902 int indx;
2903 for (indx = 0;indx < ctxt->nameNr;indx++) {
2904 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002905 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2906 ctxt->sax->error(ctxt->userData,
2907 "htmlParseStartTag: misplaced <body> tag\n");
2908 ctxt->wellFormed = 0;
2909 xmlFree(name);
2910 return;
2911 }
2912 }
2913 }
2914
2915 /*
2916 * Now parse the attributes, it ends up with the ending
2917 *
2918 * (S Attribute)* S?
2919 */
2920 SKIP_BLANKS;
2921 while ((IS_CHAR(CUR)) &&
2922 (CUR != '>') &&
2923 ((CUR != '/') || (NXT(1) != '>'))) {
2924 long cons = ctxt->nbChars;
2925
2926 GROW;
2927 attname = htmlParseAttribute(ctxt, &attvalue);
2928 if (attname != NULL) {
2929
2930 /*
2931 * Well formedness requires at most one declaration of an attribute
2932 */
2933 for (i = 0; i < nbatts;i += 2) {
2934 if (xmlStrEqual(atts[i], attname)) {
2935 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2936 ctxt->sax->error(ctxt->userData,
2937 "Attribute %s redefined\n",
2938 attname);
2939 ctxt->wellFormed = 0;
2940 xmlFree(attname);
2941 if (attvalue != NULL)
2942 xmlFree(attvalue);
2943 goto failed;
2944 }
2945 }
2946
2947 /*
2948 * Add the pair to atts
2949 */
2950 if (atts == NULL) {
2951 maxatts = 10;
2952 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2953 if (atts == NULL) {
2954 xmlGenericError(xmlGenericErrorContext,
2955 "malloc of %ld byte failed\n",
2956 maxatts * (long)sizeof(xmlChar *));
2957 if (name != NULL) xmlFree(name);
2958 return;
2959 }
2960 } else if (nbatts + 4 > maxatts) {
2961 maxatts *= 2;
2962 atts = (const xmlChar **) xmlRealloc((void *) atts,
2963 maxatts * sizeof(xmlChar *));
2964 if (atts == NULL) {
2965 xmlGenericError(xmlGenericErrorContext,
2966 "realloc of %ld byte failed\n",
2967 maxatts * (long)sizeof(xmlChar *));
2968 if (name != NULL) xmlFree(name);
2969 return;
2970 }
2971 }
2972 atts[nbatts++] = attname;
2973 atts[nbatts++] = attvalue;
2974 atts[nbatts] = NULL;
2975 atts[nbatts + 1] = NULL;
2976 }
2977 else {
2978 /* Dump the bogus attribute string up to the next blank or
2979 * the end of the tag. */
2980 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
2981 && ((CUR != '/') || (NXT(1) != '>')))
2982 NEXT;
2983 }
2984
2985failed:
2986 SKIP_BLANKS;
2987 if (cons == ctxt->nbChars) {
2988 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2989 ctxt->sax->error(ctxt->userData,
2990 "htmlParseStartTag: problem parsing attributes\n");
2991 ctxt->wellFormed = 0;
2992 break;
2993 }
2994 }
2995
2996 /*
2997 * Handle specific association to the META tag
2998 */
2999 if (meta)
3000 htmlCheckMeta(ctxt, atts);
3001
3002 /*
3003 * SAX: Start of Element !
3004 */
3005 htmlnamePush(ctxt, xmlStrdup(name));
3006#ifdef DEBUG
3007 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3008#endif
3009 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3010 ctxt->sax->startElement(ctxt->userData, name, atts);
3011
3012 if (atts != NULL) {
3013 for (i = 0;i < nbatts;i++) {
3014 if (atts[i] != NULL)
3015 xmlFree((xmlChar *) atts[i]);
3016 }
3017 xmlFree((void *) atts);
3018 }
3019 if (name != NULL) xmlFree(name);
3020}
3021
3022/**
3023 * htmlParseEndTag:
3024 * @ctxt: an HTML parser context
3025 *
3026 * parse an end of tag
3027 *
3028 * [42] ETag ::= '</' Name S? '>'
3029 *
3030 * With namespace
3031 *
3032 * [NS 9] ETag ::= '</' QName S? '>'
3033 */
3034
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003035static void
Owen Taylor3473f882001-02-23 17:55:21 +00003036htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3037 xmlChar *name;
3038 xmlChar *oldname;
3039 int i;
3040
3041 if ((CUR != '<') || (NXT(1) != '/')) {
3042 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3043 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3044 ctxt->wellFormed = 0;
3045 return;
3046 }
3047 SKIP(2);
3048
3049 name = htmlParseHTMLName(ctxt);
3050 if (name == NULL) return;
3051
3052 /*
3053 * We should definitely be at the ending "S? '>'" part
3054 */
3055 SKIP_BLANKS;
3056 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3057 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3058 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3059 ctxt->wellFormed = 0;
3060 } else
3061 NEXT;
3062
3063 /*
3064 * If the name read is not one of the element in the parsing stack
3065 * then return, it's just an error.
3066 */
3067 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3068 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3069 }
3070 if (i < 0) {
3071 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3072 ctxt->sax->error(ctxt->userData,
3073 "Unexpected end tag : %s\n", name);
3074 xmlFree(name);
3075 ctxt->wellFormed = 0;
3076 return;
3077 }
3078
3079
3080 /*
3081 * Check for auto-closure of HTML elements.
3082 */
3083
3084 htmlAutoCloseOnClose(ctxt, name);
3085
3086 /*
3087 * Well formedness constraints, opening and closing must match.
3088 * With the exception that the autoclose may have popped stuff out
3089 * of the stack.
3090 */
3091 if (!xmlStrEqual(name, ctxt->name)) {
3092#ifdef DEBUG
3093 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3094#endif
3095 if ((ctxt->name != NULL) &&
3096 (!xmlStrEqual(ctxt->name, name))) {
3097 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3098 ctxt->sax->error(ctxt->userData,
3099 "Opening and ending tag mismatch: %s and %s\n",
3100 name, ctxt->name);
3101 ctxt->wellFormed = 0;
3102 }
3103 }
3104
3105 /*
3106 * SAX: End of Tag
3107 */
3108 oldname = ctxt->name;
3109 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3110 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3111 ctxt->sax->endElement(ctxt->userData, name);
3112 oldname = htmlnamePop(ctxt);
3113 if (oldname != NULL) {
3114#ifdef DEBUG
3115 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3116#endif
3117 xmlFree(oldname);
3118#ifdef DEBUG
3119 } else {
3120 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3121#endif
3122 }
3123 }
3124
3125 if (name != NULL)
3126 xmlFree(name);
3127
3128 return;
3129}
3130
3131
3132/**
3133 * htmlParseReference:
3134 * @ctxt: an HTML parser context
3135 *
3136 * parse and handle entity references in content,
3137 * this will end-up in a call to character() since this is either a
3138 * CharRef, or a predefined entity.
3139 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003140static void
Owen Taylor3473f882001-02-23 17:55:21 +00003141htmlParseReference(htmlParserCtxtPtr ctxt) {
3142 htmlEntityDescPtr ent;
3143 xmlChar out[6];
3144 xmlChar *name;
3145 if (CUR != '&') return;
3146
3147 if (NXT(1) == '#') {
3148 unsigned int c;
3149 int bits, i = 0;
3150
3151 c = htmlParseCharRef(ctxt);
3152 if (c == 0)
3153 return;
3154
3155 if (c < 0x80) { out[i++]= c; bits= -6; }
3156 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3157 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3158 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3159
3160 for ( ; bits >= 0; bits-= 6) {
3161 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3162 }
3163 out[i] = 0;
3164
3165 htmlCheckParagraph(ctxt);
3166 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3167 ctxt->sax->characters(ctxt->userData, out, i);
3168 } else {
3169 ent = htmlParseEntityRef(ctxt, &name);
3170 if (name == NULL) {
3171 htmlCheckParagraph(ctxt);
3172 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3173 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3174 return;
3175 }
3176 if ((ent == NULL) || (ent->value <= 0)) {
3177 htmlCheckParagraph(ctxt);
3178 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3179 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3180 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3181 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3182 }
3183 } else {
3184 unsigned int c;
3185 int bits, i = 0;
3186
3187 c = ent->value;
3188 if (c < 0x80)
3189 { out[i++]= c; bits= -6; }
3190 else if (c < 0x800)
3191 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3192 else if (c < 0x10000)
3193 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3194 else
3195 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3196
3197 for ( ; bits >= 0; bits-= 6) {
3198 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3199 }
3200 out[i] = 0;
3201
3202 htmlCheckParagraph(ctxt);
3203 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3204 ctxt->sax->characters(ctxt->userData, out, i);
3205 }
3206 xmlFree(name);
3207 }
3208}
3209
3210/**
3211 * htmlParseContent:
3212 * @ctxt: an HTML parser context
3213 * @name: the node name
3214 *
3215 * Parse a content: comment, sub-element, reference or text.
3216 *
3217 */
3218
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003219static void
Owen Taylor3473f882001-02-23 17:55:21 +00003220htmlParseContent(htmlParserCtxtPtr ctxt) {
3221 xmlChar *currentNode;
3222 int depth;
3223
3224 currentNode = xmlStrdup(ctxt->name);
3225 depth = ctxt->nameNr;
3226 while (1) {
3227 long cons = ctxt->nbChars;
3228
3229 GROW;
3230 /*
3231 * Our tag or one of it's parent or children is ending.
3232 */
3233 if ((CUR == '<') && (NXT(1) == '/')) {
3234 htmlParseEndTag(ctxt);
3235 if (currentNode != NULL) xmlFree(currentNode);
3236 return;
3237 }
3238
3239 /*
3240 * Has this node been popped out during parsing of
3241 * the next element
3242 */
3243 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
3244 (depth >= ctxt->nameNr)) {
3245 if (currentNode != NULL) xmlFree(currentNode);
3246 return;
3247 }
3248
Daniel Veillardf9533d12001-03-03 10:04:57 +00003249 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3250 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003251 /*
3252 * Handle SCRIPT/STYLE separately
3253 */
3254 htmlParseScript(ctxt);
3255 } else {
3256 /*
3257 * Sometimes DOCTYPE arrives in the middle of the document
3258 */
3259 if ((CUR == '<') && (NXT(1) == '!') &&
3260 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3261 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3262 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3263 (UPP(8) == 'E')) {
3264 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3265 ctxt->sax->error(ctxt->userData,
3266 "Misplaced DOCTYPE declaration\n");
3267 ctxt->wellFormed = 0;
3268 htmlParseDocTypeDecl(ctxt);
3269 }
3270
3271 /*
3272 * First case : a comment
3273 */
3274 if ((CUR == '<') && (NXT(1) == '!') &&
3275 (NXT(2) == '-') && (NXT(3) == '-')) {
3276 htmlParseComment(ctxt);
3277 }
3278
3279 /*
3280 * Second case : a sub-element.
3281 */
3282 else if (CUR == '<') {
3283 htmlParseElement(ctxt);
3284 }
3285
3286 /*
3287 * Third case : a reference. If if has not been resolved,
3288 * parsing returns it's Name, create the node
3289 */
3290 else if (CUR == '&') {
3291 htmlParseReference(ctxt);
3292 }
3293
3294 /*
3295 * Fourth : end of the resource
3296 */
3297 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003298 htmlAutoCloseOnEnd(ctxt);
3299 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003300 }
3301
3302 /*
3303 * Last case, text. Note that References are handled directly.
3304 */
3305 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003306 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003307 }
3308
3309 if (cons == ctxt->nbChars) {
3310 if (ctxt->node != NULL) {
3311 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3312 ctxt->sax->error(ctxt->userData,
3313 "detected an error in element content\n");
3314 ctxt->wellFormed = 0;
3315 }
3316 break;
3317 }
3318 }
3319 GROW;
3320 }
3321 if (currentNode != NULL) xmlFree(currentNode);
3322}
3323
3324/**
3325 * htmlParseElement:
3326 * @ctxt: an HTML parser context
3327 *
3328 * parse an HTML element, this is highly recursive
3329 *
3330 * [39] element ::= EmptyElemTag | STag content ETag
3331 *
3332 * [41] Attribute ::= Name Eq AttValue
3333 */
3334
3335void
3336htmlParseElement(htmlParserCtxtPtr ctxt) {
3337 xmlChar *name;
3338 xmlChar *currentNode = NULL;
3339 htmlElemDescPtr info;
3340 htmlParserNodeInfo node_info;
3341 xmlChar *oldname;
3342 int depth = ctxt->nameNr;
3343
3344 /* Capture start position */
3345 if (ctxt->record_info) {
3346 node_info.begin_pos = ctxt->input->consumed +
3347 (CUR_PTR - ctxt->input->base);
3348 node_info.begin_line = ctxt->input->line;
3349 }
3350
3351 oldname = xmlStrdup(ctxt->name);
3352 htmlParseStartTag(ctxt);
3353 name = ctxt->name;
3354#ifdef DEBUG
3355 if (oldname == NULL)
3356 xmlGenericError(xmlGenericErrorContext,
3357 "Start of element %s\n", name);
3358 else if (name == NULL)
3359 xmlGenericError(xmlGenericErrorContext,
3360 "Start of element failed, was %s\n", oldname);
3361 else
3362 xmlGenericError(xmlGenericErrorContext,
3363 "Start of element %s, was %s\n", name, oldname);
3364#endif
3365 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3366 (name == NULL)) {
3367 if (CUR == '>')
3368 NEXT;
3369 if (oldname != NULL)
3370 xmlFree(oldname);
3371 return;
3372 }
3373 if (oldname != NULL)
3374 xmlFree(oldname);
3375
3376 /*
3377 * Lookup the info for that element.
3378 */
3379 info = htmlTagLookup(name);
3380 if (info == NULL) {
3381 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3382 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3383 name);
3384 ctxt->wellFormed = 0;
3385 } else if (info->depr) {
3386/***************************
3387 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3388 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3389 name);
3390 ***************************/
3391 }
3392
3393 /*
3394 * Check for an Empty Element labelled the XML/SGML way
3395 */
3396 if ((CUR == '/') && (NXT(1) == '>')) {
3397 SKIP(2);
3398 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3399 ctxt->sax->endElement(ctxt->userData, name);
3400 oldname = htmlnamePop(ctxt);
3401#ifdef DEBUG
3402 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3403#endif
3404 if (oldname != NULL)
3405 xmlFree(oldname);
3406 return;
3407 }
3408
3409 if (CUR == '>') {
3410 NEXT;
3411 } else {
3412 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3413 ctxt->sax->error(ctxt->userData,
3414 "Couldn't find end of Start Tag %s\n",
3415 name);
3416 ctxt->wellFormed = 0;
3417
3418 /*
3419 * end of parsing of this node.
3420 */
3421 if (xmlStrEqual(name, ctxt->name)) {
3422 nodePop(ctxt);
3423 oldname = htmlnamePop(ctxt);
3424#ifdef DEBUG
3425 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3426#endif
3427 if (oldname != NULL)
3428 xmlFree(oldname);
3429 }
3430
3431 /*
3432 * Capture end position and add node
3433 */
3434 if ( currentNode != NULL && ctxt->record_info ) {
3435 node_info.end_pos = ctxt->input->consumed +
3436 (CUR_PTR - ctxt->input->base);
3437 node_info.end_line = ctxt->input->line;
3438 node_info.node = ctxt->node;
3439 xmlParserAddNodeInfo(ctxt, &node_info);
3440 }
3441 return;
3442 }
3443
3444 /*
3445 * Check for an Empty Element from DTD definition
3446 */
3447 if ((info != NULL) && (info->empty)) {
3448 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3449 ctxt->sax->endElement(ctxt->userData, name);
3450 oldname = htmlnamePop(ctxt);
3451#ifdef DEBUG
3452 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3453#endif
3454 if (oldname != NULL)
3455 xmlFree(oldname);
3456 return;
3457 }
3458
3459 /*
3460 * Parse the content of the element:
3461 */
3462 currentNode = xmlStrdup(ctxt->name);
3463 depth = ctxt->nameNr;
3464 while (IS_CHAR(CUR)) {
3465 htmlParseContent(ctxt);
3466 if (ctxt->nameNr < depth) break;
3467 }
3468
Owen Taylor3473f882001-02-23 17:55:21 +00003469 /*
3470 * Capture end position and add node
3471 */
3472 if ( currentNode != NULL && ctxt->record_info ) {
3473 node_info.end_pos = ctxt->input->consumed +
3474 (CUR_PTR - ctxt->input->base);
3475 node_info.end_line = ctxt->input->line;
3476 node_info.node = ctxt->node;
3477 xmlParserAddNodeInfo(ctxt, &node_info);
3478 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003479 if (!IS_CHAR(CUR)) {
3480 htmlAutoCloseOnEnd(ctxt);
3481 }
3482
Owen Taylor3473f882001-02-23 17:55:21 +00003483 if (currentNode != NULL)
3484 xmlFree(currentNode);
3485}
3486
3487/**
3488 * htmlParseDocument :
3489 * @ctxt: an HTML parser context
3490 *
3491 * parse an HTML document (and build a tree if using the standard SAX
3492 * interface).
3493 *
3494 * Returns 0, -1 in case of error. the parser context is augmented
3495 * as a result of the parsing.
3496 */
3497
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003498static int
Owen Taylor3473f882001-02-23 17:55:21 +00003499htmlParseDocument(htmlParserCtxtPtr ctxt) {
3500 xmlDtdPtr dtd;
3501
3502 htmlDefaultSAXHandlerInit();
3503 ctxt->html = 1;
3504
3505 GROW;
3506 /*
3507 * SAX: beginning of the document processing.
3508 */
3509 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3510 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3511
3512 /*
3513 * Wipe out everything which is before the first '<'
3514 */
3515 SKIP_BLANKS;
3516 if (CUR == 0) {
3517 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3518 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3519 ctxt->wellFormed = 0;
3520 }
3521
3522 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3523 ctxt->sax->startDocument(ctxt->userData);
3524
3525
3526 /*
3527 * Parse possible comments before any content
3528 */
3529 while ((CUR == '<') && (NXT(1) == '!') &&
3530 (NXT(2) == '-') && (NXT(3) == '-')) {
3531 htmlParseComment(ctxt);
3532 SKIP_BLANKS;
3533 }
3534
3535
3536 /*
3537 * Then possibly doc type declaration(s) and more Misc
3538 * (doctypedecl Misc*)?
3539 */
3540 if ((CUR == '<') && (NXT(1) == '!') &&
3541 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3542 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3543 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3544 (UPP(8) == 'E')) {
3545 htmlParseDocTypeDecl(ctxt);
3546 }
3547 SKIP_BLANKS;
3548
3549 /*
3550 * Parse possible comments before any content
3551 */
3552 while ((CUR == '<') && (NXT(1) == '!') &&
3553 (NXT(2) == '-') && (NXT(3) == '-')) {
3554 htmlParseComment(ctxt);
3555 SKIP_BLANKS;
3556 }
3557
3558 /*
3559 * Time to start parsing the tree itself
3560 */
3561 htmlParseContent(ctxt);
3562
3563 /*
3564 * autoclose
3565 */
3566 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003567 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003568
3569
3570 /*
3571 * SAX: end of the document processing.
3572 */
3573 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3574 ctxt->sax->endDocument(ctxt->userData);
3575
3576 if (ctxt->myDoc != NULL) {
3577 dtd = xmlGetIntSubset(ctxt->myDoc);
3578 if (dtd == NULL)
3579 ctxt->myDoc->intSubset =
3580 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3581 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3582 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3583 }
3584 if (! ctxt->wellFormed) return(-1);
3585 return(0);
3586}
3587
3588
3589/************************************************************************
3590 * *
3591 * Parser contexts handling *
3592 * *
3593 ************************************************************************/
3594
3595/**
3596 * xmlInitParserCtxt:
3597 * @ctxt: an HTML parser context
3598 *
3599 * Initialize a parser context
3600 */
3601
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003602static void
Owen Taylor3473f882001-02-23 17:55:21 +00003603htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3604{
3605 htmlSAXHandler *sax;
3606
3607 if (ctxt == NULL) return;
3608 memset(ctxt, 0, sizeof(htmlParserCtxt));
3609
3610 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3611 if (sax == NULL) {
3612 xmlGenericError(xmlGenericErrorContext,
3613 "htmlInitParserCtxt: out of memory\n");
3614 }
3615 else
3616 memset(sax, 0, sizeof(htmlSAXHandler));
3617
3618 /* Allocate the Input stack */
3619 ctxt->inputTab = (htmlParserInputPtr *)
3620 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3621 if (ctxt->inputTab == NULL) {
3622 xmlGenericError(xmlGenericErrorContext,
3623 "htmlInitParserCtxt: out of memory\n");
3624 ctxt->inputNr = 0;
3625 ctxt->inputMax = 0;
3626 ctxt->input = NULL;
3627 return;
3628 }
3629 ctxt->inputNr = 0;
3630 ctxt->inputMax = 5;
3631 ctxt->input = NULL;
3632 ctxt->version = NULL;
3633 ctxt->encoding = NULL;
3634 ctxt->standalone = -1;
3635 ctxt->instate = XML_PARSER_START;
3636
3637 /* Allocate the Node stack */
3638 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3639 if (ctxt->nodeTab == NULL) {
3640 xmlGenericError(xmlGenericErrorContext,
3641 "htmlInitParserCtxt: out of memory\n");
3642 ctxt->nodeNr = 0;
3643 ctxt->nodeMax = 0;
3644 ctxt->node = NULL;
3645 ctxt->inputNr = 0;
3646 ctxt->inputMax = 0;
3647 ctxt->input = NULL;
3648 return;
3649 }
3650 ctxt->nodeNr = 0;
3651 ctxt->nodeMax = 10;
3652 ctxt->node = NULL;
3653
3654 /* Allocate the Name stack */
3655 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3656 if (ctxt->nameTab == NULL) {
3657 xmlGenericError(xmlGenericErrorContext,
3658 "htmlInitParserCtxt: out of memory\n");
3659 ctxt->nameNr = 0;
3660 ctxt->nameMax = 10;
3661 ctxt->name = NULL;
3662 ctxt->nodeNr = 0;
3663 ctxt->nodeMax = 0;
3664 ctxt->node = NULL;
3665 ctxt->inputNr = 0;
3666 ctxt->inputMax = 0;
3667 ctxt->input = NULL;
3668 return;
3669 }
3670 ctxt->nameNr = 0;
3671 ctxt->nameMax = 10;
3672 ctxt->name = NULL;
3673
3674 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3675 else {
3676 ctxt->sax = sax;
3677 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3678 }
3679 ctxt->userData = ctxt;
3680 ctxt->myDoc = NULL;
3681 ctxt->wellFormed = 1;
3682 ctxt->replaceEntities = 0;
3683 ctxt->html = 1;
3684 ctxt->record_info = 0;
3685 ctxt->validate = 0;
3686 ctxt->nbChars = 0;
3687 ctxt->checkIndex = 0;
3688 xmlInitNodeInfoSeq(&ctxt->node_seq);
3689}
3690
3691/**
3692 * htmlFreeParserCtxt:
3693 * @ctxt: an HTML parser context
3694 *
3695 * Free all the memory used by a parser context. However the parsed
3696 * document in ctxt->myDoc is not freed.
3697 */
3698
3699void
3700htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3701{
3702 xmlFreeParserCtxt(ctxt);
3703}
3704
3705/**
3706 * htmlCreateDocParserCtxt :
3707 * @cur: a pointer to an array of xmlChar
3708 * @encoding: a free form C string describing the HTML document encoding, or NULL
3709 *
3710 * Create a parser context for an HTML document.
3711 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003712 * TODO: check the need to add encoding handling there
3713 *
Owen Taylor3473f882001-02-23 17:55:21 +00003714 * Returns the new parser context or NULL
3715 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003716static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003717htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003718 htmlParserCtxtPtr ctxt;
3719 htmlParserInputPtr input;
3720 /* htmlCharEncoding enc; */
3721
3722 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3723 if (ctxt == NULL) {
3724 perror("malloc");
3725 return(NULL);
3726 }
3727 htmlInitParserCtxt(ctxt);
3728 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3729 if (input == NULL) {
3730 perror("malloc");
3731 xmlFree(ctxt);
3732 return(NULL);
3733 }
3734 memset(input, 0, sizeof(htmlParserInput));
3735
3736 input->line = 1;
3737 input->col = 1;
3738 input->base = cur;
3739 input->cur = cur;
3740
3741 inputPush(ctxt, input);
3742 return(ctxt);
3743}
3744
3745/************************************************************************
3746 * *
3747 * Progressive parsing interfaces *
3748 * *
3749 ************************************************************************/
3750
3751/**
3752 * htmlParseLookupSequence:
3753 * @ctxt: an HTML parser context
3754 * @first: the first char to lookup
3755 * @next: the next char to lookup or zero
3756 * @third: the next char to lookup or zero
3757 *
3758 * Try to find if a sequence (first, next, third) or just (first next) or
3759 * (first) is available in the input stream.
3760 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3761 * to avoid rescanning sequences of bytes, it DOES change the state of the
3762 * parser, do not use liberally.
3763 * This is basically similar to xmlParseLookupSequence()
3764 *
3765 * Returns the index to the current parsing point if the full sequence
3766 * is available, -1 otherwise.
3767 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003768static int
Owen Taylor3473f882001-02-23 17:55:21 +00003769htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3770 xmlChar next, xmlChar third) {
3771 int base, len;
3772 htmlParserInputPtr in;
3773 const xmlChar *buf;
3774
3775 in = ctxt->input;
3776 if (in == NULL) return(-1);
3777 base = in->cur - in->base;
3778 if (base < 0) return(-1);
3779 if (ctxt->checkIndex > base)
3780 base = ctxt->checkIndex;
3781 if (in->buf == NULL) {
3782 buf = in->base;
3783 len = in->length;
3784 } else {
3785 buf = in->buf->buffer->content;
3786 len = in->buf->buffer->use;
3787 }
3788 /* take into account the sequence length */
3789 if (third) len -= 2;
3790 else if (next) len --;
3791 for (;base < len;base++) {
3792 if (buf[base] == first) {
3793 if (third != 0) {
3794 if ((buf[base + 1] != next) ||
3795 (buf[base + 2] != third)) continue;
3796 } else if (next != 0) {
3797 if (buf[base + 1] != next) continue;
3798 }
3799 ctxt->checkIndex = 0;
3800#ifdef DEBUG_PUSH
3801 if (next == 0)
3802 xmlGenericError(xmlGenericErrorContext,
3803 "HPP: lookup '%c' found at %d\n",
3804 first, base);
3805 else if (third == 0)
3806 xmlGenericError(xmlGenericErrorContext,
3807 "HPP: lookup '%c%c' found at %d\n",
3808 first, next, base);
3809 else
3810 xmlGenericError(xmlGenericErrorContext,
3811 "HPP: lookup '%c%c%c' found at %d\n",
3812 first, next, third, base);
3813#endif
3814 return(base - (in->cur - in->base));
3815 }
3816 }
3817 ctxt->checkIndex = base;
3818#ifdef DEBUG_PUSH
3819 if (next == 0)
3820 xmlGenericError(xmlGenericErrorContext,
3821 "HPP: lookup '%c' failed\n", first);
3822 else if (third == 0)
3823 xmlGenericError(xmlGenericErrorContext,
3824 "HPP: lookup '%c%c' failed\n", first, next);
3825 else
3826 xmlGenericError(xmlGenericErrorContext,
3827 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3828#endif
3829 return(-1);
3830}
3831
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing.
 *
 * This is the push-mode driver: it loops over the data currently
 * available in ctxt->input and dispatches on ctxt->instate. Each state
 * either consumes some input and/or switches to another state (then
 * `break`s to loop again), or jumps to `done` when more data is needed.
 * Unless @terminate is set, most constructs are only parsed once
 * htmlParseLookupSequence() has found their closing delimiter in the
 * buffered data.
 *
 * Returns zero if no parsing was possible
 *
 * NOTE(review): `ret` is initialized to 0 and never reassigned below, so
 * the function currently always returns 0.
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    /* Trace the state the parser is in when this attempt starts. */
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    /*
     * Main loop: a `break` inside the switch re-enters the loop with the
     * (possibly new) state, `goto done` leaves it.
     */
    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /*
         * Number of bytes not yet consumed: taken from in->length when
         * the input has no I/O buffer, from the buffer fill level otherwise.
         */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        /* Everything consumed and no more data will come: wrap up. */
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    SKIP_BLANKS;
                    /* blanks were consumed, refresh the available count */
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                /* Announce the start of the document to the SAX layer. */
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                /*
                 * NOTE(review): avail is not re-checked here, so in->cur[1]
                 * and the UPP(2..8) lookups may look past the available
                 * data; presumably this relies on the buffer being
                 * zero-terminated - TODO confirm.
                 */
                cur = in->cur[0];
                next = in->cur[1];
                /* "<!DOCTYPE" (case handling is up to the UPP macro) */
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    /* wait for the closing '>' unless this is the last chunk */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
                }
#ifdef DEBUG_PUSH
                /*
                 * NOTE(review): this trace is emitted on both branches,
                 * including the one that just entered PROLOG.
                 */
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering MISC\n");
#endif
                break;
            case XML_PARSER_MISC:
                /* Before any DOCTYPE: comments, a DOCTYPE, or the first tag. */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    /* comment: needs the closing "-->" unless terminating */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    /* DOCTYPE: needs the closing '>' unless terminating */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /*
                     * "<!" seen but fewer than the 9 bytes of "<!DOCTYPE"
                     * are available: wait for more data before deciding.
                     */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /* After the DOCTYPE: only comments, then the first tag. */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    /* "<!" with fewer than the 4 bytes of "<!--": wait */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /*
                 * Entered from END_TAG once the element name stack is
                 * empty: only blanks and comments are accepted here.
                 */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    /* hand the blank run to the char data parser and stop */
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    /*
                     * Anything else here is flagged as an error: record
                     * it, end the document and stop parsing.
                     */
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                xmlChar *name, *oldname;
                /* name stack depth before the tag, to detect a failed push */
                int depth = ctxt->nameNr;
                htmlElemDescPtr info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                if (cur != '<') {
                    /* not a tag after all: treat it as content */
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                /* the whole start tag must be buffered unless terminating */
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;

                /* keep a copy of the current element name for comparison */
                oldname = xmlStrdup(ctxt->name);
                htmlParseStartTag(ctxt);
                name = ctxt->name;
#ifdef DEBUG
                if (oldname == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s\n", name);
                else if (name == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element failed, was %s\n",
                            oldname);
                else
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s, was %s\n",
                            name, oldname);
#endif
                /*
                 * Same depth and same current name as before (or no name
                 * at all): no new element was opened. Skip a pending '>'
                 * and stay in START_TAG for the next iteration.
                 */
                if (((depth == ctxt->nameNr) &&
                     (xmlStrEqual(oldname, ctxt->name))) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    if (oldname != NULL)
                        xmlFree(oldname);
                    break;
                }
                if (oldname != NULL)
                    xmlFree(oldname);

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
                                         name);
                    ctxt->wellFormed = 0;
                } else if (info->depr) {
                    /***************************
                    if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
                        ctxt->sax->warning(ctxt->userData,
                                           "Tag %s is deprecated\n",
                                           name);
                     ***************************/
                }

                /*
                 * Check for an Empty Element labelled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
                            oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    /* the start tag is not properly closed */
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData,
                                         "Couldn't find end of Start Tag %s\n",
                                         name);
                    ctxt->wellFormed = 0;

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                        xmlGenericError(xmlGenericErrorContext,
                         "End of start tag problem: popping out %s\n", oldname);
#endif
                        if (oldname != NULL)
                            xmlFree(oldname);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                /* snapshot of ctxt->nbChars, compared again after parsing */
                long cons;
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /*
                 * Last chunk with a single byte left: deliver it directly
                 * unless it starts markup ('<') or a reference ('&').
                 */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        NEXT;
                    }
                    break;
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
                        goto done;
                    htmlParseScript(ctxt);
                    /*
                     * NOTE(review): cur/next were read before
                     * htmlParseScript() ran, so this tests the two bytes
                     * that were current on entry, not the ones that
                     * follow the script content.
                     */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                            goto done;
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "Misplaced DOCTYPE declaration\n");
                        ctxt->wellFormed = 0;
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        /* "<!" but too short to be "<!--": wait for more */
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        /* reference: needs its ';' unless terminating */
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /* TODO Avoid the extra copy, handle directly !!!!!! */
                        /*
                         * Goal of the following test is :
                         *  - minimize calls to the SAX 'character' callback
                         *    when they are mergeable
                         */
                        if ((ctxt->inputNr == 1) &&
                            (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
                            if ((!terminate) &&
                                (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
                                goto done;
                        }
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt);
                    }
                }
                /*
                 * ctxt->nbChars unchanged: nothing was consumed (assuming
                 * the parsing helpers advance nbChars - TODO confirm).
                 * Report it when inside a node and step over one char,
                 * presumably so that the loop always makes progress.
                 */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "detected an error in element content\n");
                        ctxt->wellFormed = 0;
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* no open element left: only epilog material may follow */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                /* NOTE(review): also printed when EPILOG was just entered */
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The remaining states are reported as internal errors when
             * reached here; the parser then falls back to CONTENT (or
             * START_TAG for ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == CDATA\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == DTD\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == COMMENT\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == PI\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_DECL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_VALUE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                /* NOTE(review): says DTD but the state set above is CONTENT */
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n");
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == XML_PARSER_IGNORE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
        }
    }
done:
    /* Same end-of-input handling as at the top of the loop. */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /*
     * When a document tree exists and parsing is finishing (terminate,
     * EOF or EPILOG), make sure it has an internal subset: if none was
     * found, attach the HTML 4.0 Transitional one.
     */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}
4525
4526/**
Owen Taylor3473f882001-02-23 17:55:21 +00004527 * htmlParseChunk:
4528 * @ctxt: an XML parser context
4529 * @chunk: an char array
4530 * @size: the size in byte of the chunk
4531 * @terminate: last chunk indicator
4532 *
4533 * Parse a Chunk of memory
4534 *
4535 * Returns zero if no error, the xmlParserErrors otherwise.
4536 */
4537int
4538htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4539 int terminate) {
4540 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4541 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4542 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4543 int cur = ctxt->input->cur - ctxt->input->base;
4544
4545 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4546 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4547 ctxt->input->cur = ctxt->input->base + cur;
4548#ifdef DEBUG_PUSH
4549 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4550#endif
4551
4552 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4553 htmlParseTryOrFinish(ctxt, terminate);
4554 } else if (ctxt->instate != XML_PARSER_EOF) {
4555 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4556 htmlParseTryOrFinish(ctxt, terminate);
4557 }
4558 if (terminate) {
4559 if ((ctxt->instate != XML_PARSER_EOF) &&
4560 (ctxt->instate != XML_PARSER_EPILOG) &&
4561 (ctxt->instate != XML_PARSER_MISC)) {
4562 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004563 ctxt->wellFormed = 0;
4564 }
4565 if (ctxt->instate != XML_PARSER_EOF) {
4566 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4567 ctxt->sax->endDocument(ctxt->userData);
4568 }
4569 ctxt->instate = XML_PARSER_EOF;
4570 }
4571 return((xmlParserErrors) ctxt->errNo);
4572}
4573
4574/************************************************************************
4575 * *
4576 * User entry points *
4577 * *
4578 ************************************************************************/
4579
4580/**
4581 * htmlCreatePushParserCtxt :
4582 * @sax: a SAX handler
4583 * @user_data: The user data returned on SAX callbacks
4584 * @chunk: a pointer to an array of chars
4585 * @size: number of chars in the array
4586 * @filename: an optional file name or URI
4587 * @enc: an optional encoding
4588 *
4589 * Create a parser context for using the HTML parser in push mode
4590 * To allow content encoding detection, @size should be >= 4
4591 * The value of @filename is used for fetching external entities
4592 * and error/warning reports.
4593 *
4594 * Returns the new parser context or NULL
4595 */
4596htmlParserCtxtPtr
4597htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4598 const char *chunk, int size, const char *filename,
4599 xmlCharEncoding enc) {
4600 htmlParserCtxtPtr ctxt;
4601 htmlParserInputPtr inputStream;
4602 xmlParserInputBufferPtr buf;
4603
4604 buf = xmlAllocParserInputBuffer(enc);
4605 if (buf == NULL) return(NULL);
4606
4607 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4608 if (ctxt == NULL) {
4609 xmlFree(buf);
4610 return(NULL);
4611 }
4612 memset(ctxt, 0, sizeof(htmlParserCtxt));
4613 htmlInitParserCtxt(ctxt);
4614 if (sax != NULL) {
4615 if (ctxt->sax != &htmlDefaultSAXHandler)
4616 xmlFree(ctxt->sax);
4617 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4618 if (ctxt->sax == NULL) {
4619 xmlFree(buf);
4620 xmlFree(ctxt);
4621 return(NULL);
4622 }
4623 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4624 if (user_data != NULL)
4625 ctxt->userData = user_data;
4626 }
4627 if (filename == NULL) {
4628 ctxt->directory = NULL;
4629 } else {
4630 ctxt->directory = xmlParserGetDirectory(filename);
4631 }
4632
4633 inputStream = htmlNewInputStream(ctxt);
4634 if (inputStream == NULL) {
4635 xmlFreeParserCtxt(ctxt);
4636 return(NULL);
4637 }
4638
4639 if (filename == NULL)
4640 inputStream->filename = NULL;
4641 else
4642 inputStream->filename = xmlMemStrdup(filename);
4643 inputStream->buf = buf;
4644 inputStream->base = inputStream->buf->buffer->content;
4645 inputStream->cur = inputStream->buf->buffer->content;
4646
4647 inputPush(ctxt, inputStream);
4648
4649 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4650 (ctxt->input->buf != NULL)) {
4651 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4652#ifdef DEBUG_PUSH
4653 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4654#endif
4655 }
4656
4657 return(ctxt);
4658}
4659
4660/**
4661 * htmlSAXParseDoc :
4662 * @cur: a pointer to an array of xmlChar
4663 * @encoding: a free form C string describing the HTML document encoding, or NULL
4664 * @sax: the SAX handler block
4665 * @userData: if using SAX, this pointer will be provided on callbacks.
4666 *
4667 * parse an HTML in-memory document and build a tree.
4668 * It use the given SAX function block to handle the parsing callback.
4669 * If sax is NULL, fallback to the default DOM tree building routines.
4670 *
4671 * Returns the resulting document tree
4672 */
4673
4674htmlDocPtr
4675htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4676 htmlDocPtr ret;
4677 htmlParserCtxtPtr ctxt;
4678
4679 if (cur == NULL) return(NULL);
4680
4681
4682 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4683 if (ctxt == NULL) return(NULL);
4684 if (sax != NULL) {
4685 ctxt->sax = sax;
4686 ctxt->userData = userData;
4687 }
4688
4689 htmlParseDocument(ctxt);
4690 ret = ctxt->myDoc;
4691 if (sax != NULL) {
4692 ctxt->sax = NULL;
4693 ctxt->userData = NULL;
4694 }
4695 htmlFreeParserCtxt(ctxt);
4696
4697 return(ret);
4698}
4699
4700/**
4701 * htmlParseDoc :
4702 * @cur: a pointer to an array of xmlChar
4703 * @encoding: a free form C string describing the HTML document encoding, or NULL
4704 *
4705 * parse an HTML in-memory document and build a tree.
4706 *
4707 * Returns the resulting document tree
4708 */
4709
4710htmlDocPtr
4711htmlParseDoc(xmlChar *cur, const char *encoding) {
4712 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4713}
4714
4715
4716/**
4717 * htmlCreateFileParserCtxt :
4718 * @filename: the filename
4719 * @encoding: a free form C string describing the HTML document encoding, or NULL
4720 *
4721 * Create a parser context for a file content.
4722 * Automatic support for ZLIB/Compress compressed document is provided
4723 * by default if found at compile-time.
4724 *
4725 * Returns the new parser context or NULL
4726 */
4727htmlParserCtxtPtr
4728htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4729{
4730 htmlParserCtxtPtr ctxt;
4731 htmlParserInputPtr inputStream;
4732 xmlParserInputBufferPtr buf;
4733 /* htmlCharEncoding enc; */
4734 xmlChar *content, *content_line = (xmlChar *) "charset=";
4735
4736 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4737 if (buf == NULL) return(NULL);
4738
4739 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4740 if (ctxt == NULL) {
4741 perror("malloc");
4742 return(NULL);
4743 }
4744 memset(ctxt, 0, sizeof(htmlParserCtxt));
4745 htmlInitParserCtxt(ctxt);
4746 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4747 if (inputStream == NULL) {
4748 perror("malloc");
4749 xmlFree(ctxt);
4750 return(NULL);
4751 }
4752 memset(inputStream, 0, sizeof(htmlParserInput));
4753
4754 inputStream->filename = xmlMemStrdup(filename);
4755 inputStream->line = 1;
4756 inputStream->col = 1;
4757 inputStream->buf = buf;
4758 inputStream->directory = NULL;
4759
4760 inputStream->base = inputStream->buf->buffer->content;
4761 inputStream->cur = inputStream->buf->buffer->content;
4762 inputStream->free = NULL;
4763
4764 inputPush(ctxt, inputStream);
4765
4766 /* set encoding */
4767 if (encoding) {
4768 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4769 if (content) {
4770 strcpy ((char *)content, (char *)content_line);
4771 strcat ((char *)content, (char *)encoding);
4772 htmlCheckEncoding (ctxt, content);
4773 xmlFree (content);
4774 }
4775 }
4776
4777 return(ctxt);
4778}
4779
4780/**
4781 * htmlSAXParseFile :
4782 * @filename: the filename
4783 * @encoding: a free form C string describing the HTML document encoding, or NULL
4784 * @sax: the SAX handler block
4785 * @userData: if using SAX, this pointer will be provided on callbacks.
4786 *
4787 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4788 * compressed document is provided by default if found at compile-time.
4789 * It use the given SAX function block to handle the parsing callback.
4790 * If sax is NULL, fallback to the default DOM tree building routines.
4791 *
4792 * Returns the resulting document tree
4793 */
4794
4795htmlDocPtr
4796htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4797 void *userData) {
4798 htmlDocPtr ret;
4799 htmlParserCtxtPtr ctxt;
4800 htmlSAXHandlerPtr oldsax = NULL;
4801
4802 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4803 if (ctxt == NULL) return(NULL);
4804 if (sax != NULL) {
4805 oldsax = ctxt->sax;
4806 ctxt->sax = sax;
4807 ctxt->userData = userData;
4808 }
4809
4810 htmlParseDocument(ctxt);
4811
4812 ret = ctxt->myDoc;
4813 if (sax != NULL) {
4814 ctxt->sax = oldsax;
4815 ctxt->userData = NULL;
4816 }
4817 htmlFreeParserCtxt(ctxt);
4818
4819 return(ret);
4820}
4821
4822/**
4823 * htmlParseFile :
4824 * @filename: the filename
4825 * @encoding: a free form C string describing the HTML document encoding, or NULL
4826 *
4827 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4828 * compressed document is provided by default if found at compile-time.
4829 *
4830 * Returns the resulting document tree
4831 */
4832
4833htmlDocPtr
4834htmlParseFile(const char *filename, const char *encoding) {
4835 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4836}
4837
4838/**
4839 * htmlHandleOmittedElem:
4840 * @val: int 0 or 1
4841 *
4842 * Set and return the previous value for handling HTML omitted tags.
4843 *
4844 * Returns the last value for 0 for no handling, 1 for auto insertion.
4845 */
4846
4847int
4848htmlHandleOmittedElem(int val) {
4849 int old = htmlOmittedDefaultValue;
4850
4851 htmlOmittedDefaultValue = val;
4852 return(old);
4853}
4854
4855#endif /* LIBXML_HTML_ENABLED */