blob: 1e147789583404831d1e0671f766a317608577ac [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
43
/*
 * Size constants for the parser. Their uses are not visible in this part
 * of the file; presumably name-length and working-buffer limits - confirm
 * against the parsing routines further down.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to compile in the xmlGenericError() traces guarded by DEBUG. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * Controls the generation of implied elements. When zero,
 * htmlCheckImplied() does nothing and htmlCheckParagraph() only implies
 * a p element when no element is currently open.
 */
int htmlOmittedDefaultValue = 1;

/* Forward declaration; the definition is not in this part of the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
 xmlChar end, xmlChar end2, xmlChar end3);
55
56/************************************************************************
57 * *
Owen Taylor3473f882001-02-23 17:55:21 +000058 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
66#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
68 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
70 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
71 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
73 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
75 return(0); \
76 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
82scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
83 type ret; \
84 if (ctxt->name##Nr < 0) return(0); \
85 ctxt->name##Nr--; \
86 if (ctxt->name##Nr < 0) return(0); \
87 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillard56a4cb82001-03-24 17:00:36 +000096/* PUSH_AND_POP(static, xmlNodePtr, node) */
97PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000098
99/*
100 * Macros for accessing the content. Those should be used only by the parser,
101 * and not exported.
102 *
103 * Dirty macros, i.e. one need to make assumption on the context to use them
104 *
105 * CUR_PTR return the current pointer to the xmlChar to be parsed.
106 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
107 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
108 * in UNICODE mode. This should be used internally by the parser
109 * only to compare to ASCII values otherwise it would break when
110 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
113 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
114 * it should be used only to compare on ASCII based substring.
115 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
116 * strings within the parser.
117 *
118 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
119 *
120 * CURRENT Returns the current char value, with the full decoding of
121 * UTF-8 if we are using this mode. It returns an int.
122 * NEXT Skip to the next character, this does the proper decoding
123 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
124 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
125 */
126
/*
 * All the macros below expect a parser context variable named ctxt to be
 * in scope at the point of use.
 */

/* Current input byte converted with toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input cursor by val bytes and add val to nbChars.
 * Expands to an unparenthesized comma expression.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

/* Byte located val positions after the input cursor. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted with toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The input cursor itself. */
#define CUR_PTR ctxt->input->cur

/* Calls xmlParserInputShrink() on the current input. */
#define SHRINK xmlParserInputShrink(ctxt->input)

/* Calls xmlParserInputGrow() on the current input, asking for INPUT_CHUNK. */
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int (no multi-byte decoding). */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blank characters, evaluates to the htmlSkipBlankChars() result. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
/* Current input byte as an int; the pending token is not consulted. */
#define CUR ((int) (*ctxt->input->cur))
/* Move to the next character with xmlNextChar() and count one in nbChars. */
#define NEXT xmlNextChar(ctxt),ctxt->nbChars++

/* -1 while ctxt->token is set, otherwise the current input byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* Identical redefinitions of NXT and CUR_PTR from above. */
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur


/*
 * Step over the current character, which is l bytes long: update the
 * line/column position, clear the pending token, advance the cursor by
 * l bytes and count a single character in nbChars.
 */
#define NEXTL(l) do { \
 if (*(ctxt->input->cur) == '\n') { \
 ctxt->input->line++; ctxt->input->col = 1; \
 } else ctxt->input->col++; \
 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
 } while (0)

/************
 \
 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Current character through htmlCurrentChar(); its byte length goes in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Same for the string s, through xmlStringCurrentChar(). */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append the character v, l bytes long, to buffer b at index i and advance
 * i: a single byte store when l is 1, xmlCopyChar() otherwise.
 * NOTE: expands to a bare if/else, so take care when using it as the body
 * of an unbraced if.
 */
#define COPY_BUF(l,b,i,v) \
 if (l == 1) b[i++] = (xmlChar) v; \
 else i += xmlCopyChar(l,&b[i],v)
175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
187 * Returns the current char value and its lenght
188 */
189
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000190static int
Owen Taylor3473f882001-02-23 17:55:21 +0000191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
269 * Assume it's a fixed lenght encoding (1) with
270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
306/**
Owen Taylor3473f882001-02-23 17:55:21 +0000307 * htmlSkipBlankChars:
308 * @ctxt: the HTML parser context
309 *
310 * skip all blanks character found at that point in the input streams.
311 *
312 * Returns the number of space chars skipped
313 */
314
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000315static int
Owen Taylor3473f882001-02-23 17:55:21 +0000316htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
317 int res = 0;
318
319 while (IS_BLANK(*(ctxt->input->cur))) {
320 if ((*ctxt->input->cur == 0) &&
321 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
322 xmlPopInput(ctxt);
323 } else {
324 if (*(ctxt->input->cur) == '\n') {
325 ctxt->input->line++; ctxt->input->col = 1;
326 } else ctxt->input->col++;
327 ctxt->input->cur++;
328 ctxt->nbChars++;
329 if (*ctxt->input->cur == 0)
330 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
331 }
332 res++;
333 }
334 return(res);
335}
336
337
338
339/************************************************************************
340 * *
341 * The list of HTML elements and their properties *
342 * *
343 ************************************************************************/
344
/*
 * Table of the known HTML 4.0 elements, sorted by name. htmlTagLookup()
 * scans it linearly, comparing names case-insensitively.
 *
 * Start Tag: 1 means the start tag can be omitted
 * End Tag: 1 means the end tag can be omitted
 * 2 means it's forbidden (empty elements)
 * Depr: this element is deprecated
 * DTD: 1 means that this element is valid only in the Loose DTD
 * 2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
 */
htmlElemDesc html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
{ "acronym", 0, 0, 0, 0, 0, 0, "" },
{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
{ "b", 0, 0, 0, 0, 0, 0, "bold text style" },
{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
{ "big", 0, 0, 0, 0, 0, 0, "large text style" },
{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
{ "body", 1, 1, 0, 0, 0, 0, "document body " },
{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
{ "button", 0, 0, 0, 0, 0, 0, "push button " },
{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
{ "center", 0, 0, 0, 0, 1, 1, "shorthand for div align=center " },
{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
{ "col", 0, 2, 2, 1, 0, 0, "table column " },
{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
{ "em", 0, 0, 0, 0, 0, 0, "emphasis" },
{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
{ "font", 0, 0, 0, 0, 1, 1, "local change to font " },
{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
{ "head", 1, 1, 0, 0, 0, 0, "document head " },
{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
{ "i", 0, 0, 0, 0, 0, 0, "italic text style" },
{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
{ "input", 0, 2, 2, 1, 0, 0, "form control " },
{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
{ "li", 0, 1, 1, 0, 0, 0, "list item " },
{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
{ "s", 0, 0, 0, 0, 1, 1, "strike-through text style" },
{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
{ "small", 0, 0, 0, 0, 0, 0, "small text style" },
{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
{ "strike", 0, 0, 0, 0, 1, 1, "strike-through text" },
{ "strong", 0, 0, 0, 0, 0, 0, "strong emphasis" },
{ "style", 0, 0, 0, 0, 0, 0, "style info " },
{ "sub", 0, 0, 0, 0, 0, 0, "subscript" },
{ "sup", 0, 0, 0, 0, 0, 0, "superscript " },
{ "table", 0, 0, 0, 0, 0, 0, "&#160;" },
{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
{ "title", 0, 0, 0, 0, 0, 0, "document title " },
{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
};
448
/*
 * Start tags that imply the end of a current element.
 * The table is a sequence of NULL-terminated groups (one per source line),
 * closed by an extra NULL: any tag of a group implies the end of the
 * current element if the type of that element is in the same group.
 */
const char *htmlEquEnd[] = {
"dt", "dd", "li", "option", NULL,
"h1", "h2", "h3", "h4", "h5", "h6", NULL,
"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
NULL
};
/*
 * According to the HTML DTD, HR should be added to the 2nd line above, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
465
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups, closed by an extra
 * NULL. The first name of a group is the tag being opened; the names
 * following it are the open elements that this tag closes implicitly
 * (see htmlCheckAutoClose()). htmlInitAutoClose() records the start of
 * each group in htmlStartCloseIndex, which only has room for 99 groups.
 */
const char *htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
 "dl", "ul", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
 "pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
 "xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div", "p", "head", NULL,
"noscript", "p", "head", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
 "listing", "xmp", "a", NULL,
"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
 "pre", "listing", "xmp", "a", NULL,
NULL
};
525
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * NULL terminated; htmlCheckParagraph() compares the current element
 * against each entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 * implied paragraph
 */
static const char *htmlNoContentElements[] = {
 "html",
 "head",
 "body",
 NULL
};
539
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 * it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",    /* was misspelled "onrest", which matches no attribute */
    "onchange",
    "onselect"
};
565
566
/*
 * Index into htmlStartClose: each used slot points to the first name of
 * one group, unused slots are NULL. Filled by htmlInitAutoClose(), which
 * sets the flag below once done.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
569
570/************************************************************************
571 * *
572 * functions to handle HTML specific data *
573 * *
574 ************************************************************************/
575
576/**
577 * htmlInitAutoClose:
578 *
579 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
580 * This is not reentrant. Call xmlInitParser() once before processing in
581 * case of use in multithreaded programs.
582 */
583void
584htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000585 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000586
587 if (htmlStartCloseIndexinitialized) return;
588
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000589 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
590 indx = 0;
591 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
592 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000593 while (htmlStartClose[i] != NULL) i++;
594 i++;
595 }
596 htmlStartCloseIndexinitialized = 1;
597}
598
599/**
600 * htmlTagLookup:
601 * @tag: The tag name in lowercase
602 *
603 * Lookup the HTML tag in the ElementTable
604 *
605 * Returns the related htmlElemDescPtr or NULL if not found.
606 */
607htmlElemDescPtr
608htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000609 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000610
611 for (i = 0; i < (sizeof(html40ElementTable) /
612 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000613 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Owen Taylor3473f882001-02-23 17:55:21 +0000614 return(&html40ElementTable[i]);
615 }
616 return(NULL);
617}
618
619/**
620 * htmlCheckAutoClose:
621 * @newtag: The new tag name
622 * @oldtag: The old tag name
623 *
624 * Checks wether the new tag is one of the registered valid tags for closing old.
625 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
626 *
627 * Returns 0 if no, 1 if yes.
628 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000629static int
Owen Taylor3473f882001-02-23 17:55:21 +0000630htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000631 int i, indx;
632 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000633
634 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
635
636 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000637 for (indx = 0; indx < 100;indx++) {
638 closed = htmlStartCloseIndex[indx];
639 if (closed == NULL) return(0);
640 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000641 }
642
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000643 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000644 i++;
645 while (htmlStartClose[i] != NULL) {
646 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
647 return(1);
648 }
649 i++;
650 }
651 return(0);
652}
653
654/**
655 * htmlAutoCloseOnClose:
656 * @ctxt: an HTML parser context
657 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000658 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000659 *
660 * The HTmL DtD allows an ending tag to implicitely close other tags.
661 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000662static void
Owen Taylor3473f882001-02-23 17:55:21 +0000663htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
664 htmlElemDescPtr info;
665 xmlChar *oldname;
666 int i;
667
668#ifdef DEBUG
669 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
670 for (i = 0;i < ctxt->nameNr;i++)
671 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
672#endif
673
674 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
675 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
676 }
677 if (i < 0) return;
678
679 while (!xmlStrEqual(newtag, ctxt->name)) {
680 info = htmlTagLookup(ctxt->name);
681 if ((info == NULL) || (info->endTag == 1)) {
682#ifdef DEBUG
683 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
684#endif
685 } else {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000686 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000687 }
688 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
689 ctxt->sax->endElement(ctxt->userData, ctxt->name);
690 oldname = htmlnamePop(ctxt);
691 if (oldname != NULL) {
692#ifdef DEBUG
693 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
694#endif
695 xmlFree(oldname);
696 }
697 }
698}
699
700/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000701 * htmlAutoCloseOnEnd:
702 * @ctxt: an HTML parser context
703 *
704 * Close all remaining tags at the end of the stream
705 */
706static void
707htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
708 xmlChar *oldname;
709 int i;
710
711 if (ctxt->nameNr == 0)
712 return;
713#ifdef DEBUG
714 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
715#endif
716
717 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
718#ifdef DEBUG
719 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
720#endif
721 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
722 ctxt->sax->endElement(ctxt->userData, ctxt->name);
723 oldname = htmlnamePop(ctxt);
724 if (oldname != NULL) {
725#ifdef DEBUG
726 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
727#endif
728 xmlFree(oldname);
729 }
730 }
731}
732
733/**
Owen Taylor3473f882001-02-23 17:55:21 +0000734 * htmlAutoClose:
735 * @ctxt: an HTML parser context
736 * @newtag: The new tag name or NULL
737 *
738 * The HTmL DtD allows a tag to implicitely close other tags.
739 * The list is kept in htmlStartClose array. This function is
740 * called when a new tag has been detected and generates the
741 * appropriates closes if possible/needed.
742 * If newtag is NULL this mean we are at the end of the resource
743 * and we should check
744 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000745static void
Owen Taylor3473f882001-02-23 17:55:21 +0000746htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
747 xmlChar *oldname;
748 while ((newtag != NULL) && (ctxt->name != NULL) &&
749 (htmlCheckAutoClose(newtag, ctxt->name))) {
750#ifdef DEBUG
751 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
752#endif
753 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
754 ctxt->sax->endElement(ctxt->userData, ctxt->name);
755 oldname = htmlnamePop(ctxt);
756 if (oldname != NULL) {
757#ifdef DEBUG
758 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
759#endif
760 xmlFree(oldname);
761 }
762 }
763 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000764 htmlAutoCloseOnEnd(ctxt);
765 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000766 }
767 while ((newtag == NULL) && (ctxt->name != NULL) &&
768 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
769 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
770 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
771#ifdef DEBUG
772 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
773#endif
774 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
775 ctxt->sax->endElement(ctxt->userData, ctxt->name);
776 oldname = htmlnamePop(ctxt);
777 if (oldname != NULL) {
778#ifdef DEBUG
779 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
780#endif
781 xmlFree(oldname);
782 }
783 }
784
785}
786
787/**
788 * htmlAutoCloseTag:
789 * @doc: the HTML document
790 * @name: The tag name
791 * @elem: the HTML element
792 *
793 * The HTmL DtD allows a tag to implicitely close other tags.
794 * The list is kept in htmlStartClose array. This function checks
795 * if the element or one of it's children would autoclose the
796 * given tag.
797 *
798 * Returns 1 if autoclose, 0 otherwise
799 */
800int
801htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
802 htmlNodePtr child;
803
804 if (elem == NULL) return(1);
805 if (xmlStrEqual(name, elem->name)) return(0);
806 if (htmlCheckAutoClose(elem->name, name)) return(1);
807 child = elem->children;
808 while (child != NULL) {
809 if (htmlAutoCloseTag(doc, name, child)) return(1);
810 child = child->next;
811 }
812 return(0);
813}
814
815/**
816 * htmlIsAutoClosed:
817 * @doc: the HTML document
818 * @elem: the HTML element
819 *
820 * The HTmL DtD allows a tag to implicitely close other tags.
821 * The list is kept in htmlStartClose array. This function checks
822 * if a tag is autoclosed by one of it's child
823 *
824 * Returns 1 if autoclosed, 0 otherwise
825 */
826int
827htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
828 htmlNodePtr child;
829
830 if (elem == NULL) return(1);
831 child = elem->children;
832 while (child != NULL) {
833 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
834 child = child->next;
835 }
836 return(0);
837}
838
839/**
840 * htmlCheckImplied:
841 * @ctxt: an HTML parser context
842 * @newtag: The new tag name
843 *
844 * The HTML DtD allows a tag to exists only implicitely
845 * called when a new tag has been detected and generates the
846 * appropriates implicit tags if missing
847 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000848static void
Owen Taylor3473f882001-02-23 17:55:21 +0000849htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
850 if (!htmlOmittedDefaultValue)
851 return;
852 if (xmlStrEqual(newtag, BAD_CAST"html"))
853 return;
854 if (ctxt->nameNr <= 0) {
855#ifdef DEBUG
856 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
857#endif
858 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
859 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
860 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
861 }
862 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
863 return;
864 if ((ctxt->nameNr <= 1) &&
865 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
866 (xmlStrEqual(newtag, BAD_CAST"style")) ||
867 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
868 (xmlStrEqual(newtag, BAD_CAST"link")) ||
869 (xmlStrEqual(newtag, BAD_CAST"title")) ||
870 (xmlStrEqual(newtag, BAD_CAST"base")))) {
871 /*
872 * dropped OBJECT ... i you put it first BODY will be
873 * assumed !
874 */
875#ifdef DEBUG
876 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
877#endif
878 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
879 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
880 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
881 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
882 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
883 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
884 int i;
885 for (i = 0;i < ctxt->nameNr;i++) {
886 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
887 return;
888 }
889 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
890 return;
891 }
892 }
893
894#ifdef DEBUG
895 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
896#endif
897 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
898 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
899 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
900 }
901}
902
903/**
904 * htmlCheckParagraph
905 * @ctxt: an HTML parser context
906 *
907 * Check whether a p element need to be implied before inserting
908 * characters in the current element.
909 *
910 * Returns 1 if a paragraph has been inserted, 0 if not and -1
911 * in case of error.
912 */
913
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000914static int
Owen Taylor3473f882001-02-23 17:55:21 +0000915htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
916 const xmlChar *tag;
917 int i;
918
919 if (ctxt == NULL)
920 return(-1);
921 tag = ctxt->name;
922 if (tag == NULL) {
923 htmlAutoClose(ctxt, BAD_CAST"p");
924 htmlCheckImplied(ctxt, BAD_CAST"p");
925 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
926 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
927 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
928 return(1);
929 }
930 if (!htmlOmittedDefaultValue)
931 return(0);
932 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
933 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
934#ifdef DEBUG
935 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
936#endif
937 htmlAutoClose(ctxt, BAD_CAST"p");
938 htmlCheckImplied(ctxt, BAD_CAST"p");
939 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
940 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
941 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
942 return(1);
943 }
944 }
945 return(0);
946}
947
948/**
949 * htmlIsScriptAttribute:
950 * @name: an attribute name
951 *
952 * Check if an attribute is of content type Script
953 *
954 * Returns 1 is the attribute is a script 0 otherwise
955 */
956int
957htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000958 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000959
960 if (name == NULL)
961 return(0);
962 /*
963 * all script attributes start with 'on'
964 */
965 if ((name[0] != 'o') || (name[1] != 'n'))
966 return(0);
967 for (i = 0;
968 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
969 i++) {
970 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
971 return(1);
972 }
973 return(0);
974}
975
976/************************************************************************
977 * *
978 * The list of HTML predefined entities *
979 * *
980 ************************************************************************/
981
982
/*
 * Table of the predefined HTML 4.0 character entities. Each row holds the
 * Unicode code point, the entity name and a description of the character.
 *
 * The rows must stay sorted by increasing code point:
 * htmlEntityValueLookup() scans the table in order and gives up at the
 * first row whose value is greater than the one it is looking for.
 */
983htmlEntityDesc html40EntitiesTable[] = {
984/*
985 * the 4 absolute ones, plus apostrophe.
986 */
987{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
988{ 38, "amp", "ampersand, U+0026 ISOnum" },
989{ 39, "apos", "single quote" },
990{ 60, "lt", "less-than sign, U+003C ISOnum" },
991{ 62, "gt", "greater-than sign, U+003E ISOnum" },
992
993/*
994 * A bunch still in the 128-255 range
995 * Replacing them depend really on the charset used.
996 */
997{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
998{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
999{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1000{ 163, "pound","pound sign, U+00A3 ISOnum" },
1001{ 164, "curren","currency sign, U+00A4 ISOnum" },
1002{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1003{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1004{ 167, "sect", "section sign, U+00A7 ISOnum" },
1005{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1006{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1007{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1008{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1009{ 172, "not", "not sign, U+00AC ISOnum" },
1010{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1011{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1012{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1013{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1014{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1015{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1016{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1017{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1018{ 181, "micro","micro sign, U+00B5 ISOnum" },
1019{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1020{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1021{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1022{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1023{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1024{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1025{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1026{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1027{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1028{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1029{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1030{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1031{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1032{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1033{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1034{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1035{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1036{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1037{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1038{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1039{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1040{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1041{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1042{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1043{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1044{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1045{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1046{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1047{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1048{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1049{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1050{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1051{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1052{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1053{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1054{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1055{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1056{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1057{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1058{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1059{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1060{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1061{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1062{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1063{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1064{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1065{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1066{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1067{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1068{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1069{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1070{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1071{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1072{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1073{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1074{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1075{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1076{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1077{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1078{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1079{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1080{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1081{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1082{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1083{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1084{ 247, "divide","division sign, U+00F7 ISOnum" },
1085{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1086{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1087{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1088{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1089{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1090{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1091{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1092{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1093
1094{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1095{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1096{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1097{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1098{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1099
1100/*
1101 * Anything below should really be kept as entities references
1102 */
1103{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1104
1105{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1106{ 732, "tilde","small tilde, U+02DC ISOdia" },
1107
1108{ 913, "Alpha","greek capital letter alpha, U+0391" },
1109{ 914, "Beta", "greek capital letter beta, U+0392" },
1110{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1111{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1112{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1113{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1114{ 919, "Eta", "greek capital letter eta, U+0397" },
1115{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1116{ 921, "Iota", "greek capital letter iota, U+0399" },
1117{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001118{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001119{ 924, "Mu", "greek capital letter mu, U+039C" },
1120{ 925, "Nu", "greek capital letter nu, U+039D" },
1121{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1122{ 927, "Omicron","greek capital letter omicron, U+039F" },
1123{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1124{ 929, "Rho", "greek capital letter rho, U+03A1" },
1125{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1126{ 932, "Tau", "greek capital letter tau, U+03A4" },
1127{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1128{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1129{ 935, "Chi", "greek capital letter chi, U+03A7" },
1130{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1131{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1132
1133{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1134{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1135{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1136{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1137{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1138{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1139{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1140{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1141{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1142{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1143{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1144{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1145{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1146{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1147{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1148{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1149{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1150{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1151{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1152{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1153{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1154{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1155{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1156{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1157{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1158{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1159{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1160{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1161
1162{ 8194, "ensp", "en space, U+2002 ISOpub" },
1163{ 8195, "emsp", "em space, U+2003 ISOpub" },
1164{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1165{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1166{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1167{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1168{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1169{ 8211, "ndash","en dash, U+2013 ISOpub" },
1170{ 8212, "mdash","em dash, U+2014 ISOpub" },
1171{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1172{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1173{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1174{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1175{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1176{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1177{ 8224, "dagger","dagger, U+2020 ISOpub" },
1178{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1179
1180{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1181{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1182
1183{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1184
1185{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1186{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1187
1188{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1189{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1190
1191{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1192{ 8260, "frasl","fraction slash, U+2044 NEW" },
1193
1194{ 8364, "euro", "euro sign, U+20AC NEW" },
1195
1196{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1197{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1198{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1199{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1200{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1201{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1202{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1203{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1204{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1205{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1206{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1207{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1208{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1209{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1210{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1211{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1212
1213{ 8704, "forall","for all, U+2200 ISOtech" },
1214{ 8706, "part", "partial differential, U+2202 ISOtech" },
1215{ 8707, "exist","there exists, U+2203 ISOtech" },
1216{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1217{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1218{ 8712, "isin", "element of, U+2208 ISOtech" },
1219{ 8713, "notin","not an element of, U+2209 ISOtech" },
1220{ 8715, "ni", "contains as member, U+220B ISOtech" },
1221{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1222{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1223{ 8722, "minus","minus sign, U+2212 ISOtech" },
1224{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1225{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1226{ 8733, "prop", "proportional to, U+221D ISOtech" },
1227{ 8734, "infin","infinity, U+221E ISOtech" },
1228{ 8736, "ang", "angle, U+2220 ISOamso" },
1229{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1230{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1231{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1232{ 8746, "cup", "union = cup, U+222A ISOtech" },
1233{ 8747, "int", "integral, U+222B ISOtech" },
1234{ 8756, "there4","therefore, U+2234 ISOtech" },
1235{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1236{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1237{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1238{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1239{ 8801, "equiv","identical to, U+2261 ISOtech" },
1240{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1241{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1242{ 8834, "sub", "subset of, U+2282 ISOtech" },
1243{ 8835, "sup", "superset of, U+2283 ISOtech" },
1244{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1245{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1246{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1247{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1248{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1249{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1250{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1251{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1252{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1253{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1254{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1255{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1256{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1257{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1258
1259{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1260{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1261{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1262{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1263
1264};
1265
1266/************************************************************************
1267 * *
1268 * Commodity functions to handle entities *
1269 * *
1270 ************************************************************************/
1271
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The result of xmlRealloc() goes through a temporary so that the old
 * allocation is not lost on failure (assuming xmlRealloc() follows the
 * realloc() contract and leaves the old block allocated). In that case
 * the old buffer is freed and the enclosing function returns NULL, so
 * the macro may only be used in functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                                 buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1283
1284/**
1285 * htmlEntityLookup:
1286 * @name: the entity name
1287 *
1288 * Lookup the given entity in EntitiesTable
1289 *
1290 * TODO: the linear scan is really ugly, an hash table is really needed.
1291 *
1292 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1293 */
1294htmlEntityDescPtr
1295htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001296 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001297
1298 for (i = 0;i < (sizeof(html40EntitiesTable)/
1299 sizeof(html40EntitiesTable[0]));i++) {
1300 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1301#ifdef DEBUG
1302 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1303#endif
1304 return(&html40EntitiesTable[i]);
1305 }
1306 }
1307 return(NULL);
1308}
1309
1310/**
1311 * htmlEntityValueLookup:
1312 * @value: the entity's unicode value
1313 *
1314 * Lookup the given entity in EntitiesTable
1315 *
1316 * TODO: the linear scan is really ugly, an hash table is really needed.
1317 *
1318 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1319 */
1320htmlEntityDescPtr
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001321htmlEntityValueLookup(unsigned int value) {
1322 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001323#ifdef DEBUG
1324 int lv = 0;
1325#endif
1326
1327 for (i = 0;i < (sizeof(html40EntitiesTable)/
1328 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001329 if (html40EntitiesTable[i].value >= value) {
1330 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001331 break;
1332#ifdef DEBUG
1333 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1334#endif
1335 return(&html40EntitiesTable[i]);
1336 }
1337#ifdef DEBUG
1338 if (lv > html40EntitiesTable[i].value) {
1339 xmlGenericError(xmlGenericErrorContext,
1340 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1341 lv, html40EntitiesTable[i].value);
1342 }
1343 lv = html40EntitiesTable[i].value;
1344#endif
1345 }
1346 return(NULL);
1347}
1348
1349/**
1350 * UTF8ToHtml:
1351 * @out: a pointer to an array of bytes to store the result
1352 * @outlen: the length of @out
1353 * @in: a pointer to an array of UTF-8 chars
1354 * @inlen: the length of @in
1355 *
1356 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1357 * plus HTML entities block of chars out.
1358 *
1359 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1360 * The value of @inlen after return is the number of octets consumed
1361 * as the return value is positive, else unpredictiable.
1362 * The value of @outlen after return is the number of octets consumed.
1363 */
1364int
1365UTF8ToHtml(unsigned char* out, int *outlen,
1366 const unsigned char* in, int *inlen) {
1367 const unsigned char* processed = in;
1368 const unsigned char* outend;
1369 const unsigned char* outstart = out;
1370 const unsigned char* instart = in;
1371 const unsigned char* inend;
1372 unsigned int c, d;
1373 int trailing;
1374
1375 if (in == NULL) {
1376 /*
1377 * initialization nothing to do
1378 */
1379 *outlen = 0;
1380 *inlen = 0;
1381 return(0);
1382 }
1383 inend = in + (*inlen);
1384 outend = out + (*outlen);
1385 while (in < inend) {
1386 d = *in++;
1387 if (d < 0x80) { c= d; trailing= 0; }
1388 else if (d < 0xC0) {
1389 /* trailing byte in leading position */
1390 *outlen = out - outstart;
1391 *inlen = processed - instart;
1392 return(-2);
1393 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1394 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1395 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1396 else {
1397 /* no chance for this in Ascii */
1398 *outlen = out - outstart;
1399 *inlen = processed - instart;
1400 return(-2);
1401 }
1402
1403 if (inend - in < trailing) {
1404 break;
1405 }
1406
1407 for ( ; trailing; trailing--) {
1408 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1409 break;
1410 c <<= 6;
1411 c |= d & 0x3F;
1412 }
1413
1414 /* assertion: c is a single UTF-4 value */
1415 if (c < 0x80) {
1416 if (out + 1 >= outend)
1417 break;
1418 *out++ = c;
1419 } else {
1420 int len;
1421 htmlEntityDescPtr ent;
1422
1423 /*
1424 * Try to lookup a predefined HTML entity for it
1425 */
1426
1427 ent = htmlEntityValueLookup(c);
1428 if (ent == NULL) {
1429 /* no chance for this in Ascii */
1430 *outlen = out - outstart;
1431 *inlen = processed - instart;
1432 return(-2);
1433 }
1434 len = strlen(ent->name);
1435 if (out + 2 + len >= outend)
1436 break;
1437 *out++ = '&';
1438 memcpy(out, ent->name, len);
1439 out += len;
1440 *out++ = ';';
1441 }
1442 processed = in;
1443 }
1444 *outlen = out - outstart;
1445 *inlen = processed - instart;
1446 return(0);
1447}
1448
1449/**
1450 * htmlEncodeEntities:
1451 * @out: a pointer to an array of bytes to store the result
1452 * @outlen: the length of @out
1453 * @in: a pointer to an array of UTF-8 chars
1454 * @inlen: the length of @in
1455 * @quoteChar: the quote character to escape (' or ") or zero.
1456 *
1457 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1458 * plus HTML entities block of chars out.
1459 *
1460 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1461 * The value of @inlen after return is the number of octets consumed
1462 * as the return value is positive, else unpredictiable.
1463 * The value of @outlen after return is the number of octets consumed.
1464 */
1465int
1466htmlEncodeEntities(unsigned char* out, int *outlen,
1467 const unsigned char* in, int *inlen, int quoteChar) {
1468 const unsigned char* processed = in;
1469 const unsigned char* outend = out + (*outlen);
1470 const unsigned char* outstart = out;
1471 const unsigned char* instart = in;
1472 const unsigned char* inend = in + (*inlen);
1473 unsigned int c, d;
1474 int trailing;
1475
1476 while (in < inend) {
1477 d = *in++;
1478 if (d < 0x80) { c= d; trailing= 0; }
1479 else if (d < 0xC0) {
1480 /* trailing byte in leading position */
1481 *outlen = out - outstart;
1482 *inlen = processed - instart;
1483 return(-2);
1484 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1485 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1486 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1487 else {
1488 /* no chance for this in Ascii */
1489 *outlen = out - outstart;
1490 *inlen = processed - instart;
1491 return(-2);
1492 }
1493
1494 if (inend - in < trailing)
1495 break;
1496
1497 while (trailing--) {
1498 if (((d= *in++) & 0xC0) != 0x80) {
1499 *outlen = out - outstart;
1500 *inlen = processed - instart;
1501 return(-2);
1502 }
1503 c <<= 6;
1504 c |= d & 0x3F;
1505 }
1506
1507 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001508 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1509 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001510 if (out >= outend)
1511 break;
1512 *out++ = c;
1513 } else {
1514 htmlEntityDescPtr ent;
1515 const char *cp;
1516 char nbuf[16];
1517 int len;
1518
1519 /*
1520 * Try to lookup a predefined HTML entity for it
1521 */
1522 ent = htmlEntityValueLookup(c);
1523 if (ent == NULL) {
1524 sprintf(nbuf, "#%u", c);
1525 cp = nbuf;
1526 }
1527 else
1528 cp = ent->name;
1529 len = strlen(cp);
1530 if (out + 2 + len > outend)
1531 break;
1532 *out++ = '&';
1533 memcpy(out, cp, len);
1534 out += len;
1535 *out++ = ';';
1536 }
1537 processed = in;
1538 }
1539 *outlen = out - outstart;
1540 *inlen = processed - instart;
1541 return(0);
1542}
1543
1544/**
1545 * htmlDecodeEntities:
1546 * @ctxt: the parser context
1547 * @len: the len to decode (in bytes !), -1 for no size limit
1548 * @end: an end marker xmlChar, 0 if none
1549 * @end2: an end marker xmlChar, 0 if none
1550 * @end3: an end marker xmlChar, 0 if none
1551 *
1552 * Subtitute the HTML entities by their value
1553 *
1554 * DEPRECATED !!!!
1555 *
1556 * Returns A newly allocated string with the substitution done. The caller
1557 * must deallocate it !
1558 */
1559xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001560htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1561 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001562 static int deprecated = 0;
1563 if (!deprecated) {
1564 xmlGenericError(xmlGenericErrorContext,
1565 "htmlDecodeEntities() deprecated function reached\n");
1566 deprecated = 1;
1567 }
1568 return(NULL);
1569#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001570 xmlChar *name = NULL;
1571 xmlChar *buffer = NULL;
1572 unsigned int buffer_size = 0;
1573 unsigned int nbchars = 0;
1574 htmlEntityDescPtr ent;
1575 unsigned int max = (unsigned int) len;
1576 int c,l;
1577
1578 if (ctxt->depth > 40) {
1579 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1580 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1581 ctxt->sax->error(ctxt->userData,
1582 "Detected entity reference loop\n");
1583 ctxt->wellFormed = 0;
1584 ctxt->disableSAX = 1;
1585 return(NULL);
1586 }
1587
1588 /*
1589 * allocate a translation buffer.
1590 */
1591 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1592 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1593 if (buffer == NULL) {
1594 perror("xmlDecodeEntities: malloc failed");
1595 return(NULL);
1596 }
1597
1598 /*
1599 * Ok loop until we reach one of the ending char or a size limit.
1600 */
1601 c = CUR_CHAR(l);
1602 while ((nbchars < max) && (c != end) &&
1603 (c != end2) && (c != end3)) {
1604
1605 if (c == 0) break;
1606 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1607 int val = htmlParseCharRef(ctxt);
1608 COPY_BUF(0,buffer,nbchars,val);
1609 NEXTL(l);
1610 } else if ((c == '&') && (ctxt->token != '&')) {
1611 ent = htmlParseEntityRef(ctxt, &name);
1612 if (name != NULL) {
1613 if (ent != NULL) {
1614 int val = ent->value;
1615 COPY_BUF(0,buffer,nbchars,val);
1616 NEXTL(l);
1617 } else {
1618 const xmlChar *cur = name;
1619
1620 buffer[nbchars++] = '&';
1621 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1622 growBuffer(buffer);
1623 }
1624 while (*cur != 0) {
1625 buffer[nbchars++] = *cur++;
1626 }
1627 buffer[nbchars++] = ';';
1628 }
1629 }
1630 } else {
1631 COPY_BUF(l,buffer,nbchars,c);
1632 NEXTL(l);
1633 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1634 growBuffer(buffer);
1635 }
1636 }
1637 c = CUR_CHAR(l);
1638 }
1639 buffer[nbchars++] = 0;
1640 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001641#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001642}
1643
1644/************************************************************************
1645 * *
1646 * Commodity functions to handle streams *
1647 * *
1648 ************************************************************************/
1649
1650/**
Owen Taylor3473f882001-02-23 17:55:21 +00001651 * htmlNewInputStream:
1652 * @ctxt: an HTML parser context
1653 *
1654 * Create a new input stream structure
1655 * Returns the new input stream or NULL
1656 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001657static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001658htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1659 htmlParserInputPtr input;
1660
1661 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1662 if (input == NULL) {
1663 ctxt->errNo = XML_ERR_NO_MEMORY;
1664 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1665 ctxt->sax->error(ctxt->userData,
1666 "malloc: couldn't allocate a new input stream\n");
1667 return(NULL);
1668 }
1669 memset(input, 0, sizeof(htmlParserInput));
1670 input->filename = NULL;
1671 input->directory = NULL;
1672 input->base = NULL;
1673 input->cur = NULL;
1674 input->buf = NULL;
1675 input->line = 1;
1676 input->col = 1;
1677 input->buf = NULL;
1678 input->free = NULL;
1679 input->version = NULL;
1680 input->consumed = 0;
1681 input->length = 0;
1682 return(input);
1683}
1684
1685
1686/************************************************************************
1687 * *
1688 * Commodity functions, cleanup needed ? *
1689 * *
1690 ************************************************************************/
1691
1692/**
1693 * areBlanks:
1694 * @ctxt: an HTML parser context
1695 * @str: a xmlChar *
1696 * @len: the size of @str
1697 *
1698 * Is this a sequence of blank chars that one can ignore ?
1699 *
1700 * Returns 1 if ignorable 0 otherwise.
1701 */
1702
1703static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1704 int i;
1705 xmlNodePtr lastChild;
1706
1707 for (i = 0;i < len;i++)
1708 if (!(IS_BLANK(str[i]))) return(0);
1709
1710 if (CUR == 0) return(1);
1711 if (CUR != '<') return(0);
1712 if (ctxt->name == NULL)
1713 return(1);
1714 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1715 return(1);
1716 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1717 return(1);
1718 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1719 return(1);
1720 if (ctxt->node == NULL) return(0);
1721 lastChild = xmlGetLastChild(ctxt->node);
1722 if (lastChild == NULL) {
1723 if (ctxt->node->content != NULL) return(0);
1724 } else if (xmlNodeIsText(lastChild)) {
1725 return(0);
1726 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1727 return(0);
1728 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1729 return(0);
1730 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1731 return(0);
1732 }
1733 return(1);
1734}
1735
1736/**
Owen Taylor3473f882001-02-23 17:55:21 +00001737 * htmlNewDocNoDtD:
1738 * @URI: URI for the dtd, or NULL
1739 * @ExternalID: the external ID of the DTD, or NULL
1740 *
1741 * Returns a new document, do not intialize the DTD if not provided
1742 */
1743htmlDocPtr
1744htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1745 xmlDocPtr cur;
1746
1747 /*
1748 * Allocate a new document and fill the fields.
1749 */
1750 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1751 if (cur == NULL) {
1752 xmlGenericError(xmlGenericErrorContext,
1753 "xmlNewDoc : malloc failed\n");
1754 return(NULL);
1755 }
1756 memset(cur, 0, sizeof(xmlDoc));
1757
1758 cur->type = XML_HTML_DOCUMENT_NODE;
1759 cur->version = NULL;
1760 cur->intSubset = NULL;
1761 if ((ExternalID != NULL) ||
1762 (URI != NULL))
1763 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1764 cur->doc = cur;
1765 cur->name = NULL;
1766 cur->children = NULL;
1767 cur->extSubset = NULL;
1768 cur->oldNs = NULL;
1769 cur->encoding = NULL;
1770 cur->standalone = 1;
1771 cur->compression = 0;
1772 cur->ids = NULL;
1773 cur->refs = NULL;
1774#ifndef XML_WITHOUT_CORBA
1775 cur->_private = NULL;
1776#endif
1777 return(cur);
1778}
1779
1780/**
1781 * htmlNewDoc:
1782 * @URI: URI for the dtd, or NULL
1783 * @ExternalID: the external ID of the DTD, or NULL
1784 *
1785 * Returns a new document
1786 */
1787htmlDocPtr
1788htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1789 if ((URI == NULL) && (ExternalID == NULL))
1790 return(htmlNewDocNoDtD(
1791 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1792 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1793
1794 return(htmlNewDocNoDtD(URI, ExternalID));
1795}
1796
1797
1798/************************************************************************
1799 * *
1800 * The parser itself *
1801 * Relates to http://www.w3.org/TR/html40 *
1802 * *
1803 ************************************************************************/
1804
1805/************************************************************************
1806 * *
1807 * The parser itself *
1808 * *
1809 ************************************************************************/
1810
1811/**
1812 * htmlParseHTMLName:
1813 * @ctxt: an HTML parser context
1814 *
1815 * parse an HTML tag or attribute name, note that we convert it to lowercase
1816 * since HTML names are not case-sensitive.
1817 *
1818 * Returns the Tag Name parsed or NULL
1819 */
1820
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001821static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001822htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1823 xmlChar *ret = NULL;
1824 int i = 0;
1825 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1826
1827 if (!IS_LETTER(CUR) && (CUR != '_') &&
1828 (CUR != ':')) return(NULL);
1829
1830 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1831 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1832 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1833 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1834 else loc[i] = CUR;
1835 i++;
1836
1837 NEXT;
1838 }
1839
1840 ret = xmlStrndup(loc, i);
1841
1842 return(ret);
1843}
1844
1845/**
1846 * htmlParseName:
1847 * @ctxt: an HTML parser context
1848 *
1849 * parse an HTML name, this routine is case sensistive.
1850 *
1851 * Returns the Name parsed or NULL
1852 */
1853
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001854static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001855htmlParseName(htmlParserCtxtPtr ctxt) {
1856 xmlChar buf[HTML_MAX_NAMELEN];
1857 int len = 0;
1858
1859 GROW;
1860 if (!IS_LETTER(CUR) && (CUR != '_')) {
1861 return(NULL);
1862 }
1863
1864 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1865 (CUR == '.') || (CUR == '-') ||
1866 (CUR == '_') || (CUR == ':') ||
1867 (IS_COMBINING(CUR)) ||
1868 (IS_EXTENDER(CUR))) {
1869 buf[len++] = CUR;
1870 NEXT;
1871 if (len >= HTML_MAX_NAMELEN) {
1872 xmlGenericError(xmlGenericErrorContext,
1873 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1874 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1875 (CUR == '.') || (CUR == '-') ||
1876 (CUR == '_') || (CUR == ':') ||
1877 (IS_COMBINING(CUR)) ||
1878 (IS_EXTENDER(CUR)))
1879 NEXT;
1880 break;
1881 }
1882 }
1883 return(xmlStrndup(buf, len));
1884}
1885
1886/**
1887 * htmlParseHTMLAttribute:
1888 * @ctxt: an HTML parser context
1889 * @stop: a char stop value
1890 *
1891 * parse an HTML attribute value till the stop (quote), if
1892 * stop is 0 then it stops at the first space
1893 *
1894 * Returns the attribute parsed or NULL
1895 */
1896
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001897static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001898htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1899 xmlChar *buffer = NULL;
1900 int buffer_size = 0;
1901 xmlChar *out = NULL;
1902 xmlChar *name = NULL;
1903
1904 xmlChar *cur = NULL;
1905 htmlEntityDescPtr ent;
1906
1907 /*
1908 * allocate a translation buffer.
1909 */
1910 buffer_size = HTML_PARSER_BUFFER_SIZE;
1911 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1912 if (buffer == NULL) {
1913 perror("htmlParseHTMLAttribute: malloc failed");
1914 return(NULL);
1915 }
1916 out = buffer;
1917
1918 /*
1919 * Ok loop until we reach one of the ending chars
1920 */
1921 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1922 if ((stop == 0) && (IS_BLANK(CUR))) break;
1923 if (CUR == '&') {
1924 if (NXT(1) == '#') {
1925 unsigned int c;
1926 int bits;
1927
1928 c = htmlParseCharRef(ctxt);
1929 if (c < 0x80)
1930 { *out++ = c; bits= -6; }
1931 else if (c < 0x800)
1932 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1933 else if (c < 0x10000)
1934 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1935 else
1936 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1937
1938 for ( ; bits >= 0; bits-= 6) {
1939 *out++ = ((c >> bits) & 0x3F) | 0x80;
1940 }
1941 } else {
1942 ent = htmlParseEntityRef(ctxt, &name);
1943 if (name == NULL) {
1944 *out++ = '&';
1945 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001946 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001947
1948 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001949 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001950 }
1951 } else if (ent == NULL) {
1952 *out++ = '&';
1953 cur = name;
1954 while (*cur != 0) {
1955 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001956 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001957
1958 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001959 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001960 }
1961 *out++ = *cur++;
1962 }
1963 xmlFree(name);
1964 } else {
1965 unsigned int c;
1966 int bits;
1967
1968 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001969 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001970
1971 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001972 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001973 }
1974 c = (xmlChar)ent->value;
1975 if (c < 0x80)
1976 { *out++ = c; bits= -6; }
1977 else if (c < 0x800)
1978 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1979 else if (c < 0x10000)
1980 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1981 else
1982 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1983
1984 for ( ; bits >= 0; bits-= 6) {
1985 *out++ = ((c >> bits) & 0x3F) | 0x80;
1986 }
1987 xmlFree(name);
1988 }
1989 }
1990 } else {
1991 unsigned int c;
1992 int bits, l;
1993
1994 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001995 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001996
1997 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001998 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001999 }
2000 c = CUR_CHAR(l);
2001 if (c < 0x80)
2002 { *out++ = c; bits= -6; }
2003 else if (c < 0x800)
2004 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2005 else if (c < 0x10000)
2006 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2007 else
2008 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2009
2010 for ( ; bits >= 0; bits-= 6) {
2011 *out++ = ((c >> bits) & 0x3F) | 0x80;
2012 }
2013 NEXT;
2014 }
2015 }
2016 *out++ = 0;
2017 return(buffer);
2018}
2019
2020/**
Owen Taylor3473f882001-02-23 17:55:21 +00002021 * htmlParseEntityRef:
2022 * @ctxt: an HTML parser context
2023 * @str: location to store the entity name
2024 *
2025 * parse an HTML ENTITY references
2026 *
2027 * [68] EntityRef ::= '&' Name ';'
2028 *
2029 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2030 * if non-NULL *str will have to be freed by the caller.
2031 */
2032htmlEntityDescPtr
2033htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2034 xmlChar *name;
2035 htmlEntityDescPtr ent = NULL;
2036 *str = NULL;
2037
2038 if (CUR == '&') {
2039 NEXT;
2040 name = htmlParseName(ctxt);
2041 if (name == NULL) {
2042 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2043 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2044 ctxt->wellFormed = 0;
2045 } else {
2046 GROW;
2047 if (CUR == ';') {
2048 *str = name;
2049
2050 /*
2051 * Lookup the entity in the table.
2052 */
2053 ent = htmlEntityLookup(name);
2054 if (ent != NULL) /* OK that's ugly !!! */
2055 NEXT;
2056 } else {
2057 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2058 ctxt->sax->error(ctxt->userData,
2059 "htmlParseEntityRef: expecting ';'\n");
2060 *str = name;
2061 }
2062 }
2063 }
2064 return(ent);
2065}
2066
2067/**
2068 * htmlParseAttValue:
2069 * @ctxt: an HTML parser context
2070 *
2071 * parse a value for an attribute
2072 * Note: the parser won't do substitution of entities here, this
2073 * will be handled later in xmlStringGetNodeList, unless it was
2074 * asked for ctxt->replaceEntities != 0
2075 *
2076 * Returns the AttValue parsed or NULL.
2077 */
2078
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002079static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002080htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2081 xmlChar *ret = NULL;
2082
2083 if (CUR == '"') {
2084 NEXT;
2085 ret = htmlParseHTMLAttribute(ctxt, '"');
2086 if (CUR != '"') {
2087 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2088 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2089 ctxt->wellFormed = 0;
2090 } else
2091 NEXT;
2092 } else if (CUR == '\'') {
2093 NEXT;
2094 ret = htmlParseHTMLAttribute(ctxt, '\'');
2095 if (CUR != '\'') {
2096 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2097 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2098 ctxt->wellFormed = 0;
2099 } else
2100 NEXT;
2101 } else {
2102 /*
2103 * That's an HTMLism, the attribute value may not be quoted
2104 */
2105 ret = htmlParseHTMLAttribute(ctxt, 0);
2106 if (ret == NULL) {
2107 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2108 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2109 ctxt->wellFormed = 0;
2110 }
2111 }
2112 return(ret);
2113}
2114
2115/**
2116 * htmlParseSystemLiteral:
2117 * @ctxt: an HTML parser context
2118 *
2119 * parse an HTML Literal
2120 *
2121 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2122 *
2123 * Returns the SystemLiteral parsed or NULL
2124 */
2125
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002126static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002127htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2128 const xmlChar *q;
2129 xmlChar *ret = NULL;
2130
2131 if (CUR == '"') {
2132 NEXT;
2133 q = CUR_PTR;
2134 while ((IS_CHAR(CUR)) && (CUR != '"'))
2135 NEXT;
2136 if (!IS_CHAR(CUR)) {
2137 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2138 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2139 ctxt->wellFormed = 0;
2140 } else {
2141 ret = xmlStrndup(q, CUR_PTR - q);
2142 NEXT;
2143 }
2144 } else if (CUR == '\'') {
2145 NEXT;
2146 q = CUR_PTR;
2147 while ((IS_CHAR(CUR)) && (CUR != '\''))
2148 NEXT;
2149 if (!IS_CHAR(CUR)) {
2150 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2151 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2152 ctxt->wellFormed = 0;
2153 } else {
2154 ret = xmlStrndup(q, CUR_PTR - q);
2155 NEXT;
2156 }
2157 } else {
2158 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2159 ctxt->sax->error(ctxt->userData,
2160 "SystemLiteral \" or ' expected\n");
2161 ctxt->wellFormed = 0;
2162 }
2163
2164 return(ret);
2165}
2166
2167/**
2168 * htmlParsePubidLiteral:
2169 * @ctxt: an HTML parser context
2170 *
2171 * parse an HTML public literal
2172 *
2173 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2174 *
2175 * Returns the PubidLiteral parsed or NULL.
2176 */
2177
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002178static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002179htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2180 const xmlChar *q;
2181 xmlChar *ret = NULL;
2182 /*
2183 * Name ::= (Letter | '_') (NameChar)*
2184 */
2185 if (CUR == '"') {
2186 NEXT;
2187 q = CUR_PTR;
2188 while (IS_PUBIDCHAR(CUR)) NEXT;
2189 if (CUR != '"') {
2190 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2191 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2192 ctxt->wellFormed = 0;
2193 } else {
2194 ret = xmlStrndup(q, CUR_PTR - q);
2195 NEXT;
2196 }
2197 } else if (CUR == '\'') {
2198 NEXT;
2199 q = CUR_PTR;
2200 while ((IS_LETTER(CUR)) && (CUR != '\''))
2201 NEXT;
2202 if (!IS_LETTER(CUR)) {
2203 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2204 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2205 ctxt->wellFormed = 0;
2206 } else {
2207 ret = xmlStrndup(q, CUR_PTR - q);
2208 NEXT;
2209 }
2210 } else {
2211 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2212 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2213 ctxt->wellFormed = 0;
2214 }
2215
2216 return(ret);
2217}
2218
2219/**
2220 * htmlParseScript:
2221 * @ctxt: an HTML parser context
2222 *
2223 * parse the content of an HTML SCRIPT or STYLE element
2224 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2225 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2226 * http://www.w3.org/TR/html4/types.html#type-script
2227 * http://www.w3.org/TR/html4/types.html#h-6.15
2228 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2229 *
2230 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2231 * element and the value of intrinsic event attributes. User agents must
2232 * not evaluate script data as HTML markup but instead must pass it on as
2233 * data to a script engine.
2234 * NOTES:
2235 * - The content is passed like CDATA
2236 * - the attributes for style and scripting "onXXX" are also described
2237 * as CDATA but SGML allows entities references in attributes so their
2238 * processing is identical as other attributes
2239 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002240static void
Owen Taylor3473f882001-02-23 17:55:21 +00002241htmlParseScript(htmlParserCtxtPtr ctxt) {
2242 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2243 int nbchar = 0;
2244 xmlChar cur;
2245
2246 SHRINK;
2247 cur = CUR;
2248 while (IS_CHAR(cur)) {
2249 if ((cur == '<') && (NXT(1) == '/')) {
2250 /*
2251 * One should break here, the specification is clear:
2252 * Authors should therefore escape "</" within the content.
2253 * Escape mechanisms are specific to each scripting or
2254 * style sheet language.
2255 */
2256 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2257 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2258 break; /* while */
2259 }
2260 buf[nbchar++] = cur;
2261 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2262 if (ctxt->sax->cdataBlock!= NULL) {
2263 /*
2264 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2265 */
2266 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2267 }
2268 nbchar = 0;
2269 }
2270 NEXT;
2271 cur = CUR;
2272 }
2273 if (!(IS_CHAR(cur))) {
2274 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2275 ctxt->sax->error(ctxt->userData,
2276 "Invalid char in CDATA 0x%X\n", cur);
2277 ctxt->wellFormed = 0;
2278 NEXT;
2279 }
2280
2281 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2282 if (ctxt->sax->cdataBlock!= NULL) {
2283 /*
2284 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2285 */
2286 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2287 }
2288 }
2289}
2290
2291
2292/**
2293 * htmlParseCharData:
2294 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002295 *
2296 * parse a CharData section.
2297 * if we are within a CDATA section ']]>' marks an end of section.
2298 *
2299 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2300 */
2301
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002302static void
2303htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002304 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2305 int nbchar = 0;
2306 int cur, l;
2307
2308 SHRINK;
2309 cur = CUR_CHAR(l);
2310 while (((cur != '<') || (ctxt->token == '<')) &&
2311 ((cur != '&') || (ctxt->token == '&')) &&
2312 (IS_CHAR(cur))) {
2313 COPY_BUF(l,buf,nbchar,cur);
2314 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2315 /*
2316 * Ok the segment is to be consumed as chars.
2317 */
2318 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2319 if (areBlanks(ctxt, buf, nbchar)) {
2320 if (ctxt->sax->ignorableWhitespace != NULL)
2321 ctxt->sax->ignorableWhitespace(ctxt->userData,
2322 buf, nbchar);
2323 } else {
2324 htmlCheckParagraph(ctxt);
2325 if (ctxt->sax->characters != NULL)
2326 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2327 }
2328 }
2329 nbchar = 0;
2330 }
2331 NEXTL(l);
2332 cur = CUR_CHAR(l);
2333 }
2334 if (nbchar != 0) {
2335 /*
2336 * Ok the segment is to be consumed as chars.
2337 */
2338 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2339 if (areBlanks(ctxt, buf, nbchar)) {
2340 if (ctxt->sax->ignorableWhitespace != NULL)
2341 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2342 } else {
2343 htmlCheckParagraph(ctxt);
2344 if (ctxt->sax->characters != NULL)
2345 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2346 }
2347 }
2348 }
2349}
2350
2351/**
2352 * htmlParseExternalID:
2353 * @ctxt: an HTML parser context
2354 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002355 *
2356 * Parse an External ID or a Public ID
2357 *
Owen Taylor3473f882001-02-23 17:55:21 +00002358 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2359 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2360 *
2361 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2362 *
2363 * Returns the function returns SystemLiteral and in the second
2364 * case publicID receives PubidLiteral, is strict is off
2365 * it is possible to return NULL and have publicID set.
2366 */
2367
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002368static xmlChar *
2369htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002370 xmlChar *URI = NULL;
2371
2372 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2373 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2374 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2375 SKIP(6);
2376 if (!IS_BLANK(CUR)) {
2377 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2378 ctxt->sax->error(ctxt->userData,
2379 "Space required after 'SYSTEM'\n");
2380 ctxt->wellFormed = 0;
2381 }
2382 SKIP_BLANKS;
2383 URI = htmlParseSystemLiteral(ctxt);
2384 if (URI == NULL) {
2385 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2386 ctxt->sax->error(ctxt->userData,
2387 "htmlParseExternalID: SYSTEM, no URI\n");
2388 ctxt->wellFormed = 0;
2389 }
2390 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2391 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2392 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2393 SKIP(6);
2394 if (!IS_BLANK(CUR)) {
2395 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2396 ctxt->sax->error(ctxt->userData,
2397 "Space required after 'PUBLIC'\n");
2398 ctxt->wellFormed = 0;
2399 }
2400 SKIP_BLANKS;
2401 *publicID = htmlParsePubidLiteral(ctxt);
2402 if (*publicID == NULL) {
2403 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2404 ctxt->sax->error(ctxt->userData,
2405 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2406 ctxt->wellFormed = 0;
2407 }
2408 SKIP_BLANKS;
2409 if ((CUR == '"') || (CUR == '\'')) {
2410 URI = htmlParseSystemLiteral(ctxt);
2411 }
2412 }
2413 return(URI);
2414}
2415
2416/**
2417 * htmlParseComment:
2418 * @ctxt: an HTML parser context
2419 *
2420 * Parse an XML (SGML) comment <!-- .... -->
2421 *
2422 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2423 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002424static void
Owen Taylor3473f882001-02-23 17:55:21 +00002425htmlParseComment(htmlParserCtxtPtr ctxt) {
2426 xmlChar *buf = NULL;
2427 int len;
2428 int size = HTML_PARSER_BUFFER_SIZE;
2429 int q, ql;
2430 int r, rl;
2431 int cur, l;
2432 xmlParserInputState state;
2433
2434 /*
2435 * Check that there is a comment right here.
2436 */
2437 if ((RAW != '<') || (NXT(1) != '!') ||
2438 (NXT(2) != '-') || (NXT(3) != '-')) return;
2439
2440 state = ctxt->instate;
2441 ctxt->instate = XML_PARSER_COMMENT;
2442 SHRINK;
2443 SKIP(4);
2444 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2445 if (buf == NULL) {
2446 xmlGenericError(xmlGenericErrorContext,
2447 "malloc of %d byte failed\n", size);
2448 ctxt->instate = state;
2449 return;
2450 }
2451 q = CUR_CHAR(ql);
2452 NEXTL(ql);
2453 r = CUR_CHAR(rl);
2454 NEXTL(rl);
2455 cur = CUR_CHAR(l);
2456 len = 0;
2457 while (IS_CHAR(cur) &&
2458 ((cur != '>') ||
2459 (r != '-') || (q != '-'))) {
2460 if (len + 5 >= size) {
2461 size *= 2;
2462 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2463 if (buf == NULL) {
2464 xmlGenericError(xmlGenericErrorContext,
2465 "realloc of %d byte failed\n", size);
2466 ctxt->instate = state;
2467 return;
2468 }
2469 }
2470 COPY_BUF(ql,buf,len,q);
2471 q = r;
2472 ql = rl;
2473 r = cur;
2474 rl = l;
2475 NEXTL(l);
2476 cur = CUR_CHAR(l);
2477 if (cur == 0) {
2478 SHRINK;
2479 GROW;
2480 cur = CUR_CHAR(l);
2481 }
2482 }
2483 buf[len] = 0;
2484 if (!IS_CHAR(cur)) {
2485 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2486 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2487 ctxt->sax->error(ctxt->userData,
2488 "Comment not terminated \n<!--%.50s\n", buf);
2489 ctxt->wellFormed = 0;
2490 xmlFree(buf);
2491 } else {
2492 NEXT;
2493 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2494 (!ctxt->disableSAX))
2495 ctxt->sax->comment(ctxt->userData, buf);
2496 xmlFree(buf);
2497 }
2498 ctxt->instate = state;
2499}
2500
2501/**
2502 * htmlParseCharRef:
2503 * @ctxt: an HTML parser context
2504 *
2505 * parse Reference declarations
2506 *
2507 * [66] CharRef ::= '&#' [0-9]+ ';' |
2508 * '&#x' [0-9a-fA-F]+ ';'
2509 *
2510 * Returns the value parsed (as an int)
2511 */
2512int
2513htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2514 int val = 0;
2515
2516 if ((CUR == '&') && (NXT(1) == '#') &&
2517 (NXT(2) == 'x')) {
2518 SKIP(3);
2519 while (CUR != ';') {
2520 if ((CUR >= '0') && (CUR <= '9'))
2521 val = val * 16 + (CUR - '0');
2522 else if ((CUR >= 'a') && (CUR <= 'f'))
2523 val = val * 16 + (CUR - 'a') + 10;
2524 else if ((CUR >= 'A') && (CUR <= 'F'))
2525 val = val * 16 + (CUR - 'A') + 10;
2526 else {
2527 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2528 ctxt->sax->error(ctxt->userData,
2529 "htmlParseCharRef: invalid hexadecimal value\n");
2530 ctxt->wellFormed = 0;
2531 return(0);
2532 }
2533 NEXT;
2534 }
2535 if (CUR == ';')
2536 NEXT;
2537 } else if ((CUR == '&') && (NXT(1) == '#')) {
2538 SKIP(2);
2539 while (CUR != ';') {
2540 if ((CUR >= '0') && (CUR <= '9'))
2541 val = val * 10 + (CUR - '0');
2542 else {
2543 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2544 ctxt->sax->error(ctxt->userData,
2545 "htmlParseCharRef: invalid decimal value\n");
2546 ctxt->wellFormed = 0;
2547 return(0);
2548 }
2549 NEXT;
2550 }
2551 if (CUR == ';')
2552 NEXT;
2553 } else {
2554 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2555 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2556 ctxt->wellFormed = 0;
2557 }
2558 /*
2559 * Check the value IS_CHAR ...
2560 */
2561 if (IS_CHAR(val)) {
2562 return(val);
2563 } else {
2564 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2565 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2566 val);
2567 ctxt->wellFormed = 0;
2568 }
2569 return(0);
2570}
2571
2572
2573/**
2574 * htmlParseDocTypeDecl :
2575 * @ctxt: an HTML parser context
2576 *
2577 * parse a DOCTYPE declaration
2578 *
2579 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2580 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2581 */
2582
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002583static void
Owen Taylor3473f882001-02-23 17:55:21 +00002584htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2585 xmlChar *name;
2586 xmlChar *ExternalID = NULL;
2587 xmlChar *URI = NULL;
2588
2589 /*
2590 * We know that '<!DOCTYPE' has been detected.
2591 */
2592 SKIP(9);
2593
2594 SKIP_BLANKS;
2595
2596 /*
2597 * Parse the DOCTYPE name.
2598 */
2599 name = htmlParseName(ctxt);
2600 if (name == NULL) {
2601 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2602 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2603 ctxt->wellFormed = 0;
2604 }
2605 /*
2606 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2607 */
2608
2609 SKIP_BLANKS;
2610
2611 /*
2612 * Check for SystemID and ExternalID
2613 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002614 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002615 SKIP_BLANKS;
2616
2617 /*
2618 * We should be at the end of the DOCTYPE declaration.
2619 */
2620 if (CUR != '>') {
2621 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2622 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2623 ctxt->wellFormed = 0;
2624 /* We shouldn't try to resynchronize ... */
2625 }
2626 NEXT;
2627
2628 /*
2629 * Create or update the document accordingly to the DOCTYPE
2630 */
2631 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2632 (!ctxt->disableSAX))
2633 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2634
2635 /*
2636 * Cleanup, since we don't use all those identifiers
2637 */
2638 if (URI != NULL) xmlFree(URI);
2639 if (ExternalID != NULL) xmlFree(ExternalID);
2640 if (name != NULL) xmlFree(name);
2641}
2642
2643/**
2644 * htmlParseAttribute:
2645 * @ctxt: an HTML parser context
2646 * @value: a xmlChar ** used to store the value of the attribute
2647 *
2648 * parse an attribute
2649 *
2650 * [41] Attribute ::= Name Eq AttValue
2651 *
2652 * [25] Eq ::= S? '=' S?
2653 *
2654 * With namespace:
2655 *
2656 * [NS 11] Attribute ::= QName Eq AttValue
2657 *
2658 * Also the case QName == xmlns:??? is handled independently as a namespace
2659 * definition.
2660 *
2661 * Returns the attribute name, and the value in *value.
2662 */
2663
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002664static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002665htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2666 xmlChar *name, *val = NULL;
2667
2668 *value = NULL;
2669 name = htmlParseHTMLName(ctxt);
2670 if (name == NULL) {
2671 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2672 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2673 ctxt->wellFormed = 0;
2674 return(NULL);
2675 }
2676
2677 /*
2678 * read the value
2679 */
2680 SKIP_BLANKS;
2681 if (CUR == '=') {
2682 NEXT;
2683 SKIP_BLANKS;
2684 val = htmlParseAttValue(ctxt);
2685 /******
2686 } else {
2687 * TODO : some attribute must have values, some may not
2688 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2689 ctxt->sax->warning(ctxt->userData,
2690 "No value for attribute %s\n", name); */
2691 }
2692
2693 *value = val;
2694 return(name);
2695}
2696
2697/**
2698 * htmlCheckEncoding:
2699 * @ctxt: an HTML parser context
2700 * @attvalue: the attribute value
2701 *
2702 * Checks an http-equiv attribute from a Meta tag to detect
2703 * the encoding
2704 * If a new encoding is detected the parser is switched to decode
2705 * it and pass UTF8
2706 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002707static void
Owen Taylor3473f882001-02-23 17:55:21 +00002708htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2709 const xmlChar *encoding;
2710
2711 if ((ctxt == NULL) || (attvalue == NULL))
2712 return;
2713
2714 /* do not change encoding */
2715 if (ctxt->input->encoding != NULL)
2716 return;
2717
2718 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2719 if (encoding != NULL) {
2720 encoding += 8;
2721 } else {
2722 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2723 if (encoding != NULL)
2724 encoding += 9;
2725 }
2726 if (encoding != NULL) {
2727 xmlCharEncoding enc;
2728 xmlCharEncodingHandlerPtr handler;
2729
2730 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2731
2732 if (ctxt->input->encoding != NULL)
2733 xmlFree((xmlChar *) ctxt->input->encoding);
2734 ctxt->input->encoding = xmlStrdup(encoding);
2735
2736 enc = xmlParseCharEncoding((const char *) encoding);
2737 /*
2738 * registered set of known encodings
2739 */
2740 if (enc != XML_CHAR_ENCODING_ERROR) {
2741 xmlSwitchEncoding(ctxt, enc);
2742 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2743 } else {
2744 /*
2745 * fallback for unknown encodings
2746 */
2747 handler = xmlFindCharEncodingHandler((const char *) encoding);
2748 if (handler != NULL) {
2749 xmlSwitchToEncoding(ctxt, handler);
2750 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2751 } else {
2752 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2753 }
2754 }
2755
2756 if ((ctxt->input->buf != NULL) &&
2757 (ctxt->input->buf->encoder != NULL) &&
2758 (ctxt->input->buf->raw != NULL) &&
2759 (ctxt->input->buf->buffer != NULL)) {
2760 int nbchars;
2761 int processed;
2762
2763 /*
2764 * convert as much as possible to the parser reading buffer.
2765 */
2766 processed = ctxt->input->cur - ctxt->input->base;
2767 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2768 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2769 ctxt->input->buf->buffer,
2770 ctxt->input->buf->raw);
2771 if (nbchars < 0) {
2772 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2773 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2774 ctxt->sax->error(ctxt->userData,
2775 "htmlCheckEncoding: encoder error\n");
2776 }
2777 ctxt->input->base =
2778 ctxt->input->cur = ctxt->input->buf->buffer->content;
2779 }
2780 }
2781}
2782
2783/**
2784 * htmlCheckMeta:
2785 * @ctxt: an HTML parser context
2786 * @atts: the attributes values
2787 *
2788 * Checks an attributes from a Meta tag
2789 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002790static void
Owen Taylor3473f882001-02-23 17:55:21 +00002791htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2792 int i;
2793 const xmlChar *att, *value;
2794 int http = 0;
2795 const xmlChar *content = NULL;
2796
2797 if ((ctxt == NULL) || (atts == NULL))
2798 return;
2799
2800 i = 0;
2801 att = atts[i++];
2802 while (att != NULL) {
2803 value = atts[i++];
2804 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2805 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2806 http = 1;
2807 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2808 content = value;
2809 att = atts[i++];
2810 }
2811 if ((http) && (content != NULL))
2812 htmlCheckEncoding(ctxt, content);
2813
2814}
2815
2816/**
2817 * htmlParseStartTag:
2818 * @ctxt: an HTML parser context
2819 *
2820 * parse a start of tag either for rule element or
2821 * EmptyElement. In both case we don't parse the tag closing chars.
2822 *
2823 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2824 *
2825 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2826 *
2827 * With namespace:
2828 *
2829 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2830 *
2831 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2832 *
2833 */
2834
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002835static void
Owen Taylor3473f882001-02-23 17:55:21 +00002836htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2837 xmlChar *name;
2838 xmlChar *attname;
2839 xmlChar *attvalue;
2840 const xmlChar **atts = NULL;
2841 int nbatts = 0;
2842 int maxatts = 0;
2843 int meta = 0;
2844 int i;
2845
2846 if (CUR != '<') return;
2847 NEXT;
2848
2849 GROW;
2850 name = htmlParseHTMLName(ctxt);
2851 if (name == NULL) {
2852 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2853 ctxt->sax->error(ctxt->userData,
2854 "htmlParseStartTag: invalid element name\n");
2855 ctxt->wellFormed = 0;
2856 /* Dump the bogus tag like browsers do */
2857 while ((IS_CHAR(CUR)) && (CUR != '>'))
2858 NEXT;
2859 return;
2860 }
2861 if (xmlStrEqual(name, BAD_CAST"meta"))
2862 meta = 1;
2863
2864 /*
2865 * Check for auto-closure of HTML elements.
2866 */
2867 htmlAutoClose(ctxt, name);
2868
2869 /*
2870 * Check for implied HTML elements.
2871 */
2872 htmlCheckImplied(ctxt, name);
2873
2874 /*
2875 * Avoid html at any level > 0, head at any level != 1
2876 * or any attempt to recurse body
2877 */
2878 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2879 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2880 ctxt->sax->error(ctxt->userData,
2881 "htmlParseStartTag: misplaced <html> tag\n");
2882 ctxt->wellFormed = 0;
2883 xmlFree(name);
2884 return;
2885 }
2886 if ((ctxt->nameNr != 1) &&
2887 (xmlStrEqual(name, BAD_CAST"head"))) {
2888 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2889 ctxt->sax->error(ctxt->userData,
2890 "htmlParseStartTag: misplaced <head> tag\n");
2891 ctxt->wellFormed = 0;
2892 xmlFree(name);
2893 return;
2894 }
2895 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002896 int indx;
2897 for (indx = 0;indx < ctxt->nameNr;indx++) {
2898 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002899 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2900 ctxt->sax->error(ctxt->userData,
2901 "htmlParseStartTag: misplaced <body> tag\n");
2902 ctxt->wellFormed = 0;
2903 xmlFree(name);
2904 return;
2905 }
2906 }
2907 }
2908
2909 /*
2910 * Now parse the attributes, it ends up with the ending
2911 *
2912 * (S Attribute)* S?
2913 */
2914 SKIP_BLANKS;
2915 while ((IS_CHAR(CUR)) &&
2916 (CUR != '>') &&
2917 ((CUR != '/') || (NXT(1) != '>'))) {
2918 long cons = ctxt->nbChars;
2919
2920 GROW;
2921 attname = htmlParseAttribute(ctxt, &attvalue);
2922 if (attname != NULL) {
2923
2924 /*
2925 * Well formedness requires at most one declaration of an attribute
2926 */
2927 for (i = 0; i < nbatts;i += 2) {
2928 if (xmlStrEqual(atts[i], attname)) {
2929 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2930 ctxt->sax->error(ctxt->userData,
2931 "Attribute %s redefined\n",
2932 attname);
2933 ctxt->wellFormed = 0;
2934 xmlFree(attname);
2935 if (attvalue != NULL)
2936 xmlFree(attvalue);
2937 goto failed;
2938 }
2939 }
2940
2941 /*
2942 * Add the pair to atts
2943 */
2944 if (atts == NULL) {
2945 maxatts = 10;
2946 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2947 if (atts == NULL) {
2948 xmlGenericError(xmlGenericErrorContext,
2949 "malloc of %ld byte failed\n",
2950 maxatts * (long)sizeof(xmlChar *));
2951 if (name != NULL) xmlFree(name);
2952 return;
2953 }
2954 } else if (nbatts + 4 > maxatts) {
2955 maxatts *= 2;
2956 atts = (const xmlChar **) xmlRealloc((void *) atts,
2957 maxatts * sizeof(xmlChar *));
2958 if (atts == NULL) {
2959 xmlGenericError(xmlGenericErrorContext,
2960 "realloc of %ld byte failed\n",
2961 maxatts * (long)sizeof(xmlChar *));
2962 if (name != NULL) xmlFree(name);
2963 return;
2964 }
2965 }
2966 atts[nbatts++] = attname;
2967 atts[nbatts++] = attvalue;
2968 atts[nbatts] = NULL;
2969 atts[nbatts + 1] = NULL;
2970 }
2971 else {
2972 /* Dump the bogus attribute string up to the next blank or
2973 * the end of the tag. */
2974 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
2975 && ((CUR != '/') || (NXT(1) != '>')))
2976 NEXT;
2977 }
2978
2979failed:
2980 SKIP_BLANKS;
2981 if (cons == ctxt->nbChars) {
2982 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2983 ctxt->sax->error(ctxt->userData,
2984 "htmlParseStartTag: problem parsing attributes\n");
2985 ctxt->wellFormed = 0;
2986 break;
2987 }
2988 }
2989
2990 /*
2991 * Handle specific association to the META tag
2992 */
2993 if (meta)
2994 htmlCheckMeta(ctxt, atts);
2995
2996 /*
2997 * SAX: Start of Element !
2998 */
2999 htmlnamePush(ctxt, xmlStrdup(name));
3000#ifdef DEBUG
3001 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3002#endif
3003 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3004 ctxt->sax->startElement(ctxt->userData, name, atts);
3005
3006 if (atts != NULL) {
3007 for (i = 0;i < nbatts;i++) {
3008 if (atts[i] != NULL)
3009 xmlFree((xmlChar *) atts[i]);
3010 }
3011 xmlFree((void *) atts);
3012 }
3013 if (name != NULL) xmlFree(name);
3014}
3015
3016/**
3017 * htmlParseEndTag:
3018 * @ctxt: an HTML parser context
3019 *
3020 * parse an end of tag
3021 *
3022 * [42] ETag ::= '</' Name S? '>'
3023 *
3024 * With namespace
3025 *
3026 * [NS 9] ETag ::= '</' QName S? '>'
3027 */
3028
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003029static void
Owen Taylor3473f882001-02-23 17:55:21 +00003030htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3031 xmlChar *name;
3032 xmlChar *oldname;
3033 int i;
3034
3035 if ((CUR != '<') || (NXT(1) != '/')) {
3036 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3037 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3038 ctxt->wellFormed = 0;
3039 return;
3040 }
3041 SKIP(2);
3042
3043 name = htmlParseHTMLName(ctxt);
3044 if (name == NULL) return;
3045
3046 /*
3047 * We should definitely be at the ending "S? '>'" part
3048 */
3049 SKIP_BLANKS;
3050 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3051 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3052 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3053 ctxt->wellFormed = 0;
3054 } else
3055 NEXT;
3056
3057 /*
3058 * If the name read is not one of the element in the parsing stack
3059 * then return, it's just an error.
3060 */
3061 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3062 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3063 }
3064 if (i < 0) {
3065 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3066 ctxt->sax->error(ctxt->userData,
3067 "Unexpected end tag : %s\n", name);
3068 xmlFree(name);
3069 ctxt->wellFormed = 0;
3070 return;
3071 }
3072
3073
3074 /*
3075 * Check for auto-closure of HTML elements.
3076 */
3077
3078 htmlAutoCloseOnClose(ctxt, name);
3079
3080 /*
3081 * Well formedness constraints, opening and closing must match.
3082 * With the exception that the autoclose may have popped stuff out
3083 * of the stack.
3084 */
3085 if (!xmlStrEqual(name, ctxt->name)) {
3086#ifdef DEBUG
3087 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3088#endif
3089 if ((ctxt->name != NULL) &&
3090 (!xmlStrEqual(ctxt->name, name))) {
3091 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3092 ctxt->sax->error(ctxt->userData,
3093 "Opening and ending tag mismatch: %s and %s\n",
3094 name, ctxt->name);
3095 ctxt->wellFormed = 0;
3096 }
3097 }
3098
3099 /*
3100 * SAX: End of Tag
3101 */
3102 oldname = ctxt->name;
3103 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3104 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3105 ctxt->sax->endElement(ctxt->userData, name);
3106 oldname = htmlnamePop(ctxt);
3107 if (oldname != NULL) {
3108#ifdef DEBUG
3109 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3110#endif
3111 xmlFree(oldname);
3112#ifdef DEBUG
3113 } else {
3114 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3115#endif
3116 }
3117 }
3118
3119 if (name != NULL)
3120 xmlFree(name);
3121
3122 return;
3123}
3124
3125
3126/**
3127 * htmlParseReference:
3128 * @ctxt: an HTML parser context
3129 *
3130 * parse and handle entity references in content,
3131 * this will end-up in a call to character() since this is either a
3132 * CharRef, or a predefined entity.
3133 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003134static void
Owen Taylor3473f882001-02-23 17:55:21 +00003135htmlParseReference(htmlParserCtxtPtr ctxt) {
3136 htmlEntityDescPtr ent;
3137 xmlChar out[6];
3138 xmlChar *name;
3139 if (CUR != '&') return;
3140
3141 if (NXT(1) == '#') {
3142 unsigned int c;
3143 int bits, i = 0;
3144
3145 c = htmlParseCharRef(ctxt);
3146 if (c == 0)
3147 return;
3148
3149 if (c < 0x80) { out[i++]= c; bits= -6; }
3150 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3151 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3152 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3153
3154 for ( ; bits >= 0; bits-= 6) {
3155 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3156 }
3157 out[i] = 0;
3158
3159 htmlCheckParagraph(ctxt);
3160 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3161 ctxt->sax->characters(ctxt->userData, out, i);
3162 } else {
3163 ent = htmlParseEntityRef(ctxt, &name);
3164 if (name == NULL) {
3165 htmlCheckParagraph(ctxt);
3166 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3167 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3168 return;
3169 }
3170 if ((ent == NULL) || (ent->value <= 0)) {
3171 htmlCheckParagraph(ctxt);
3172 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3173 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3174 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3175 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3176 }
3177 } else {
3178 unsigned int c;
3179 int bits, i = 0;
3180
3181 c = ent->value;
3182 if (c < 0x80)
3183 { out[i++]= c; bits= -6; }
3184 else if (c < 0x800)
3185 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3186 else if (c < 0x10000)
3187 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3188 else
3189 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3190
3191 for ( ; bits >= 0; bits-= 6) {
3192 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3193 }
3194 out[i] = 0;
3195
3196 htmlCheckParagraph(ctxt);
3197 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3198 ctxt->sax->characters(ctxt->userData, out, i);
3199 }
3200 xmlFree(name);
3201 }
3202}
3203
3204/**
3205 * htmlParseContent:
3206 * @ctxt: an HTML parser context
3207 * @name: the node name
3208 *
3209 * Parse a content: comment, sub-element, reference or text.
3210 *
3211 */
3212
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003213static void
Owen Taylor3473f882001-02-23 17:55:21 +00003214htmlParseContent(htmlParserCtxtPtr ctxt) {
3215 xmlChar *currentNode;
3216 int depth;
3217
3218 currentNode = xmlStrdup(ctxt->name);
3219 depth = ctxt->nameNr;
3220 while (1) {
3221 long cons = ctxt->nbChars;
3222
3223 GROW;
3224 /*
3225 * Our tag or one of it's parent or children is ending.
3226 */
3227 if ((CUR == '<') && (NXT(1) == '/')) {
3228 htmlParseEndTag(ctxt);
3229 if (currentNode != NULL) xmlFree(currentNode);
3230 return;
3231 }
3232
3233 /*
3234 * Has this node been popped out during parsing of
3235 * the next element
3236 */
3237 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
3238 (depth >= ctxt->nameNr)) {
3239 if (currentNode != NULL) xmlFree(currentNode);
3240 return;
3241 }
3242
Daniel Veillardf9533d12001-03-03 10:04:57 +00003243 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3244 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003245 /*
3246 * Handle SCRIPT/STYLE separately
3247 */
3248 htmlParseScript(ctxt);
3249 } else {
3250 /*
3251 * Sometimes DOCTYPE arrives in the middle of the document
3252 */
3253 if ((CUR == '<') && (NXT(1) == '!') &&
3254 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3255 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3256 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3257 (UPP(8) == 'E')) {
3258 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3259 ctxt->sax->error(ctxt->userData,
3260 "Misplaced DOCTYPE declaration\n");
3261 ctxt->wellFormed = 0;
3262 htmlParseDocTypeDecl(ctxt);
3263 }
3264
3265 /*
3266 * First case : a comment
3267 */
3268 if ((CUR == '<') && (NXT(1) == '!') &&
3269 (NXT(2) == '-') && (NXT(3) == '-')) {
3270 htmlParseComment(ctxt);
3271 }
3272
3273 /*
3274 * Second case : a sub-element.
3275 */
3276 else if (CUR == '<') {
3277 htmlParseElement(ctxt);
3278 }
3279
3280 /*
3281 * Third case : a reference. If if has not been resolved,
3282 * parsing returns it's Name, create the node
3283 */
3284 else if (CUR == '&') {
3285 htmlParseReference(ctxt);
3286 }
3287
3288 /*
3289 * Fourth : end of the resource
3290 */
3291 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003292 htmlAutoCloseOnEnd(ctxt);
3293 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003294 }
3295
3296 /*
3297 * Last case, text. Note that References are handled directly.
3298 */
3299 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003300 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003301 }
3302
3303 if (cons == ctxt->nbChars) {
3304 if (ctxt->node != NULL) {
3305 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3306 ctxt->sax->error(ctxt->userData,
3307 "detected an error in element content\n");
3308 ctxt->wellFormed = 0;
3309 }
3310 break;
3311 }
3312 }
3313 GROW;
3314 }
3315 if (currentNode != NULL) xmlFree(currentNode);
3316}
3317
3318/**
3319 * htmlParseElement:
3320 * @ctxt: an HTML parser context
3321 *
3322 * parse an HTML element, this is highly recursive
3323 *
3324 * [39] element ::= EmptyElemTag | STag content ETag
3325 *
3326 * [41] Attribute ::= Name Eq AttValue
3327 */
3328
3329void
3330htmlParseElement(htmlParserCtxtPtr ctxt) {
3331 xmlChar *name;
3332 xmlChar *currentNode = NULL;
3333 htmlElemDescPtr info;
3334 htmlParserNodeInfo node_info;
3335 xmlChar *oldname;
3336 int depth = ctxt->nameNr;
3337
3338 /* Capture start position */
3339 if (ctxt->record_info) {
3340 node_info.begin_pos = ctxt->input->consumed +
3341 (CUR_PTR - ctxt->input->base);
3342 node_info.begin_line = ctxt->input->line;
3343 }
3344
3345 oldname = xmlStrdup(ctxt->name);
3346 htmlParseStartTag(ctxt);
3347 name = ctxt->name;
3348#ifdef DEBUG
3349 if (oldname == NULL)
3350 xmlGenericError(xmlGenericErrorContext,
3351 "Start of element %s\n", name);
3352 else if (name == NULL)
3353 xmlGenericError(xmlGenericErrorContext,
3354 "Start of element failed, was %s\n", oldname);
3355 else
3356 xmlGenericError(xmlGenericErrorContext,
3357 "Start of element %s, was %s\n", name, oldname);
3358#endif
3359 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3360 (name == NULL)) {
3361 if (CUR == '>')
3362 NEXT;
3363 if (oldname != NULL)
3364 xmlFree(oldname);
3365 return;
3366 }
3367 if (oldname != NULL)
3368 xmlFree(oldname);
3369
3370 /*
3371 * Lookup the info for that element.
3372 */
3373 info = htmlTagLookup(name);
3374 if (info == NULL) {
3375 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3376 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3377 name);
3378 ctxt->wellFormed = 0;
3379 } else if (info->depr) {
3380/***************************
3381 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3382 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3383 name);
3384 ***************************/
3385 }
3386
3387 /*
3388 * Check for an Empty Element labelled the XML/SGML way
3389 */
3390 if ((CUR == '/') && (NXT(1) == '>')) {
3391 SKIP(2);
3392 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3393 ctxt->sax->endElement(ctxt->userData, name);
3394 oldname = htmlnamePop(ctxt);
3395#ifdef DEBUG
3396 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3397#endif
3398 if (oldname != NULL)
3399 xmlFree(oldname);
3400 return;
3401 }
3402
3403 if (CUR == '>') {
3404 NEXT;
3405 } else {
3406 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3407 ctxt->sax->error(ctxt->userData,
3408 "Couldn't find end of Start Tag %s\n",
3409 name);
3410 ctxt->wellFormed = 0;
3411
3412 /*
3413 * end of parsing of this node.
3414 */
3415 if (xmlStrEqual(name, ctxt->name)) {
3416 nodePop(ctxt);
3417 oldname = htmlnamePop(ctxt);
3418#ifdef DEBUG
3419 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3420#endif
3421 if (oldname != NULL)
3422 xmlFree(oldname);
3423 }
3424
3425 /*
3426 * Capture end position and add node
3427 */
3428 if ( currentNode != NULL && ctxt->record_info ) {
3429 node_info.end_pos = ctxt->input->consumed +
3430 (CUR_PTR - ctxt->input->base);
3431 node_info.end_line = ctxt->input->line;
3432 node_info.node = ctxt->node;
3433 xmlParserAddNodeInfo(ctxt, &node_info);
3434 }
3435 return;
3436 }
3437
3438 /*
3439 * Check for an Empty Element from DTD definition
3440 */
3441 if ((info != NULL) && (info->empty)) {
3442 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3443 ctxt->sax->endElement(ctxt->userData, name);
3444 oldname = htmlnamePop(ctxt);
3445#ifdef DEBUG
3446 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3447#endif
3448 if (oldname != NULL)
3449 xmlFree(oldname);
3450 return;
3451 }
3452
3453 /*
3454 * Parse the content of the element:
3455 */
3456 currentNode = xmlStrdup(ctxt->name);
3457 depth = ctxt->nameNr;
3458 while (IS_CHAR(CUR)) {
3459 htmlParseContent(ctxt);
3460 if (ctxt->nameNr < depth) break;
3461 }
3462
Owen Taylor3473f882001-02-23 17:55:21 +00003463 /*
3464 * Capture end position and add node
3465 */
3466 if ( currentNode != NULL && ctxt->record_info ) {
3467 node_info.end_pos = ctxt->input->consumed +
3468 (CUR_PTR - ctxt->input->base);
3469 node_info.end_line = ctxt->input->line;
3470 node_info.node = ctxt->node;
3471 xmlParserAddNodeInfo(ctxt, &node_info);
3472 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003473 if (!IS_CHAR(CUR)) {
3474 htmlAutoCloseOnEnd(ctxt);
3475 }
3476
Owen Taylor3473f882001-02-23 17:55:21 +00003477 if (currentNode != NULL)
3478 xmlFree(currentNode);
3479}
3480
3481/**
3482 * htmlParseDocument :
3483 * @ctxt: an HTML parser context
3484 *
3485 * parse an HTML document (and build a tree if using the standard SAX
3486 * interface).
3487 *
3488 * Returns 0, -1 in case of error. the parser context is augmented
3489 * as a result of the parsing.
3490 */
3491
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003492static int
Owen Taylor3473f882001-02-23 17:55:21 +00003493htmlParseDocument(htmlParserCtxtPtr ctxt) {
3494 xmlDtdPtr dtd;
3495
3496 htmlDefaultSAXHandlerInit();
3497 ctxt->html = 1;
3498
3499 GROW;
3500 /*
3501 * SAX: beginning of the document processing.
3502 */
3503 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3504 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3505
3506 /*
3507 * Wipe out everything which is before the first '<'
3508 */
3509 SKIP_BLANKS;
3510 if (CUR == 0) {
3511 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3512 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3513 ctxt->wellFormed = 0;
3514 }
3515
3516 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3517 ctxt->sax->startDocument(ctxt->userData);
3518
3519
3520 /*
3521 * Parse possible comments before any content
3522 */
3523 while ((CUR == '<') && (NXT(1) == '!') &&
3524 (NXT(2) == '-') && (NXT(3) == '-')) {
3525 htmlParseComment(ctxt);
3526 SKIP_BLANKS;
3527 }
3528
3529
3530 /*
3531 * Then possibly doc type declaration(s) and more Misc
3532 * (doctypedecl Misc*)?
3533 */
3534 if ((CUR == '<') && (NXT(1) == '!') &&
3535 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3536 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3537 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3538 (UPP(8) == 'E')) {
3539 htmlParseDocTypeDecl(ctxt);
3540 }
3541 SKIP_BLANKS;
3542
3543 /*
3544 * Parse possible comments before any content
3545 */
3546 while ((CUR == '<') && (NXT(1) == '!') &&
3547 (NXT(2) == '-') && (NXT(3) == '-')) {
3548 htmlParseComment(ctxt);
3549 SKIP_BLANKS;
3550 }
3551
3552 /*
3553 * Time to start parsing the tree itself
3554 */
3555 htmlParseContent(ctxt);
3556
3557 /*
3558 * autoclose
3559 */
3560 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003561 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003562
3563
3564 /*
3565 * SAX: end of the document processing.
3566 */
3567 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3568 ctxt->sax->endDocument(ctxt->userData);
3569
3570 if (ctxt->myDoc != NULL) {
3571 dtd = xmlGetIntSubset(ctxt->myDoc);
3572 if (dtd == NULL)
3573 ctxt->myDoc->intSubset =
3574 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3575 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3576 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3577 }
3578 if (! ctxt->wellFormed) return(-1);
3579 return(0);
3580}
3581
3582
3583/************************************************************************
3584 * *
3585 * Parser contexts handling *
3586 * *
3587 ************************************************************************/
3588
3589/**
3590 * xmlInitParserCtxt:
3591 * @ctxt: an HTML parser context
3592 *
3593 * Initialize a parser context
3594 */
3595
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003596static void
Owen Taylor3473f882001-02-23 17:55:21 +00003597htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3598{
3599 htmlSAXHandler *sax;
3600
3601 if (ctxt == NULL) return;
3602 memset(ctxt, 0, sizeof(htmlParserCtxt));
3603
3604 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3605 if (sax == NULL) {
3606 xmlGenericError(xmlGenericErrorContext,
3607 "htmlInitParserCtxt: out of memory\n");
3608 }
3609 else
3610 memset(sax, 0, sizeof(htmlSAXHandler));
3611
3612 /* Allocate the Input stack */
3613 ctxt->inputTab = (htmlParserInputPtr *)
3614 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3615 if (ctxt->inputTab == NULL) {
3616 xmlGenericError(xmlGenericErrorContext,
3617 "htmlInitParserCtxt: out of memory\n");
3618 ctxt->inputNr = 0;
3619 ctxt->inputMax = 0;
3620 ctxt->input = NULL;
3621 return;
3622 }
3623 ctxt->inputNr = 0;
3624 ctxt->inputMax = 5;
3625 ctxt->input = NULL;
3626 ctxt->version = NULL;
3627 ctxt->encoding = NULL;
3628 ctxt->standalone = -1;
3629 ctxt->instate = XML_PARSER_START;
3630
3631 /* Allocate the Node stack */
3632 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3633 if (ctxt->nodeTab == NULL) {
3634 xmlGenericError(xmlGenericErrorContext,
3635 "htmlInitParserCtxt: out of memory\n");
3636 ctxt->nodeNr = 0;
3637 ctxt->nodeMax = 0;
3638 ctxt->node = NULL;
3639 ctxt->inputNr = 0;
3640 ctxt->inputMax = 0;
3641 ctxt->input = NULL;
3642 return;
3643 }
3644 ctxt->nodeNr = 0;
3645 ctxt->nodeMax = 10;
3646 ctxt->node = NULL;
3647
3648 /* Allocate the Name stack */
3649 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3650 if (ctxt->nameTab == NULL) {
3651 xmlGenericError(xmlGenericErrorContext,
3652 "htmlInitParserCtxt: out of memory\n");
3653 ctxt->nameNr = 0;
3654 ctxt->nameMax = 10;
3655 ctxt->name = NULL;
3656 ctxt->nodeNr = 0;
3657 ctxt->nodeMax = 0;
3658 ctxt->node = NULL;
3659 ctxt->inputNr = 0;
3660 ctxt->inputMax = 0;
3661 ctxt->input = NULL;
3662 return;
3663 }
3664 ctxt->nameNr = 0;
3665 ctxt->nameMax = 10;
3666 ctxt->name = NULL;
3667
3668 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3669 else {
3670 ctxt->sax = sax;
3671 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3672 }
3673 ctxt->userData = ctxt;
3674 ctxt->myDoc = NULL;
3675 ctxt->wellFormed = 1;
3676 ctxt->replaceEntities = 0;
3677 ctxt->html = 1;
3678 ctxt->record_info = 0;
3679 ctxt->validate = 0;
3680 ctxt->nbChars = 0;
3681 ctxt->checkIndex = 0;
3682 xmlInitNodeInfoSeq(&ctxt->node_seq);
3683}
3684
3685/**
3686 * htmlFreeParserCtxt:
3687 * @ctxt: an HTML parser context
3688 *
3689 * Free all the memory used by a parser context. However the parsed
3690 * document in ctxt->myDoc is not freed.
3691 */
3692
3693void
3694htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3695{
3696 xmlFreeParserCtxt(ctxt);
3697}
3698
3699/**
3700 * htmlCreateDocParserCtxt :
3701 * @cur: a pointer to an array of xmlChar
3702 * @encoding: a free form C string describing the HTML document encoding, or NULL
3703 *
3704 * Create a parser context for an HTML document.
3705 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003706 * TODO: check the need to add encoding handling there
3707 *
Owen Taylor3473f882001-02-23 17:55:21 +00003708 * Returns the new parser context or NULL
3709 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003710static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003711htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003712 htmlParserCtxtPtr ctxt;
3713 htmlParserInputPtr input;
3714 /* htmlCharEncoding enc; */
3715
3716 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3717 if (ctxt == NULL) {
3718 perror("malloc");
3719 return(NULL);
3720 }
3721 htmlInitParserCtxt(ctxt);
3722 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3723 if (input == NULL) {
3724 perror("malloc");
3725 xmlFree(ctxt);
3726 return(NULL);
3727 }
3728 memset(input, 0, sizeof(htmlParserInput));
3729
3730 input->line = 1;
3731 input->col = 1;
3732 input->base = cur;
3733 input->cur = cur;
3734
3735 inputPush(ctxt, input);
3736 return(ctxt);
3737}
3738
3739/************************************************************************
3740 * *
3741 * Progressive parsing interfaces *
3742 * *
3743 ************************************************************************/
3744
3745/**
3746 * htmlParseLookupSequence:
3747 * @ctxt: an HTML parser context
3748 * @first: the first char to lookup
3749 * @next: the next char to lookup or zero
3750 * @third: the next char to lookup or zero
3751 *
3752 * Try to find if a sequence (first, next, third) or just (first next) or
3753 * (first) is available in the input stream.
3754 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3755 * to avoid rescanning sequences of bytes, it DOES change the state of the
3756 * parser, do not use liberally.
3757 * This is basically similar to xmlParseLookupSequence()
3758 *
3759 * Returns the index to the current parsing point if the full sequence
3760 * is available, -1 otherwise.
3761 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003762static int
Owen Taylor3473f882001-02-23 17:55:21 +00003763htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3764 xmlChar next, xmlChar third) {
3765 int base, len;
3766 htmlParserInputPtr in;
3767 const xmlChar *buf;
3768
3769 in = ctxt->input;
3770 if (in == NULL) return(-1);
3771 base = in->cur - in->base;
3772 if (base < 0) return(-1);
3773 if (ctxt->checkIndex > base)
3774 base = ctxt->checkIndex;
3775 if (in->buf == NULL) {
3776 buf = in->base;
3777 len = in->length;
3778 } else {
3779 buf = in->buf->buffer->content;
3780 len = in->buf->buffer->use;
3781 }
3782 /* take into account the sequence length */
3783 if (third) len -= 2;
3784 else if (next) len --;
3785 for (;base < len;base++) {
3786 if (buf[base] == first) {
3787 if (third != 0) {
3788 if ((buf[base + 1] != next) ||
3789 (buf[base + 2] != third)) continue;
3790 } else if (next != 0) {
3791 if (buf[base + 1] != next) continue;
3792 }
3793 ctxt->checkIndex = 0;
3794#ifdef DEBUG_PUSH
3795 if (next == 0)
3796 xmlGenericError(xmlGenericErrorContext,
3797 "HPP: lookup '%c' found at %d\n",
3798 first, base);
3799 else if (third == 0)
3800 xmlGenericError(xmlGenericErrorContext,
3801 "HPP: lookup '%c%c' found at %d\n",
3802 first, next, base);
3803 else
3804 xmlGenericError(xmlGenericErrorContext,
3805 "HPP: lookup '%c%c%c' found at %d\n",
3806 first, next, third, base);
3807#endif
3808 return(base - (in->cur - in->base));
3809 }
3810 }
3811 ctxt->checkIndex = base;
3812#ifdef DEBUG_PUSH
3813 if (next == 0)
3814 xmlGenericError(xmlGenericErrorContext,
3815 "HPP: lookup '%c' failed\n", first);
3816 else if (third == 0)
3817 xmlGenericError(xmlGenericErrorContext,
3818 "HPP: lookup '%c%c' failed\n", first, next);
3819 else
3820 xmlGenericError(xmlGenericErrorContext,
3821 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3822#endif
3823 return(-1);
3824}
3825
3826/**
3827 * htmlParseTryOrFinish:
3828 * @ctxt: an HTML parser context
3829 * @terminate: last chunk indicator
3830 *
3831 * Try to progress on parsing
3832 *
3833 * Returns zero if no parsing was possible
3834 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003835static int
Owen Taylor3473f882001-02-23 17:55:21 +00003836htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3837 int ret = 0;
3838 htmlParserInputPtr in;
3839 int avail = 0;
3840 xmlChar cur, next;
3841
3842#ifdef DEBUG_PUSH
3843 switch (ctxt->instate) {
3844 case XML_PARSER_EOF:
3845 xmlGenericError(xmlGenericErrorContext,
3846 "HPP: try EOF\n"); break;
3847 case XML_PARSER_START:
3848 xmlGenericError(xmlGenericErrorContext,
3849 "HPP: try START\n"); break;
3850 case XML_PARSER_MISC:
3851 xmlGenericError(xmlGenericErrorContext,
3852 "HPP: try MISC\n");break;
3853 case XML_PARSER_COMMENT:
3854 xmlGenericError(xmlGenericErrorContext,
3855 "HPP: try COMMENT\n");break;
3856 case XML_PARSER_PROLOG:
3857 xmlGenericError(xmlGenericErrorContext,
3858 "HPP: try PROLOG\n");break;
3859 case XML_PARSER_START_TAG:
3860 xmlGenericError(xmlGenericErrorContext,
3861 "HPP: try START_TAG\n");break;
3862 case XML_PARSER_CONTENT:
3863 xmlGenericError(xmlGenericErrorContext,
3864 "HPP: try CONTENT\n");break;
3865 case XML_PARSER_CDATA_SECTION:
3866 xmlGenericError(xmlGenericErrorContext,
3867 "HPP: try CDATA_SECTION\n");break;
3868 case XML_PARSER_END_TAG:
3869 xmlGenericError(xmlGenericErrorContext,
3870 "HPP: try END_TAG\n");break;
3871 case XML_PARSER_ENTITY_DECL:
3872 xmlGenericError(xmlGenericErrorContext,
3873 "HPP: try ENTITY_DECL\n");break;
3874 case XML_PARSER_ENTITY_VALUE:
3875 xmlGenericError(xmlGenericErrorContext,
3876 "HPP: try ENTITY_VALUE\n");break;
3877 case XML_PARSER_ATTRIBUTE_VALUE:
3878 xmlGenericError(xmlGenericErrorContext,
3879 "HPP: try ATTRIBUTE_VALUE\n");break;
3880 case XML_PARSER_DTD:
3881 xmlGenericError(xmlGenericErrorContext,
3882 "HPP: try DTD\n");break;
3883 case XML_PARSER_EPILOG:
3884 xmlGenericError(xmlGenericErrorContext,
3885 "HPP: try EPILOG\n");break;
3886 case XML_PARSER_PI:
3887 xmlGenericError(xmlGenericErrorContext,
3888 "HPP: try PI\n");break;
3889 case XML_PARSER_SYSTEM_LITERAL:
3890 xmlGenericError(xmlGenericErrorContext,
3891 "HPP: try SYSTEM_LITERAL\n");break;
3892 }
3893#endif
3894
3895 while (1) {
3896
3897 in = ctxt->input;
3898 if (in == NULL) break;
3899 if (in->buf == NULL)
3900 avail = in->length - (in->cur - in->base);
3901 else
3902 avail = in->buf->buffer->use - (in->cur - in->base);
3903 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003904 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003905 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3906 /*
3907 * SAX: end of the document processing.
3908 */
3909 ctxt->instate = XML_PARSER_EOF;
3910 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3911 ctxt->sax->endDocument(ctxt->userData);
3912 }
3913 }
3914 if (avail < 1)
3915 goto done;
3916 switch (ctxt->instate) {
3917 case XML_PARSER_EOF:
3918 /*
3919 * Document parsing is done !
3920 */
3921 goto done;
3922 case XML_PARSER_START:
3923 /*
3924 * Very first chars read from the document flow.
3925 */
3926 cur = in->cur[0];
3927 if (IS_BLANK(cur)) {
3928 SKIP_BLANKS;
3929 if (in->buf == NULL)
3930 avail = in->length - (in->cur - in->base);
3931 else
3932 avail = in->buf->buffer->use - (in->cur - in->base);
3933 }
3934 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3935 ctxt->sax->setDocumentLocator(ctxt->userData,
3936 &xmlDefaultSAXLocator);
3937 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3938 (!ctxt->disableSAX))
3939 ctxt->sax->startDocument(ctxt->userData);
3940
3941 cur = in->cur[0];
3942 next = in->cur[1];
3943 if ((cur == '<') && (next == '!') &&
3944 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3945 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3946 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3947 (UPP(8) == 'E')) {
3948 if ((!terminate) &&
3949 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
3950 goto done;
3951#ifdef DEBUG_PUSH
3952 xmlGenericError(xmlGenericErrorContext,
3953 "HPP: Parsing internal subset\n");
3954#endif
3955 htmlParseDocTypeDecl(ctxt);
3956 ctxt->instate = XML_PARSER_PROLOG;
3957#ifdef DEBUG_PUSH
3958 xmlGenericError(xmlGenericErrorContext,
3959 "HPP: entering PROLOG\n");
3960#endif
3961 } else {
3962 ctxt->instate = XML_PARSER_MISC;
3963 }
3964#ifdef DEBUG_PUSH
3965 xmlGenericError(xmlGenericErrorContext,
3966 "HPP: entering MISC\n");
3967#endif
3968 break;
3969 case XML_PARSER_MISC:
3970 SKIP_BLANKS;
3971 if (in->buf == NULL)
3972 avail = in->length - (in->cur - in->base);
3973 else
3974 avail = in->buf->buffer->use - (in->cur - in->base);
3975 if (avail < 2)
3976 goto done;
3977 cur = in->cur[0];
3978 next = in->cur[1];
3979 if ((cur == '<') && (next == '!') &&
3980 (in->cur[2] == '-') && (in->cur[3] == '-')) {
3981 if ((!terminate) &&
3982 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
3983 goto done;
3984#ifdef DEBUG_PUSH
3985 xmlGenericError(xmlGenericErrorContext,
3986 "HPP: Parsing Comment\n");
3987#endif
3988 htmlParseComment(ctxt);
3989 ctxt->instate = XML_PARSER_MISC;
3990 } else if ((cur == '<') && (next == '!') &&
3991 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3992 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3993 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3994 (UPP(8) == 'E')) {
3995 if ((!terminate) &&
3996 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
3997 goto done;
3998#ifdef DEBUG_PUSH
3999 xmlGenericError(xmlGenericErrorContext,
4000 "HPP: Parsing internal subset\n");
4001#endif
4002 htmlParseDocTypeDecl(ctxt);
4003 ctxt->instate = XML_PARSER_PROLOG;
4004#ifdef DEBUG_PUSH
4005 xmlGenericError(xmlGenericErrorContext,
4006 "HPP: entering PROLOG\n");
4007#endif
4008 } else if ((cur == '<') && (next == '!') &&
4009 (avail < 9)) {
4010 goto done;
4011 } else {
4012 ctxt->instate = XML_PARSER_START_TAG;
4013#ifdef DEBUG_PUSH
4014 xmlGenericError(xmlGenericErrorContext,
4015 "HPP: entering START_TAG\n");
4016#endif
4017 }
4018 break;
4019 case XML_PARSER_PROLOG:
4020 SKIP_BLANKS;
4021 if (in->buf == NULL)
4022 avail = in->length - (in->cur - in->base);
4023 else
4024 avail = in->buf->buffer->use - (in->cur - in->base);
4025 if (avail < 2)
4026 goto done;
4027 cur = in->cur[0];
4028 next = in->cur[1];
4029 if ((cur == '<') && (next == '!') &&
4030 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4031 if ((!terminate) &&
4032 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4033 goto done;
4034#ifdef DEBUG_PUSH
4035 xmlGenericError(xmlGenericErrorContext,
4036 "HPP: Parsing Comment\n");
4037#endif
4038 htmlParseComment(ctxt);
4039 ctxt->instate = XML_PARSER_PROLOG;
4040 } else if ((cur == '<') && (next == '!') &&
4041 (avail < 4)) {
4042 goto done;
4043 } else {
4044 ctxt->instate = XML_PARSER_START_TAG;
4045#ifdef DEBUG_PUSH
4046 xmlGenericError(xmlGenericErrorContext,
4047 "HPP: entering START_TAG\n");
4048#endif
4049 }
4050 break;
4051 case XML_PARSER_EPILOG:
4052 if (in->buf == NULL)
4053 avail = in->length - (in->cur - in->base);
4054 else
4055 avail = in->buf->buffer->use - (in->cur - in->base);
4056 if (avail < 1)
4057 goto done;
4058 cur = in->cur[0];
4059 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004060 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004061 goto done;
4062 }
4063 if (avail < 2)
4064 goto done;
4065 next = in->cur[1];
4066 if ((cur == '<') && (next == '!') &&
4067 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4068 if ((!terminate) &&
4069 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4070 goto done;
4071#ifdef DEBUG_PUSH
4072 xmlGenericError(xmlGenericErrorContext,
4073 "HPP: Parsing Comment\n");
4074#endif
4075 htmlParseComment(ctxt);
4076 ctxt->instate = XML_PARSER_EPILOG;
4077 } else if ((cur == '<') && (next == '!') &&
4078 (avail < 4)) {
4079 goto done;
4080 } else {
4081 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004082 ctxt->wellFormed = 0;
4083 ctxt->instate = XML_PARSER_EOF;
4084#ifdef DEBUG_PUSH
4085 xmlGenericError(xmlGenericErrorContext,
4086 "HPP: entering EOF\n");
4087#endif
4088 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4089 ctxt->sax->endDocument(ctxt->userData);
4090 goto done;
4091 }
4092 break;
4093 case XML_PARSER_START_TAG: {
4094 xmlChar *name, *oldname;
4095 int depth = ctxt->nameNr;
4096 htmlElemDescPtr info;
4097
4098 if (avail < 2)
4099 goto done;
4100 cur = in->cur[0];
4101 if (cur != '<') {
4102 ctxt->instate = XML_PARSER_CONTENT;
4103#ifdef DEBUG_PUSH
4104 xmlGenericError(xmlGenericErrorContext,
4105 "HPP: entering CONTENT\n");
4106#endif
4107 break;
4108 }
4109 if ((!terminate) &&
4110 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4111 goto done;
4112
4113 oldname = xmlStrdup(ctxt->name);
4114 htmlParseStartTag(ctxt);
4115 name = ctxt->name;
4116#ifdef DEBUG
4117 if (oldname == NULL)
4118 xmlGenericError(xmlGenericErrorContext,
4119 "Start of element %s\n", name);
4120 else if (name == NULL)
4121 xmlGenericError(xmlGenericErrorContext,
4122 "Start of element failed, was %s\n",
4123 oldname);
4124 else
4125 xmlGenericError(xmlGenericErrorContext,
4126 "Start of element %s, was %s\n",
4127 name, oldname);
4128#endif
4129 if (((depth == ctxt->nameNr) &&
4130 (xmlStrEqual(oldname, ctxt->name))) ||
4131 (name == NULL)) {
4132 if (CUR == '>')
4133 NEXT;
4134 if (oldname != NULL)
4135 xmlFree(oldname);
4136 break;
4137 }
4138 if (oldname != NULL)
4139 xmlFree(oldname);
4140
4141 /*
4142 * Lookup the info for that element.
4143 */
4144 info = htmlTagLookup(name);
4145 if (info == NULL) {
4146 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4147 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4148 name);
4149 ctxt->wellFormed = 0;
4150 } else if (info->depr) {
4151 /***************************
4152 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4153 ctxt->sax->warning(ctxt->userData,
4154 "Tag %s is deprecated\n",
4155 name);
4156 ***************************/
4157 }
4158
4159 /*
4160 * Check for an Empty Element labelled the XML/SGML way
4161 */
4162 if ((CUR == '/') && (NXT(1) == '>')) {
4163 SKIP(2);
4164 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4165 ctxt->sax->endElement(ctxt->userData, name);
4166 oldname = htmlnamePop(ctxt);
4167#ifdef DEBUG
4168 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4169 oldname);
4170#endif
4171 if (oldname != NULL)
4172 xmlFree(oldname);
4173 ctxt->instate = XML_PARSER_CONTENT;
4174#ifdef DEBUG_PUSH
4175 xmlGenericError(xmlGenericErrorContext,
4176 "HPP: entering CONTENT\n");
4177#endif
4178 break;
4179 }
4180
4181 if (CUR == '>') {
4182 NEXT;
4183 } else {
4184 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4185 ctxt->sax->error(ctxt->userData,
4186 "Couldn't find end of Start Tag %s\n",
4187 name);
4188 ctxt->wellFormed = 0;
4189
4190 /*
4191 * end of parsing of this node.
4192 */
4193 if (xmlStrEqual(name, ctxt->name)) {
4194 nodePop(ctxt);
4195 oldname = htmlnamePop(ctxt);
4196#ifdef DEBUG
4197 xmlGenericError(xmlGenericErrorContext,
4198 "End of start tag problem: popping out %s\n", oldname);
4199#endif
4200 if (oldname != NULL)
4201 xmlFree(oldname);
4202 }
4203
4204 ctxt->instate = XML_PARSER_CONTENT;
4205#ifdef DEBUG_PUSH
4206 xmlGenericError(xmlGenericErrorContext,
4207 "HPP: entering CONTENT\n");
4208#endif
4209 break;
4210 }
4211
4212 /*
4213 * Check for an Empty Element from DTD definition
4214 */
4215 if ((info != NULL) && (info->empty)) {
4216 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4217 ctxt->sax->endElement(ctxt->userData, name);
4218 oldname = htmlnamePop(ctxt);
4219#ifdef DEBUG
4220 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4221#endif
4222 if (oldname != NULL)
4223 xmlFree(oldname);
4224 }
4225 ctxt->instate = XML_PARSER_CONTENT;
4226#ifdef DEBUG_PUSH
4227 xmlGenericError(xmlGenericErrorContext,
4228 "HPP: entering CONTENT\n");
4229#endif
4230 break;
4231 }
4232 case XML_PARSER_CONTENT: {
4233 long cons;
4234 /*
4235 * Handle preparsed entities and charRef
4236 */
4237 if (ctxt->token != 0) {
4238 xmlChar chr[2] = { 0 , 0 } ;
4239
4240 chr[0] = (xmlChar) ctxt->token;
4241 htmlCheckParagraph(ctxt);
4242 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4243 ctxt->sax->characters(ctxt->userData, chr, 1);
4244 ctxt->token = 0;
4245 ctxt->checkIndex = 0;
4246 }
4247 if ((avail == 1) && (terminate)) {
4248 cur = in->cur[0];
4249 if ((cur != '<') && (cur != '&')) {
4250 if (ctxt->sax != NULL) {
4251 if (IS_BLANK(cur)) {
4252 if (ctxt->sax->ignorableWhitespace != NULL)
4253 ctxt->sax->ignorableWhitespace(
4254 ctxt->userData, &cur, 1);
4255 } else {
4256 htmlCheckParagraph(ctxt);
4257 if (ctxt->sax->characters != NULL)
4258 ctxt->sax->characters(
4259 ctxt->userData, &cur, 1);
4260 }
4261 }
4262 ctxt->token = 0;
4263 ctxt->checkIndex = 0;
4264 NEXT;
4265 }
4266 break;
4267 }
4268 if (avail < 2)
4269 goto done;
4270 cur = in->cur[0];
4271 next = in->cur[1];
4272 cons = ctxt->nbChars;
4273 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4274 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4275 /*
4276 * Handle SCRIPT/STYLE separately
4277 */
4278 if ((!terminate) &&
4279 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4280 goto done;
4281 htmlParseScript(ctxt);
4282 if ((cur == '<') && (next == '/')) {
4283 ctxt->instate = XML_PARSER_END_TAG;
4284 ctxt->checkIndex = 0;
4285#ifdef DEBUG_PUSH
4286 xmlGenericError(xmlGenericErrorContext,
4287 "HPP: entering END_TAG\n");
4288#endif
4289 break;
4290 }
4291 } else {
4292 /*
4293 * Sometimes DOCTYPE arrives in the middle of the document
4294 */
4295 if ((cur == '<') && (next == '!') &&
4296 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4297 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4298 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4299 (UPP(8) == 'E')) {
4300 if ((!terminate) &&
4301 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4302 goto done;
4303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4304 ctxt->sax->error(ctxt->userData,
4305 "Misplaced DOCTYPE declaration\n");
4306 ctxt->wellFormed = 0;
4307 htmlParseDocTypeDecl(ctxt);
4308 } else if ((cur == '<') && (next == '!') &&
4309 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4310 if ((!terminate) &&
4311 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4312 goto done;
4313#ifdef DEBUG_PUSH
4314 xmlGenericError(xmlGenericErrorContext,
4315 "HPP: Parsing Comment\n");
4316#endif
4317 htmlParseComment(ctxt);
4318 ctxt->instate = XML_PARSER_CONTENT;
4319 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4320 goto done;
4321 } else if ((cur == '<') && (next == '/')) {
4322 ctxt->instate = XML_PARSER_END_TAG;
4323 ctxt->checkIndex = 0;
4324#ifdef DEBUG_PUSH
4325 xmlGenericError(xmlGenericErrorContext,
4326 "HPP: entering END_TAG\n");
4327#endif
4328 break;
4329 } else if (cur == '<') {
4330 ctxt->instate = XML_PARSER_START_TAG;
4331 ctxt->checkIndex = 0;
4332#ifdef DEBUG_PUSH
4333 xmlGenericError(xmlGenericErrorContext,
4334 "HPP: entering START_TAG\n");
4335#endif
4336 break;
4337 } else if (cur == '&') {
4338 if ((!terminate) &&
4339 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4340 goto done;
4341#ifdef DEBUG_PUSH
4342 xmlGenericError(xmlGenericErrorContext,
4343 "HPP: Parsing Reference\n");
4344#endif
4345 /* TODO: check generation of subtrees if noent !!! */
4346 htmlParseReference(ctxt);
4347 } else {
4348 /* TODO Avoid the extra copy, handle directly !!!!!! */
4349 /*
4350 * Goal of the following test is :
4351 * - minimize calls to the SAX 'character' callback
4352 * when they are mergeable
4353 */
4354 if ((ctxt->inputNr == 1) &&
4355 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4356 if ((!terminate) &&
4357 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4358 goto done;
4359 }
4360 ctxt->checkIndex = 0;
4361#ifdef DEBUG_PUSH
4362 xmlGenericError(xmlGenericErrorContext,
4363 "HPP: Parsing char data\n");
4364#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004365 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004366 }
4367 }
4368 if (cons == ctxt->nbChars) {
4369 if (ctxt->node != NULL) {
4370 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4371 ctxt->sax->error(ctxt->userData,
4372 "detected an error in element content\n");
4373 ctxt->wellFormed = 0;
4374 }
4375 NEXT;
4376 break;
4377 }
4378
4379 break;
4380 }
4381 case XML_PARSER_END_TAG:
4382 if (avail < 2)
4383 goto done;
4384 if ((!terminate) &&
4385 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4386 goto done;
4387 htmlParseEndTag(ctxt);
4388 if (ctxt->nameNr == 0) {
4389 ctxt->instate = XML_PARSER_EPILOG;
4390 } else {
4391 ctxt->instate = XML_PARSER_CONTENT;
4392 }
4393 ctxt->checkIndex = 0;
4394#ifdef DEBUG_PUSH
4395 xmlGenericError(xmlGenericErrorContext,
4396 "HPP: entering CONTENT\n");
4397#endif
4398 break;
4399 case XML_PARSER_CDATA_SECTION:
4400 xmlGenericError(xmlGenericErrorContext,
4401 "HPP: internal error, state == CDATA\n");
4402 ctxt->instate = XML_PARSER_CONTENT;
4403 ctxt->checkIndex = 0;
4404#ifdef DEBUG_PUSH
4405 xmlGenericError(xmlGenericErrorContext,
4406 "HPP: entering CONTENT\n");
4407#endif
4408 break;
4409 case XML_PARSER_DTD:
4410 xmlGenericError(xmlGenericErrorContext,
4411 "HPP: internal error, state == DTD\n");
4412 ctxt->instate = XML_PARSER_CONTENT;
4413 ctxt->checkIndex = 0;
4414#ifdef DEBUG_PUSH
4415 xmlGenericError(xmlGenericErrorContext,
4416 "HPP: entering CONTENT\n");
4417#endif
4418 break;
4419 case XML_PARSER_COMMENT:
4420 xmlGenericError(xmlGenericErrorContext,
4421 "HPP: internal error, state == COMMENT\n");
4422 ctxt->instate = XML_PARSER_CONTENT;
4423 ctxt->checkIndex = 0;
4424#ifdef DEBUG_PUSH
4425 xmlGenericError(xmlGenericErrorContext,
4426 "HPP: entering CONTENT\n");
4427#endif
4428 break;
4429 case XML_PARSER_PI:
4430 xmlGenericError(xmlGenericErrorContext,
4431 "HPP: internal error, state == PI\n");
4432 ctxt->instate = XML_PARSER_CONTENT;
4433 ctxt->checkIndex = 0;
4434#ifdef DEBUG_PUSH
4435 xmlGenericError(xmlGenericErrorContext,
4436 "HPP: entering CONTENT\n");
4437#endif
4438 break;
4439 case XML_PARSER_ENTITY_DECL:
4440 xmlGenericError(xmlGenericErrorContext,
4441 "HPP: internal error, state == ENTITY_DECL\n");
4442 ctxt->instate = XML_PARSER_CONTENT;
4443 ctxt->checkIndex = 0;
4444#ifdef DEBUG_PUSH
4445 xmlGenericError(xmlGenericErrorContext,
4446 "HPP: entering CONTENT\n");
4447#endif
4448 break;
4449 case XML_PARSER_ENTITY_VALUE:
4450 xmlGenericError(xmlGenericErrorContext,
4451 "HPP: internal error, state == ENTITY_VALUE\n");
4452 ctxt->instate = XML_PARSER_CONTENT;
4453 ctxt->checkIndex = 0;
4454#ifdef DEBUG_PUSH
4455 xmlGenericError(xmlGenericErrorContext,
4456 "HPP: entering DTD\n");
4457#endif
4458 break;
4459 case XML_PARSER_ATTRIBUTE_VALUE:
4460 xmlGenericError(xmlGenericErrorContext,
4461 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4462 ctxt->instate = XML_PARSER_START_TAG;
4463 ctxt->checkIndex = 0;
4464#ifdef DEBUG_PUSH
4465 xmlGenericError(xmlGenericErrorContext,
4466 "HPP: entering START_TAG\n");
4467#endif
4468 break;
4469 case XML_PARSER_SYSTEM_LITERAL:
4470 xmlGenericError(xmlGenericErrorContext,
4471 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4472 ctxt->instate = XML_PARSER_CONTENT;
4473 ctxt->checkIndex = 0;
4474#ifdef DEBUG_PUSH
4475 xmlGenericError(xmlGenericErrorContext,
4476 "HPP: entering CONTENT\n");
4477#endif
4478 break;
4479 case XML_PARSER_IGNORE:
4480 xmlGenericError(xmlGenericErrorContext,
4481 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4482 ctxt->instate = XML_PARSER_CONTENT;
4483 ctxt->checkIndex = 0;
4484#ifdef DEBUG_PUSH
4485 xmlGenericError(xmlGenericErrorContext,
4486 "HPP: entering CONTENT\n");
4487#endif
4488 break;
4489 }
4490 }
4491done:
4492 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004493 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004494 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4495 /*
4496 * SAX: end of the document processing.
4497 */
4498 ctxt->instate = XML_PARSER_EOF;
4499 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4500 ctxt->sax->endDocument(ctxt->userData);
4501 }
4502 }
4503 if ((ctxt->myDoc != NULL) &&
4504 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4505 (ctxt->instate == XML_PARSER_EPILOG))) {
4506 xmlDtdPtr dtd;
4507 dtd = xmlGetIntSubset(ctxt->myDoc);
4508 if (dtd == NULL)
4509 ctxt->myDoc->intSubset =
4510 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4511 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4512 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4513 }
4514#ifdef DEBUG_PUSH
4515 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4516#endif
4517 return(ret);
4518}
4519
4520/**
Owen Taylor3473f882001-02-23 17:55:21 +00004521 * htmlParseChunk:
4522 * @ctxt: an XML parser context
4523 * @chunk: an char array
4524 * @size: the size in byte of the chunk
4525 * @terminate: last chunk indicator
4526 *
4527 * Parse a Chunk of memory
4528 *
4529 * Returns zero if no error, the xmlParserErrors otherwise.
4530 */
4531int
4532htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4533 int terminate) {
4534 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4535 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4536 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4537 int cur = ctxt->input->cur - ctxt->input->base;
4538
4539 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4540 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4541 ctxt->input->cur = ctxt->input->base + cur;
4542#ifdef DEBUG_PUSH
4543 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4544#endif
4545
4546 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4547 htmlParseTryOrFinish(ctxt, terminate);
4548 } else if (ctxt->instate != XML_PARSER_EOF) {
4549 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4550 htmlParseTryOrFinish(ctxt, terminate);
4551 }
4552 if (terminate) {
4553 if ((ctxt->instate != XML_PARSER_EOF) &&
4554 (ctxt->instate != XML_PARSER_EPILOG) &&
4555 (ctxt->instate != XML_PARSER_MISC)) {
4556 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004557 ctxt->wellFormed = 0;
4558 }
4559 if (ctxt->instate != XML_PARSER_EOF) {
4560 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4561 ctxt->sax->endDocument(ctxt->userData);
4562 }
4563 ctxt->instate = XML_PARSER_EOF;
4564 }
4565 return((xmlParserErrors) ctxt->errNo);
4566}
4567
4568/************************************************************************
4569 * *
4570 * User entry points *
4571 * *
4572 ************************************************************************/
4573
4574/**
4575 * htmlCreatePushParserCtxt :
4576 * @sax: a SAX handler
4577 * @user_data: The user data returned on SAX callbacks
4578 * @chunk: a pointer to an array of chars
4579 * @size: number of chars in the array
4580 * @filename: an optional file name or URI
4581 * @enc: an optional encoding
4582 *
4583 * Create a parser context for using the HTML parser in push mode
4584 * To allow content encoding detection, @size should be >= 4
4585 * The value of @filename is used for fetching external entities
4586 * and error/warning reports.
4587 *
4588 * Returns the new parser context or NULL
4589 */
4590htmlParserCtxtPtr
4591htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4592 const char *chunk, int size, const char *filename,
4593 xmlCharEncoding enc) {
4594 htmlParserCtxtPtr ctxt;
4595 htmlParserInputPtr inputStream;
4596 xmlParserInputBufferPtr buf;
4597
4598 buf = xmlAllocParserInputBuffer(enc);
4599 if (buf == NULL) return(NULL);
4600
4601 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4602 if (ctxt == NULL) {
4603 xmlFree(buf);
4604 return(NULL);
4605 }
4606 memset(ctxt, 0, sizeof(htmlParserCtxt));
4607 htmlInitParserCtxt(ctxt);
4608 if (sax != NULL) {
4609 if (ctxt->sax != &htmlDefaultSAXHandler)
4610 xmlFree(ctxt->sax);
4611 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4612 if (ctxt->sax == NULL) {
4613 xmlFree(buf);
4614 xmlFree(ctxt);
4615 return(NULL);
4616 }
4617 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4618 if (user_data != NULL)
4619 ctxt->userData = user_data;
4620 }
4621 if (filename == NULL) {
4622 ctxt->directory = NULL;
4623 } else {
4624 ctxt->directory = xmlParserGetDirectory(filename);
4625 }
4626
4627 inputStream = htmlNewInputStream(ctxt);
4628 if (inputStream == NULL) {
4629 xmlFreeParserCtxt(ctxt);
4630 return(NULL);
4631 }
4632
4633 if (filename == NULL)
4634 inputStream->filename = NULL;
4635 else
4636 inputStream->filename = xmlMemStrdup(filename);
4637 inputStream->buf = buf;
4638 inputStream->base = inputStream->buf->buffer->content;
4639 inputStream->cur = inputStream->buf->buffer->content;
4640
4641 inputPush(ctxt, inputStream);
4642
4643 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4644 (ctxt->input->buf != NULL)) {
4645 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4646#ifdef DEBUG_PUSH
4647 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4648#endif
4649 }
4650
4651 return(ctxt);
4652}
4653
4654/**
4655 * htmlSAXParseDoc :
4656 * @cur: a pointer to an array of xmlChar
4657 * @encoding: a free form C string describing the HTML document encoding, or NULL
4658 * @sax: the SAX handler block
4659 * @userData: if using SAX, this pointer will be provided on callbacks.
4660 *
4661 * parse an HTML in-memory document and build a tree.
4662 * It use the given SAX function block to handle the parsing callback.
4663 * If sax is NULL, fallback to the default DOM tree building routines.
4664 *
4665 * Returns the resulting document tree
4666 */
4667
4668htmlDocPtr
4669htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4670 htmlDocPtr ret;
4671 htmlParserCtxtPtr ctxt;
4672
4673 if (cur == NULL) return(NULL);
4674
4675
4676 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4677 if (ctxt == NULL) return(NULL);
4678 if (sax != NULL) {
4679 ctxt->sax = sax;
4680 ctxt->userData = userData;
4681 }
4682
4683 htmlParseDocument(ctxt);
4684 ret = ctxt->myDoc;
4685 if (sax != NULL) {
4686 ctxt->sax = NULL;
4687 ctxt->userData = NULL;
4688 }
4689 htmlFreeParserCtxt(ctxt);
4690
4691 return(ret);
4692}
4693
4694/**
4695 * htmlParseDoc :
4696 * @cur: a pointer to an array of xmlChar
4697 * @encoding: a free form C string describing the HTML document encoding, or NULL
4698 *
4699 * parse an HTML in-memory document and build a tree.
4700 *
4701 * Returns the resulting document tree
4702 */
4703
4704htmlDocPtr
4705htmlParseDoc(xmlChar *cur, const char *encoding) {
4706 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4707}
4708
4709
4710/**
4711 * htmlCreateFileParserCtxt :
4712 * @filename: the filename
4713 * @encoding: a free form C string describing the HTML document encoding, or NULL
4714 *
4715 * Create a parser context for a file content.
4716 * Automatic support for ZLIB/Compress compressed document is provided
4717 * by default if found at compile-time.
4718 *
4719 * Returns the new parser context or NULL
4720 */
4721htmlParserCtxtPtr
4722htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4723{
4724 htmlParserCtxtPtr ctxt;
4725 htmlParserInputPtr inputStream;
4726 xmlParserInputBufferPtr buf;
4727 /* htmlCharEncoding enc; */
4728 xmlChar *content, *content_line = (xmlChar *) "charset=";
4729
4730 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4731 if (buf == NULL) return(NULL);
4732
4733 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4734 if (ctxt == NULL) {
4735 perror("malloc");
4736 return(NULL);
4737 }
4738 memset(ctxt, 0, sizeof(htmlParserCtxt));
4739 htmlInitParserCtxt(ctxt);
4740 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4741 if (inputStream == NULL) {
4742 perror("malloc");
4743 xmlFree(ctxt);
4744 return(NULL);
4745 }
4746 memset(inputStream, 0, sizeof(htmlParserInput));
4747
4748 inputStream->filename = xmlMemStrdup(filename);
4749 inputStream->line = 1;
4750 inputStream->col = 1;
4751 inputStream->buf = buf;
4752 inputStream->directory = NULL;
4753
4754 inputStream->base = inputStream->buf->buffer->content;
4755 inputStream->cur = inputStream->buf->buffer->content;
4756 inputStream->free = NULL;
4757
4758 inputPush(ctxt, inputStream);
4759
4760 /* set encoding */
4761 if (encoding) {
4762 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4763 if (content) {
4764 strcpy ((char *)content, (char *)content_line);
4765 strcat ((char *)content, (char *)encoding);
4766 htmlCheckEncoding (ctxt, content);
4767 xmlFree (content);
4768 }
4769 }
4770
4771 return(ctxt);
4772}
4773
4774/**
4775 * htmlSAXParseFile :
4776 * @filename: the filename
4777 * @encoding: a free form C string describing the HTML document encoding, or NULL
4778 * @sax: the SAX handler block
4779 * @userData: if using SAX, this pointer will be provided on callbacks.
4780 *
4781 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4782 * compressed document is provided by default if found at compile-time.
4783 * It use the given SAX function block to handle the parsing callback.
4784 * If sax is NULL, fallback to the default DOM tree building routines.
4785 *
4786 * Returns the resulting document tree
4787 */
4788
4789htmlDocPtr
4790htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4791 void *userData) {
4792 htmlDocPtr ret;
4793 htmlParserCtxtPtr ctxt;
4794 htmlSAXHandlerPtr oldsax = NULL;
4795
4796 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4797 if (ctxt == NULL) return(NULL);
4798 if (sax != NULL) {
4799 oldsax = ctxt->sax;
4800 ctxt->sax = sax;
4801 ctxt->userData = userData;
4802 }
4803
4804 htmlParseDocument(ctxt);
4805
4806 ret = ctxt->myDoc;
4807 if (sax != NULL) {
4808 ctxt->sax = oldsax;
4809 ctxt->userData = NULL;
4810 }
4811 htmlFreeParserCtxt(ctxt);
4812
4813 return(ret);
4814}
4815
4816/**
4817 * htmlParseFile :
4818 * @filename: the filename
4819 * @encoding: a free form C string describing the HTML document encoding, or NULL
4820 *
4821 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4822 * compressed document is provided by default if found at compile-time.
4823 *
4824 * Returns the resulting document tree
4825 */
4826
4827htmlDocPtr
4828htmlParseFile(const char *filename, const char *encoding) {
4829 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4830}
4831
4832/**
4833 * htmlHandleOmittedElem:
4834 * @val: int 0 or 1
4835 *
4836 * Set and return the previous value for handling HTML omitted tags.
4837 *
4838 * Returns the last value for 0 for no handling, 1 for auto insertion.
4839 */
4840
4841int
4842htmlHandleOmittedElem(int val) {
4843 int old = htmlOmittedDefaultValue;
4844
4845 htmlOmittedDefaultValue = val;
4846 return(old);
4847}
4848
4849#endif /* LIBXML_HTML_ENABLED */