blob: 870201dd4a25061d79217c0c6bcc2be1fd52e7bd [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
43
44#define HTML_MAX_NAMELEN 1000
45#define HTML_PARSER_BIG_BUFFER_SIZE 1000
46#define HTML_PARSER_BUFFER_SIZE 100
47
48/* #define DEBUG */
49/* #define DEBUG_PUSH */
50
51int htmlOmittedDefaultValue = 1;
52
Daniel Veillard56a4cb82001-03-24 17:00:36 +000053xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
54 xmlChar end, xmlChar end2, xmlChar end3);
55
56/************************************************************************
57 * *
Owen Taylor3473f882001-02-23 17:55:21 +000058 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
66#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
68 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
70 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
71 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
73 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
75 return(0); \
76 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
82scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
83 type ret; \
84 if (ctxt->name##Nr < 0) return(0); \
85 ctxt->name##Nr--; \
86 if (ctxt->name##Nr < 0) return(0); \
87 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillard56a4cb82001-03-24 17:00:36 +000096/* PUSH_AND_POP(static, xmlNodePtr, node) */
97PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000098
99/*
100 * Macros for accessing the content. Those should be used only by the parser,
101 * and not exported.
102 *
103 * Dirty macros, i.e. one need to make assumption on the context to use them
104 *
105 * CUR_PTR return the current pointer to the xmlChar to be parsed.
106 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
107 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
108 * in UNICODE mode. This should be used internally by the parser
109 * only to compare to ASCII values otherwise it would break when
110 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
113 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
114 * it should be used only to compare on ASCII based substring.
115 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
116 * strings within the parser.
117 *
118 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
119 *
120 * CURRENT Returns the current char value, with the full decoding of
121 * UTF-8 if we are using this mode. It returns an int.
122 * NEXT Skip to the next character, this does the proper decoding
123 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
124 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
125 */
126
/*
 * Low-level accessors on the current input of the parser context. All of
 * them expect a variable named "ctxt" to be in scope at the point of use.
 * Expansions that are expressions are fully parenthesized so they can be
 * combined with other operators safely.
 */

/* current byte converted to upper case */
#define UPPER (toupper(*ctxt->input->cur))

/* advance the input by val bytes and account for them in nbChars */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* byte located val positions after the current one */
#define NXT(val) (ctxt->input->cur[(val)])

/* same as NXT(val), converted to upper case */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* pointer to the current position in the input (usable as an lvalue) */
#define CUR_PTR (ctxt->input->cur)

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* current byte, as an int */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* -1 while a token is pending in ctxt->token, else the current byte */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Advance by l bytes (the byte length of the character just read),
 * maintaining the line/column position, clearing any pending token and
 * counting one character in nbChars.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)
162
163/************
164 \
165 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
166 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
167 ************/
168
/* read the current character of the parser input; its byte length goes in l */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* read the current character of the string s; its byte length goes in l */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append the character v, whose byte length is l, to buffer b at index i
 * and advance i. This deliberately remains an unbraced if/else statement,
 * as in the original: use it as a standalone statement followed by ';'.
 */
#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v))
175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
187 * Returns the current char value and its lenght
188 */
189
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000190static int
Owen Taylor3473f882001-02-23 17:55:21 +0000191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
269 * Assume it's a fixed lenght encoding (1) with
270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
306/**
Owen Taylor3473f882001-02-23 17:55:21 +0000307 * htmlSkipBlankChars:
308 * @ctxt: the HTML parser context
309 *
310 * skip all blanks character found at that point in the input streams.
311 *
312 * Returns the number of space chars skipped
313 */
314
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000315static int
Owen Taylor3473f882001-02-23 17:55:21 +0000316htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
317 int res = 0;
318
319 while (IS_BLANK(*(ctxt->input->cur))) {
320 if ((*ctxt->input->cur == 0) &&
321 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
322 xmlPopInput(ctxt);
323 } else {
324 if (*(ctxt->input->cur) == '\n') {
325 ctxt->input->line++; ctxt->input->col = 1;
326 } else ctxt->input->col++;
327 ctxt->input->cur++;
328 ctxt->nbChars++;
329 if (*ctxt->input->cur == 0)
330 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
331 }
332 res++;
333 }
334 return(res);
335}
336
337
338
339/************************************************************************
340 * *
341 * The list of HTML elements and their properties *
342 * *
343 ************************************************************************/
344
345/*
346 * Start Tag: 1 means the start tag can be ommited
347 * End Tag: 1 means the end tag can be ommited
348 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000349 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000350 * Depr: this element is deprecated
351 * DTD: 1 means that this element is valid only in the Loose DTD
352 * 2 means that this element is valid only in the Frameset DTD
353 *
354 * Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
355 */
356htmlElemDesc html40ElementTable[] = {
357{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
358{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
359{ "acronym", 0, 0, 0, 0, 0, 0, "" },
360{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
361{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
362{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000363{ "b", 0, 3, 0, 0, 0, 0, "bold text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000364{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
365{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
366{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000367{ "big", 0, 3, 0, 0, 0, 0, "large text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000368{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
369{ "body", 1, 1, 0, 0, 0, 0, "document body " },
370{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
371{ "button", 0, 0, 0, 0, 0, 0, "push button " },
372{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000373{ "center", 0, 3, 0, 0, 1, 1, "shorthand for div align=center " },
Owen Taylor3473f882001-02-23 17:55:21 +0000374{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
375{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
376{ "col", 0, 2, 2, 1, 0, 0, "table column " },
377{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
378{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
379{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
380{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
381{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
382{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
383{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
384{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000385{ "em", 0, 3, 0, 0, 0, 0, "emphasis" },
Owen Taylor3473f882001-02-23 17:55:21 +0000386{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000387{ "font", 0, 3, 0, 0, 1, 1, "local change to font " },
Owen Taylor3473f882001-02-23 17:55:21 +0000388{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
389{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
390{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
391{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
392{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
393{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
394{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
395{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
396{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
397{ "head", 1, 1, 0, 0, 0, 0, "document head " },
398{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
399{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000400{ "i", 0, 3, 0, 0, 0, 0, "italic text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000401{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
402{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
403{ "input", 0, 2, 2, 1, 0, 0, "form control " },
404{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
405{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
406{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
407{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
408{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
409{ "li", 0, 1, 1, 0, 0, 0, "list item " },
410{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
411{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
412{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
413{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
414{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
415{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
416{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
417{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
418{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
419{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
420{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
421{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
422{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
423{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000424{ "s", 0, 3, 0, 0, 1, 1, "strike-through text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000425{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
426{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
427{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000428{ "small", 0, 3, 0, 0, 0, 0, "small text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000429{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000430{ "strike", 0, 3, 0, 0, 1, 1, "strike-through text" },
431{ "strong", 0, 3, 0, 0, 0, 0, "strong emphasis" },
Owen Taylor3473f882001-02-23 17:55:21 +0000432{ "style", 0, 0, 0, 0, 0, 0, "style info " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000433{ "sub", 0, 3, 0, 0, 0, 0, "subscript" },
434{ "sup", 0, 3, 0, 0, 0, 0, "superscript " },
Owen Taylor3473f882001-02-23 17:55:21 +0000435{ "table", 0, 0, 0, 0, 0, 0, "&#160;" },
436{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
437{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
438{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
439{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
440{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
441{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
442{ "title", 0, 0, 0, 0, 0, 0, "document title " },
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000443{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000444{ "tt", 0, 3, 0, 0, 0, 0, "teletype or monospaced text style" },
445{ "u", 0, 3, 0, 0, 1, 1, "underlined text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000446{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
447{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
448};
449
/*
 * Groups of equivalent start tags: within one NULL-terminated group, any
 * start tag implies the end of the current element when that element's
 * type belongs to the same group. An extra NULL terminates the list.
 */
const char *htmlEquEnd[] = {
    "dt", "dd", "li", "option", NULL,
    "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
    NULL
};
461/*
 * According to the HTML DTD, HR should be added to the second group above
 * (the headings), as it is not allowed within an H1, H2, H3, etc. But we
 * should tolerate that case because many documents contain rules in headings...
465 */
466
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups, closed by one extra
 * NULL. In each group the first entry is the start tag being opened and
 * the remaining entries are the elements that this start tag closes.
 */
const char *htmlStartClose[] = {
    /* start tag     elements it closes */
    "form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                    "dl", "ul", "ol", "menu", "dir", "address", "pre",
                    "listing", "xmp", "head", NULL,
    "head",         "p", NULL,
    "title",        "p", NULL,
    "body",         "head", "style", "link", "title", "p", NULL,
    "li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl",
                    "address", "pre", "listing", "xmp", "head", "li", NULL,
    "hr",           "p", "head", NULL,
    "h1",           "p", "head", NULL,
    "h2",           "p", "head", NULL,
    "h3",           "p", "head", NULL,
    "h4",           "p", "head", NULL,
    "h5",           "p", "head", NULL,
    "h6",           "p", "head", NULL,
    "dir",          "p", "head", NULL,
    "address",      "p", "head", "ul", NULL,
    "pre",          "p", "head", "ul", NULL,
    "listing",      "p", "head", NULL,
    "xmp",          "p", "head", NULL,
    "blockquote",   "p", "head", NULL,
    "dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                    "xmp", "head", NULL,
    "dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                    "head", "dd", NULL,
    "dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                    "head", "dt", NULL,
    "ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                    "listing", "xmp", NULL,
    "ol",           "p", "head", "ul", NULL,
    "menu",         "p", "head", "ul", NULL,
    "p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",          "p", "head", NULL,
    "noscript",     "p", "head", NULL,
    "center",       "font", "b", "i", "p", "head", NULL,
    "a",            "a", NULL,
    "caption",      "p", NULL,
    "colgroup",     "caption", "colgroup", "col", "p", NULL,
    "col",          "caption", "col", "p", NULL,
    "table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                    "listing", "xmp", "a", NULL,
    "th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",        "caption", "col", "colgroup", NULL,
    "tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                    "tbody", "p", NULL,
    "tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                    "tfoot", "tbody", "p", NULL,
    "optgroup",     "option", NULL,
    "option",       "option", NULL,
    "fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                    "pre", "listing", "xmp", "a", NULL,
    NULL
};
526
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", "body", NULL };
540
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes). The array has no NULL
 * terminator: iterate using its element count.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", which matches no attribute */
    "onchange",
    "onselect"
};
566
/*
 * NULL-terminated list of the end tags that imply the end of the
 * elements still open inside them.
 */
const char *htmlEndClose[] = { "head", "body", "html", NULL };
576
Owen Taylor3473f882001-02-23 17:55:21 +0000577
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000578static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000579static int htmlStartCloseIndexinitialized = 0;
580
581/************************************************************************
582 * *
583 * functions to handle HTML specific data *
584 * *
585 ************************************************************************/
586
587/**
588 * htmlInitAutoClose:
589 *
590 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
591 * This is not reentrant. Call xmlInitParser() once before processing in
592 * case of use in multithreaded programs.
593 */
594void
595htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000596 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000597
598 if (htmlStartCloseIndexinitialized) return;
599
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000600 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
601 indx = 0;
602 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
603 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000604 while (htmlStartClose[i] != NULL) i++;
605 i++;
606 }
607 htmlStartCloseIndexinitialized = 1;
608}
609
610/**
611 * htmlTagLookup:
612 * @tag: The tag name in lowercase
613 *
614 * Lookup the HTML tag in the ElementTable
615 *
616 * Returns the related htmlElemDescPtr or NULL if not found.
617 */
618htmlElemDescPtr
619htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000620 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000621
622 for (i = 0; i < (sizeof(html40ElementTable) /
623 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000624 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Owen Taylor3473f882001-02-23 17:55:21 +0000625 return(&html40ElementTable[i]);
626 }
627 return(NULL);
628}
629
630/**
631 * htmlCheckAutoClose:
632 * @newtag: The new tag name
633 * @oldtag: The old tag name
634 *
635 * Checks wether the new tag is one of the registered valid tags for closing old.
636 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
637 *
638 * Returns 0 if no, 1 if yes.
639 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000640static int
Owen Taylor3473f882001-02-23 17:55:21 +0000641htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000642 int i, indx;
643 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000644
645 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
646
647 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000648 for (indx = 0; indx < 100;indx++) {
649 closed = htmlStartCloseIndex[indx];
650 if (closed == NULL) return(0);
651 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000652 }
653
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000654 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000655 i++;
656 while (htmlStartClose[i] != NULL) {
657 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
658 return(1);
659 }
660 i++;
661 }
662 return(0);
663}
664
665/**
666 * htmlAutoCloseOnClose:
667 * @ctxt: an HTML parser context
668 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000669 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000670 *
671 * The HTmL DtD allows an ending tag to implicitely close other tags.
672 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000673static void
Owen Taylor3473f882001-02-23 17:55:21 +0000674htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
675 htmlElemDescPtr info;
676 xmlChar *oldname;
Daniel Veillarda2bc3682001-05-03 08:27:20 +0000677 int i, endCloses = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000678
679#ifdef DEBUG
680 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
681 for (i = 0;i < ctxt->nameNr;i++)
682 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
683#endif
684
685 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
686 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
687 }
688 if (i < 0) return;
Daniel Veillarda2bc3682001-05-03 08:27:20 +0000689 for (i = 0; (htmlEndClose[i] != NULL);i++)
690 if (xmlStrEqual(newtag, (const xmlChar *) htmlEndClose[i])) {
691 endCloses = 1;
692 break;
693 }
Owen Taylor3473f882001-02-23 17:55:21 +0000694
695 while (!xmlStrEqual(newtag, ctxt->name)) {
696 info = htmlTagLookup(ctxt->name);
697 if ((info == NULL) || (info->endTag == 1)) {
698#ifdef DEBUG
699 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
700#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000701 } else if (info->endTag == 3) {
702#ifdef DEBUG
703 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
704#endif
705 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
706 ctxt->sax->error(ctxt->userData,
707 "Opening and ending tag mismatch: %s and %s\n",
708 newtag, ctxt->name);
709 ctxt->wellFormed = 0;
Daniel Veillarda2bc3682001-05-03 08:27:20 +0000710 } else if (endCloses == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000711 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000712 }
713 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
714 ctxt->sax->endElement(ctxt->userData, ctxt->name);
715 oldname = htmlnamePop(ctxt);
716 if (oldname != NULL) {
717#ifdef DEBUG
718 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
719#endif
720 xmlFree(oldname);
721 }
722 }
723}
724
725/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000726 * htmlAutoCloseOnEnd:
727 * @ctxt: an HTML parser context
728 *
729 * Close all remaining tags at the end of the stream
730 */
731static void
732htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
733 xmlChar *oldname;
734 int i;
735
736 if (ctxt->nameNr == 0)
737 return;
738#ifdef DEBUG
739 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
740#endif
741
742 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
743#ifdef DEBUG
744 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
745#endif
746 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
747 ctxt->sax->endElement(ctxt->userData, ctxt->name);
748 oldname = htmlnamePop(ctxt);
749 if (oldname != NULL) {
750#ifdef DEBUG
751 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
752#endif
753 xmlFree(oldname);
754 }
755 }
756}
757
758/**
Owen Taylor3473f882001-02-23 17:55:21 +0000759 * htmlAutoClose:
760 * @ctxt: an HTML parser context
761 * @newtag: The new tag name or NULL
762 *
763 * The HTmL DtD allows a tag to implicitely close other tags.
764 * The list is kept in htmlStartClose array. This function is
765 * called when a new tag has been detected and generates the
766 * appropriates closes if possible/needed.
767 * If newtag is NULL this mean we are at the end of the resource
768 * and we should check
769 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000770static void
Owen Taylor3473f882001-02-23 17:55:21 +0000771htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
772 xmlChar *oldname;
773 while ((newtag != NULL) && (ctxt->name != NULL) &&
774 (htmlCheckAutoClose(newtag, ctxt->name))) {
775#ifdef DEBUG
776 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
777#endif
778 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
779 ctxt->sax->endElement(ctxt->userData, ctxt->name);
780 oldname = htmlnamePop(ctxt);
781 if (oldname != NULL) {
782#ifdef DEBUG
783 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
784#endif
785 xmlFree(oldname);
786 }
787 }
788 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000789 htmlAutoCloseOnEnd(ctxt);
790 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000791 }
792 while ((newtag == NULL) && (ctxt->name != NULL) &&
793 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
794 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
795 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
796#ifdef DEBUG
797 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
798#endif
799 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
800 ctxt->sax->endElement(ctxt->userData, ctxt->name);
801 oldname = htmlnamePop(ctxt);
802 if (oldname != NULL) {
803#ifdef DEBUG
804 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
805#endif
806 xmlFree(oldname);
807 }
808 }
809
810}
811
812/**
813 * htmlAutoCloseTag:
814 * @doc: the HTML document
815 * @name: The tag name
816 * @elem: the HTML element
817 *
818 * The HTmL DtD allows a tag to implicitely close other tags.
819 * The list is kept in htmlStartClose array. This function checks
820 * if the element or one of it's children would autoclose the
821 * given tag.
822 *
823 * Returns 1 if autoclose, 0 otherwise
824 */
825int
826htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
827 htmlNodePtr child;
828
829 if (elem == NULL) return(1);
830 if (xmlStrEqual(name, elem->name)) return(0);
831 if (htmlCheckAutoClose(elem->name, name)) return(1);
832 child = elem->children;
833 while (child != NULL) {
834 if (htmlAutoCloseTag(doc, name, child)) return(1);
835 child = child->next;
836 }
837 return(0);
838}
839
840/**
841 * htmlIsAutoClosed:
842 * @doc: the HTML document
843 * @elem: the HTML element
844 *
845 * The HTmL DtD allows a tag to implicitely close other tags.
846 * The list is kept in htmlStartClose array. This function checks
847 * if a tag is autoclosed by one of it's child
848 *
849 * Returns 1 if autoclosed, 0 otherwise
850 */
851int
852htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
853 htmlNodePtr child;
854
855 if (elem == NULL) return(1);
856 child = elem->children;
857 while (child != NULL) {
858 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
859 child = child->next;
860 }
861 return(0);
862}
863
864/**
865 * htmlCheckImplied:
866 * @ctxt: an HTML parser context
867 * @newtag: The new tag name
868 *
869 * The HTML DtD allows a tag to exists only implicitely
870 * called when a new tag has been detected and generates the
871 * appropriates implicit tags if missing
872 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000873static void
Owen Taylor3473f882001-02-23 17:55:21 +0000874htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
875 if (!htmlOmittedDefaultValue)
876 return;
877 if (xmlStrEqual(newtag, BAD_CAST"html"))
878 return;
879 if (ctxt->nameNr <= 0) {
880#ifdef DEBUG
881 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
882#endif
883 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
884 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
885 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
886 }
887 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
888 return;
889 if ((ctxt->nameNr <= 1) &&
890 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
891 (xmlStrEqual(newtag, BAD_CAST"style")) ||
892 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
893 (xmlStrEqual(newtag, BAD_CAST"link")) ||
894 (xmlStrEqual(newtag, BAD_CAST"title")) ||
895 (xmlStrEqual(newtag, BAD_CAST"base")))) {
896 /*
897 * dropped OBJECT ... i you put it first BODY will be
898 * assumed !
899 */
900#ifdef DEBUG
901 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
902#endif
903 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
904 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
905 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
906 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
907 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
908 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
909 int i;
910 for (i = 0;i < ctxt->nameNr;i++) {
911 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
912 return;
913 }
914 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
915 return;
916 }
917 }
918
919#ifdef DEBUG
920 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
921#endif
922 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
923 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
924 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
925 }
926}
927
928/**
929 * htmlCheckParagraph
930 * @ctxt: an HTML parser context
931 *
932 * Check whether a p element need to be implied before inserting
933 * characters in the current element.
934 *
935 * Returns 1 if a paragraph has been inserted, 0 if not and -1
936 * in case of error.
937 */
938
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000939static int
Owen Taylor3473f882001-02-23 17:55:21 +0000940htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
941 const xmlChar *tag;
942 int i;
943
944 if (ctxt == NULL)
945 return(-1);
946 tag = ctxt->name;
947 if (tag == NULL) {
948 htmlAutoClose(ctxt, BAD_CAST"p");
949 htmlCheckImplied(ctxt, BAD_CAST"p");
950 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
951 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
952 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
953 return(1);
954 }
955 if (!htmlOmittedDefaultValue)
956 return(0);
957 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
958 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
959#ifdef DEBUG
960 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
961#endif
962 htmlAutoClose(ctxt, BAD_CAST"p");
963 htmlCheckImplied(ctxt, BAD_CAST"p");
964 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
965 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
966 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
967 return(1);
968 }
969 }
970 return(0);
971}
972
973/**
974 * htmlIsScriptAttribute:
975 * @name: an attribute name
976 *
977 * Check if an attribute is of content type Script
978 *
979 * Returns 1 is the attribute is a script 0 otherwise
980 */
981int
982htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000983 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000984
985 if (name == NULL)
986 return(0);
987 /*
988 * all script attributes start with 'on'
989 */
990 if ((name[0] != 'o') || (name[1] != 'n'))
991 return(0);
992 for (i = 0;
993 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
994 i++) {
995 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
996 return(1);
997 }
998 return(0);
999}
1000
1001/************************************************************************
1002 * *
1003 * The list of HTML predefined entities *
1004 * *
1005 ************************************************************************/
1006
1007
/*
 * html40EntitiesTable:
 *
 * Table of HTML entity descriptors. Each initializer lists, in order, the
 * character's numeric value, the entity name (without the surrounding
 * '&' and ';'), and a human-readable description string.
 *
 * Entries must stay in ascending order of value: htmlEntityValueLookup()
 * stops scanning at the first entry whose value is larger than the one
 * searched, and DEBUG builds report out-of-order entries.
 * The table has no terminating sentinel; the lookup functions derive the
 * number of entries from sizeof().
 */
1008htmlEntityDesc html40EntitiesTable[] = {
1009/*
1010 * the 4 absolute ones, plus apostrophe.
1011 */
1012{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1013{ 38, "amp", "ampersand, U+0026 ISOnum" },
1014{ 39, "apos", "single quote" },
1015{ 60, "lt", "less-than sign, U+003C ISOnum" },
1016{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1017
1018/*
1019 * A bunch still in the 128-255 range
1020 * Replacing them depend really on the charset used.
1021 */
1022{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1023{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1024{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1025{ 163, "pound","pound sign, U+00A3 ISOnum" },
1026{ 164, "curren","currency sign, U+00A4 ISOnum" },
1027{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1028{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1029{ 167, "sect", "section sign, U+00A7 ISOnum" },
1030{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1031{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1032{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1033{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1034{ 172, "not", "not sign, U+00AC ISOnum" },
1035{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1036{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1037{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1038{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1039{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1040{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1041{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1042{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1043{ 181, "micro","micro sign, U+00B5 ISOnum" },
1044{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1045{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1046{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1047{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1048{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1049{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1050{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1051{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1052{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1053{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1054{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1055{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1056{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1057{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1058{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1059{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1060{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1061{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1062{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1063{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1064{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1065{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1066{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1067{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1068{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1069{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1070{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1071{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1072{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1073{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1074{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1075{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1076{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1077{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1078{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1079{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1080{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1081{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1082{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1083{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1084{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1085{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1086{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1087{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1088{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1089{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1090{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1091{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1092{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1093{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1094{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1095{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1096{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1097{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1098{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1099{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1100{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1101{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1102{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1103{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1104{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1105{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1106{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1107{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1108{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1109{ 247, "divide","division sign, U+00F7 ISOnum" },
1110{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1111{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1112{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1113{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1114{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1115{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1116{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1117{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1118
1119{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1120{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1121{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1122{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1123{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1124
1125/*
1126 * Anything below should really be kept as entities references
1127 */
1128{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1129
1130{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1131{ 732, "tilde","small tilde, U+02DC ISOdia" },
1132
1133{ 913, "Alpha","greek capital letter alpha, U+0391" },
1134{ 914, "Beta", "greek capital letter beta, U+0392" },
1135{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1136{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1137{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1138{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1139{ 919, "Eta", "greek capital letter eta, U+0397" },
1140{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1141{ 921, "Iota", "greek capital letter iota, U+0399" },
1142{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001143{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001144{ 924, "Mu", "greek capital letter mu, U+039C" },
1145{ 925, "Nu", "greek capital letter nu, U+039D" },
1146{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1147{ 927, "Omicron","greek capital letter omicron, U+039F" },
1148{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1149{ 929, "Rho", "greek capital letter rho, U+03A1" },
1150{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1151{ 932, "Tau", "greek capital letter tau, U+03A4" },
1152{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1153{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1154{ 935, "Chi", "greek capital letter chi, U+03A7" },
1155{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1156{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1157
1158{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1159{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1160{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1161{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1162{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1163{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1164{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1165{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1166{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1167{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1168{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1169{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1170{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1171{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1172{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1173{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1174{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1175{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1176{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1177{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1178{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1179{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1180{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1181{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1182{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1183{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1184{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1185{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1186
1187{ 8194, "ensp", "en space, U+2002 ISOpub" },
1188{ 8195, "emsp", "em space, U+2003 ISOpub" },
1189{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1190{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1191{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1192{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1193{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1194{ 8211, "ndash","en dash, U+2013 ISOpub" },
1195{ 8212, "mdash","em dash, U+2014 ISOpub" },
1196{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1197{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1198{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1199{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1200{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1201{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1202{ 8224, "dagger","dagger, U+2020 ISOpub" },
1203{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1204
1205{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1206{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1207
1208{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1209
1210{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1211{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1212
1213{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1214{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1215
1216{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1217{ 8260, "frasl","fraction slash, U+2044 NEW" },
1218
1219{ 8364, "euro", "euro sign, U+20AC NEW" },
1220
1221{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1222{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1223{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1224{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1225{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1226{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1227{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1228{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1229{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1230{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1231{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1232{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1233{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1234{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1235{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1236{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1237
1238{ 8704, "forall","for all, U+2200 ISOtech" },
1239{ 8706, "part", "partial differential, U+2202 ISOtech" },
1240{ 8707, "exist","there exists, U+2203 ISOtech" },
1241{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1242{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1243{ 8712, "isin", "element of, U+2208 ISOtech" },
1244{ 8713, "notin","not an element of, U+2209 ISOtech" },
1245{ 8715, "ni", "contains as member, U+220B ISOtech" },
1246{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1247{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1248{ 8722, "minus","minus sign, U+2212 ISOtech" },
1249{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1250{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1251{ 8733, "prop", "proportional to, U+221D ISOtech" },
1252{ 8734, "infin","infinity, U+221E ISOtech" },
1253{ 8736, "ang", "angle, U+2220 ISOamso" },
1254{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1255{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1256{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1257{ 8746, "cup", "union = cup, U+222A ISOtech" },
1258{ 8747, "int", "integral, U+222B ISOtech" },
1259{ 8756, "there4","therefore, U+2234 ISOtech" },
1260{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1261{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1262{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1263{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1264{ 8801, "equiv","identical to, U+2261 ISOtech" },
1265{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1266{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1267{ 8834, "sub", "subset of, U+2282 ISOtech" },
1268{ 8835, "sup", "superset of, U+2283 ISOtech" },
1269{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1270{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1271{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1272{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1273{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1274{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1275{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1276{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1277{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1278{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1279{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1280{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1281{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1282{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1283
1284{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1285{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1286{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1287{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1288
1289};
1290
1291/************************************************************************
1292 * *
1293 * Commodity functions to handle entities *
1294 * *
1295 ************************************************************************/
1296
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates @buffer accordingly; a variable
 * named <buffer>_size must therefore be in scope at the point of use.
 *
 * On allocation failure the macro makes the ENCLOSING FUNCTION return
 * NULL, so it can only be used in functions returning a pointer. The old
 * buffer is released first: xmlRealloc() is assigned to a temporary so
 * that a failed reallocation does not overwrite (and leak) the only
 * pointer to the existing block.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *buffer##_grown;                                            \
    buffer##_size *= 2;                                                 \
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,                     \
                                  buffer##_size * sizeof(xmlChar));     \
    if (buffer##_grown == NULL) {                                       \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = buffer##_grown;                                            \
}
1308
1309/**
1310 * htmlEntityLookup:
1311 * @name: the entity name
1312 *
1313 * Lookup the given entity in EntitiesTable
1314 *
1315 * TODO: the linear scan is really ugly, an hash table is really needed.
1316 *
1317 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1318 */
1319htmlEntityDescPtr
1320htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001321 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001322
1323 for (i = 0;i < (sizeof(html40EntitiesTable)/
1324 sizeof(html40EntitiesTable[0]));i++) {
1325 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1326#ifdef DEBUG
1327 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1328#endif
1329 return(&html40EntitiesTable[i]);
1330 }
1331 }
1332 return(NULL);
1333}
1334
1335/**
1336 * htmlEntityValueLookup:
1337 * @value: the entity's unicode value
1338 *
1339 * Lookup the given entity in EntitiesTable
1340 *
1341 * TODO: the linear scan is really ugly, an hash table is really needed.
1342 *
1343 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1344 */
1345htmlEntityDescPtr
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001346htmlEntityValueLookup(unsigned int value) {
1347 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001348#ifdef DEBUG
1349 int lv = 0;
1350#endif
1351
1352 for (i = 0;i < (sizeof(html40EntitiesTable)/
1353 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001354 if (html40EntitiesTable[i].value >= value) {
1355 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001356 break;
1357#ifdef DEBUG
1358 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1359#endif
1360 return(&html40EntitiesTable[i]);
1361 }
1362#ifdef DEBUG
1363 if (lv > html40EntitiesTable[i].value) {
1364 xmlGenericError(xmlGenericErrorContext,
1365 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1366 lv, html40EntitiesTable[i].value);
1367 }
1368 lv = html40EntitiesTable[i].value;
1369#endif
1370 }
1371 return(NULL);
1372}
1373
1374/**
1375 * UTF8ToHtml:
1376 * @out: a pointer to an array of bytes to store the result
1377 * @outlen: the length of @out
1378 * @in: a pointer to an array of UTF-8 chars
1379 * @inlen: the length of @in
1380 *
1381 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1382 * plus HTML entities block of chars out.
1383 *
1384 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1385 * The value of @inlen after return is the number of octets consumed
1386 * as the return value is positive, else unpredictiable.
1387 * The value of @outlen after return is the number of octets consumed.
1388 */
1389int
1390UTF8ToHtml(unsigned char* out, int *outlen,
1391 const unsigned char* in, int *inlen) {
1392 const unsigned char* processed = in;
1393 const unsigned char* outend;
1394 const unsigned char* outstart = out;
1395 const unsigned char* instart = in;
1396 const unsigned char* inend;
1397 unsigned int c, d;
1398 int trailing;
1399
1400 if (in == NULL) {
1401 /*
1402 * initialization nothing to do
1403 */
1404 *outlen = 0;
1405 *inlen = 0;
1406 return(0);
1407 }
1408 inend = in + (*inlen);
1409 outend = out + (*outlen);
1410 while (in < inend) {
1411 d = *in++;
1412 if (d < 0x80) { c= d; trailing= 0; }
1413 else if (d < 0xC0) {
1414 /* trailing byte in leading position */
1415 *outlen = out - outstart;
1416 *inlen = processed - instart;
1417 return(-2);
1418 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1419 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1420 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1421 else {
1422 /* no chance for this in Ascii */
1423 *outlen = out - outstart;
1424 *inlen = processed - instart;
1425 return(-2);
1426 }
1427
1428 if (inend - in < trailing) {
1429 break;
1430 }
1431
1432 for ( ; trailing; trailing--) {
1433 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1434 break;
1435 c <<= 6;
1436 c |= d & 0x3F;
1437 }
1438
1439 /* assertion: c is a single UTF-4 value */
1440 if (c < 0x80) {
1441 if (out + 1 >= outend)
1442 break;
1443 *out++ = c;
1444 } else {
1445 int len;
1446 htmlEntityDescPtr ent;
1447
1448 /*
1449 * Try to lookup a predefined HTML entity for it
1450 */
1451
1452 ent = htmlEntityValueLookup(c);
1453 if (ent == NULL) {
1454 /* no chance for this in Ascii */
1455 *outlen = out - outstart;
1456 *inlen = processed - instart;
1457 return(-2);
1458 }
1459 len = strlen(ent->name);
1460 if (out + 2 + len >= outend)
1461 break;
1462 *out++ = '&';
1463 memcpy(out, ent->name, len);
1464 out += len;
1465 *out++ = ';';
1466 }
1467 processed = in;
1468 }
1469 *outlen = out - outstart;
1470 *inlen = processed - instart;
1471 return(0);
1472}
1473
1474/**
1475 * htmlEncodeEntities:
1476 * @out: a pointer to an array of bytes to store the result
1477 * @outlen: the length of @out
1478 * @in: a pointer to an array of UTF-8 chars
1479 * @inlen: the length of @in
1480 * @quoteChar: the quote character to escape (' or ") or zero.
1481 *
1482 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1483 * plus HTML entities block of chars out.
1484 *
1485 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1486 * The value of @inlen after return is the number of octets consumed
1487 * as the return value is positive, else unpredictiable.
1488 * The value of @outlen after return is the number of octets consumed.
1489 */
1490int
1491htmlEncodeEntities(unsigned char* out, int *outlen,
1492 const unsigned char* in, int *inlen, int quoteChar) {
1493 const unsigned char* processed = in;
1494 const unsigned char* outend = out + (*outlen);
1495 const unsigned char* outstart = out;
1496 const unsigned char* instart = in;
1497 const unsigned char* inend = in + (*inlen);
1498 unsigned int c, d;
1499 int trailing;
1500
1501 while (in < inend) {
1502 d = *in++;
1503 if (d < 0x80) { c= d; trailing= 0; }
1504 else if (d < 0xC0) {
1505 /* trailing byte in leading position */
1506 *outlen = out - outstart;
1507 *inlen = processed - instart;
1508 return(-2);
1509 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1510 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1511 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1512 else {
1513 /* no chance for this in Ascii */
1514 *outlen = out - outstart;
1515 *inlen = processed - instart;
1516 return(-2);
1517 }
1518
1519 if (inend - in < trailing)
1520 break;
1521
1522 while (trailing--) {
1523 if (((d= *in++) & 0xC0) != 0x80) {
1524 *outlen = out - outstart;
1525 *inlen = processed - instart;
1526 return(-2);
1527 }
1528 c <<= 6;
1529 c |= d & 0x3F;
1530 }
1531
1532 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001533 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1534 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001535 if (out >= outend)
1536 break;
1537 *out++ = c;
1538 } else {
1539 htmlEntityDescPtr ent;
1540 const char *cp;
1541 char nbuf[16];
1542 int len;
1543
1544 /*
1545 * Try to lookup a predefined HTML entity for it
1546 */
1547 ent = htmlEntityValueLookup(c);
1548 if (ent == NULL) {
1549 sprintf(nbuf, "#%u", c);
1550 cp = nbuf;
1551 }
1552 else
1553 cp = ent->name;
1554 len = strlen(cp);
1555 if (out + 2 + len > outend)
1556 break;
1557 *out++ = '&';
1558 memcpy(out, cp, len);
1559 out += len;
1560 *out++ = ';';
1561 }
1562 processed = in;
1563 }
1564 *outlen = out - outstart;
1565 *inlen = processed - instart;
1566 return(0);
1567}
1568
1569/**
1570 * htmlDecodeEntities:
1571 * @ctxt: the parser context
1572 * @len: the len to decode (in bytes !), -1 for no size limit
1573 * @end: an end marker xmlChar, 0 if none
1574 * @end2: an end marker xmlChar, 0 if none
1575 * @end3: an end marker xmlChar, 0 if none
1576 *
1577 * Subtitute the HTML entities by their value
1578 *
1579 * DEPRECATED !!!!
1580 *
1581 * Returns A newly allocated string with the substitution done. The caller
1582 * must deallocate it !
1583 */
1584xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001585htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1586 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001587 static int deprecated = 0;
1588 if (!deprecated) {
1589 xmlGenericError(xmlGenericErrorContext,
1590 "htmlDecodeEntities() deprecated function reached\n");
1591 deprecated = 1;
1592 }
1593 return(NULL);
1594#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001595 xmlChar *name = NULL;
1596 xmlChar *buffer = NULL;
1597 unsigned int buffer_size = 0;
1598 unsigned int nbchars = 0;
1599 htmlEntityDescPtr ent;
1600 unsigned int max = (unsigned int) len;
1601 int c,l;
1602
1603 if (ctxt->depth > 40) {
1604 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1605 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1606 ctxt->sax->error(ctxt->userData,
1607 "Detected entity reference loop\n");
1608 ctxt->wellFormed = 0;
1609 ctxt->disableSAX = 1;
1610 return(NULL);
1611 }
1612
1613 /*
1614 * allocate a translation buffer.
1615 */
1616 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1617 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1618 if (buffer == NULL) {
1619 perror("xmlDecodeEntities: malloc failed");
1620 return(NULL);
1621 }
1622
1623 /*
1624 * Ok loop until we reach one of the ending char or a size limit.
1625 */
1626 c = CUR_CHAR(l);
1627 while ((nbchars < max) && (c != end) &&
1628 (c != end2) && (c != end3)) {
1629
1630 if (c == 0) break;
1631 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1632 int val = htmlParseCharRef(ctxt);
1633 COPY_BUF(0,buffer,nbchars,val);
1634 NEXTL(l);
1635 } else if ((c == '&') && (ctxt->token != '&')) {
1636 ent = htmlParseEntityRef(ctxt, &name);
1637 if (name != NULL) {
1638 if (ent != NULL) {
1639 int val = ent->value;
1640 COPY_BUF(0,buffer,nbchars,val);
1641 NEXTL(l);
1642 } else {
1643 const xmlChar *cur = name;
1644
1645 buffer[nbchars++] = '&';
1646 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1647 growBuffer(buffer);
1648 }
1649 while (*cur != 0) {
1650 buffer[nbchars++] = *cur++;
1651 }
1652 buffer[nbchars++] = ';';
1653 }
1654 }
1655 } else {
1656 COPY_BUF(l,buffer,nbchars,c);
1657 NEXTL(l);
1658 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1659 growBuffer(buffer);
1660 }
1661 }
1662 c = CUR_CHAR(l);
1663 }
1664 buffer[nbchars++] = 0;
1665 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001666#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001667}
1668
1669/************************************************************************
1670 * *
1671 * Commodity functions to handle streams *
1672 * *
1673 ************************************************************************/
1674
1675/**
Owen Taylor3473f882001-02-23 17:55:21 +00001676 * htmlNewInputStream:
1677 * @ctxt: an HTML parser context
1678 *
1679 * Create a new input stream structure
1680 * Returns the new input stream or NULL
1681 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001682static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001683htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1684 htmlParserInputPtr input;
1685
1686 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1687 if (input == NULL) {
1688 ctxt->errNo = XML_ERR_NO_MEMORY;
1689 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1690 ctxt->sax->error(ctxt->userData,
1691 "malloc: couldn't allocate a new input stream\n");
1692 return(NULL);
1693 }
1694 memset(input, 0, sizeof(htmlParserInput));
1695 input->filename = NULL;
1696 input->directory = NULL;
1697 input->base = NULL;
1698 input->cur = NULL;
1699 input->buf = NULL;
1700 input->line = 1;
1701 input->col = 1;
1702 input->buf = NULL;
1703 input->free = NULL;
1704 input->version = NULL;
1705 input->consumed = 0;
1706 input->length = 0;
1707 return(input);
1708}
1709
1710
1711/************************************************************************
1712 * *
1713 * Commodity functions, cleanup needed ? *
1714 * *
1715 ************************************************************************/
1716
1717/**
1718 * areBlanks:
1719 * @ctxt: an HTML parser context
1720 * @str: a xmlChar *
1721 * @len: the size of @str
1722 *
1723 * Is this a sequence of blank chars that one can ignore ?
1724 *
1725 * Returns 1 if ignorable 0 otherwise.
1726 */
1727
1728static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1729 int i;
1730 xmlNodePtr lastChild;
1731
1732 for (i = 0;i < len;i++)
1733 if (!(IS_BLANK(str[i]))) return(0);
1734
1735 if (CUR == 0) return(1);
1736 if (CUR != '<') return(0);
1737 if (ctxt->name == NULL)
1738 return(1);
1739 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1740 return(1);
1741 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1742 return(1);
1743 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1744 return(1);
1745 if (ctxt->node == NULL) return(0);
1746 lastChild = xmlGetLastChild(ctxt->node);
1747 if (lastChild == NULL) {
1748 if (ctxt->node->content != NULL) return(0);
1749 } else if (xmlNodeIsText(lastChild)) {
1750 return(0);
1751 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1752 return(0);
1753 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1754 return(0);
1755 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1756 return(0);
1757 }
1758 return(1);
1759}
1760
1761/**
Owen Taylor3473f882001-02-23 17:55:21 +00001762 * htmlNewDocNoDtD:
1763 * @URI: URI for the dtd, or NULL
1764 * @ExternalID: the external ID of the DTD, or NULL
1765 *
1766 * Returns a new document, do not intialize the DTD if not provided
1767 */
1768htmlDocPtr
1769htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1770 xmlDocPtr cur;
1771
1772 /*
1773 * Allocate a new document and fill the fields.
1774 */
1775 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1776 if (cur == NULL) {
1777 xmlGenericError(xmlGenericErrorContext,
1778 "xmlNewDoc : malloc failed\n");
1779 return(NULL);
1780 }
1781 memset(cur, 0, sizeof(xmlDoc));
1782
1783 cur->type = XML_HTML_DOCUMENT_NODE;
1784 cur->version = NULL;
1785 cur->intSubset = NULL;
1786 if ((ExternalID != NULL) ||
1787 (URI != NULL))
1788 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1789 cur->doc = cur;
1790 cur->name = NULL;
1791 cur->children = NULL;
1792 cur->extSubset = NULL;
1793 cur->oldNs = NULL;
1794 cur->encoding = NULL;
1795 cur->standalone = 1;
1796 cur->compression = 0;
1797 cur->ids = NULL;
1798 cur->refs = NULL;
1799#ifndef XML_WITHOUT_CORBA
1800 cur->_private = NULL;
1801#endif
1802 return(cur);
1803}
1804
1805/**
1806 * htmlNewDoc:
1807 * @URI: URI for the dtd, or NULL
1808 * @ExternalID: the external ID of the DTD, or NULL
1809 *
1810 * Returns a new document
1811 */
1812htmlDocPtr
1813htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1814 if ((URI == NULL) && (ExternalID == NULL))
1815 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001816 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1817 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001818
1819 return(htmlNewDocNoDtD(URI, ExternalID));
1820}
1821
1822
1823/************************************************************************
1824 * *
1825 * The parser itself *
1826 * Relates to http://www.w3.org/TR/html40 *
1827 * *
1828 ************************************************************************/
1829
1830/************************************************************************
1831 * *
1832 * The parser itself *
1833 * *
1834 ************************************************************************/
1835
1836/**
1837 * htmlParseHTMLName:
1838 * @ctxt: an HTML parser context
1839 *
1840 * parse an HTML tag or attribute name, note that we convert it to lowercase
1841 * since HTML names are not case-sensitive.
1842 *
1843 * Returns the Tag Name parsed or NULL
1844 */
1845
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001846static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001847htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1848 xmlChar *ret = NULL;
1849 int i = 0;
1850 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1851
1852 if (!IS_LETTER(CUR) && (CUR != '_') &&
1853 (CUR != ':')) return(NULL);
1854
1855 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1856 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1857 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1858 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1859 else loc[i] = CUR;
1860 i++;
1861
1862 NEXT;
1863 }
1864
1865 ret = xmlStrndup(loc, i);
1866
1867 return(ret);
1868}
1869
1870/**
1871 * htmlParseName:
1872 * @ctxt: an HTML parser context
1873 *
1874 * parse an HTML name, this routine is case sensistive.
1875 *
1876 * Returns the Name parsed or NULL
1877 */
1878
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001879static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001880htmlParseName(htmlParserCtxtPtr ctxt) {
1881 xmlChar buf[HTML_MAX_NAMELEN];
1882 int len = 0;
1883
1884 GROW;
1885 if (!IS_LETTER(CUR) && (CUR != '_')) {
1886 return(NULL);
1887 }
1888
1889 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1890 (CUR == '.') || (CUR == '-') ||
1891 (CUR == '_') || (CUR == ':') ||
1892 (IS_COMBINING(CUR)) ||
1893 (IS_EXTENDER(CUR))) {
1894 buf[len++] = CUR;
1895 NEXT;
1896 if (len >= HTML_MAX_NAMELEN) {
1897 xmlGenericError(xmlGenericErrorContext,
1898 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1899 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1900 (CUR == '.') || (CUR == '-') ||
1901 (CUR == '_') || (CUR == ':') ||
1902 (IS_COMBINING(CUR)) ||
1903 (IS_EXTENDER(CUR)))
1904 NEXT;
1905 break;
1906 }
1907 }
1908 return(xmlStrndup(buf, len));
1909}
1910
1911/**
1912 * htmlParseHTMLAttribute:
1913 * @ctxt: an HTML parser context
1914 * @stop: a char stop value
1915 *
1916 * parse an HTML attribute value till the stop (quote), if
1917 * stop is 0 then it stops at the first space
1918 *
1919 * Returns the attribute parsed or NULL
1920 */
1921
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001922static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001923htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1924 xmlChar *buffer = NULL;
1925 int buffer_size = 0;
1926 xmlChar *out = NULL;
1927 xmlChar *name = NULL;
1928
1929 xmlChar *cur = NULL;
1930 htmlEntityDescPtr ent;
1931
1932 /*
1933 * allocate a translation buffer.
1934 */
1935 buffer_size = HTML_PARSER_BUFFER_SIZE;
1936 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1937 if (buffer == NULL) {
1938 perror("htmlParseHTMLAttribute: malloc failed");
1939 return(NULL);
1940 }
1941 out = buffer;
1942
1943 /*
1944 * Ok loop until we reach one of the ending chars
1945 */
1946 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1947 if ((stop == 0) && (IS_BLANK(CUR))) break;
1948 if (CUR == '&') {
1949 if (NXT(1) == '#') {
1950 unsigned int c;
1951 int bits;
1952
1953 c = htmlParseCharRef(ctxt);
1954 if (c < 0x80)
1955 { *out++ = c; bits= -6; }
1956 else if (c < 0x800)
1957 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1958 else if (c < 0x10000)
1959 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1960 else
1961 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1962
1963 for ( ; bits >= 0; bits-= 6) {
1964 *out++ = ((c >> bits) & 0x3F) | 0x80;
1965 }
1966 } else {
1967 ent = htmlParseEntityRef(ctxt, &name);
1968 if (name == NULL) {
1969 *out++ = '&';
1970 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001971 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001972
1973 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001974 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001975 }
1976 } else if (ent == NULL) {
1977 *out++ = '&';
1978 cur = name;
1979 while (*cur != 0) {
1980 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001981 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001982
1983 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001984 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001985 }
1986 *out++ = *cur++;
1987 }
1988 xmlFree(name);
1989 } else {
1990 unsigned int c;
1991 int bits;
1992
1993 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001994 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001995
1996 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001997 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001998 }
1999 c = (xmlChar)ent->value;
2000 if (c < 0x80)
2001 { *out++ = c; bits= -6; }
2002 else if (c < 0x800)
2003 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2004 else if (c < 0x10000)
2005 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2006 else
2007 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2008
2009 for ( ; bits >= 0; bits-= 6) {
2010 *out++ = ((c >> bits) & 0x3F) | 0x80;
2011 }
2012 xmlFree(name);
2013 }
2014 }
2015 } else {
2016 unsigned int c;
2017 int bits, l;
2018
2019 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002020 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002021
2022 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002023 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002024 }
2025 c = CUR_CHAR(l);
2026 if (c < 0x80)
2027 { *out++ = c; bits= -6; }
2028 else if (c < 0x800)
2029 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2030 else if (c < 0x10000)
2031 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2032 else
2033 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2034
2035 for ( ; bits >= 0; bits-= 6) {
2036 *out++ = ((c >> bits) & 0x3F) | 0x80;
2037 }
2038 NEXT;
2039 }
2040 }
2041 *out++ = 0;
2042 return(buffer);
2043}
2044
2045/**
Owen Taylor3473f882001-02-23 17:55:21 +00002046 * htmlParseEntityRef:
2047 * @ctxt: an HTML parser context
2048 * @str: location to store the entity name
2049 *
2050 * parse an HTML ENTITY references
2051 *
2052 * [68] EntityRef ::= '&' Name ';'
2053 *
2054 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2055 * if non-NULL *str will have to be freed by the caller.
2056 */
2057htmlEntityDescPtr
2058htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2059 xmlChar *name;
2060 htmlEntityDescPtr ent = NULL;
2061 *str = NULL;
2062
2063 if (CUR == '&') {
2064 NEXT;
2065 name = htmlParseName(ctxt);
2066 if (name == NULL) {
2067 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2068 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2069 ctxt->wellFormed = 0;
2070 } else {
2071 GROW;
2072 if (CUR == ';') {
2073 *str = name;
2074
2075 /*
2076 * Lookup the entity in the table.
2077 */
2078 ent = htmlEntityLookup(name);
2079 if (ent != NULL) /* OK that's ugly !!! */
2080 NEXT;
2081 } else {
2082 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2083 ctxt->sax->error(ctxt->userData,
2084 "htmlParseEntityRef: expecting ';'\n");
2085 *str = name;
2086 }
2087 }
2088 }
2089 return(ent);
2090}
2091
2092/**
2093 * htmlParseAttValue:
2094 * @ctxt: an HTML parser context
2095 *
2096 * parse a value for an attribute
2097 * Note: the parser won't do substitution of entities here, this
2098 * will be handled later in xmlStringGetNodeList, unless it was
2099 * asked for ctxt->replaceEntities != 0
2100 *
2101 * Returns the AttValue parsed or NULL.
2102 */
2103
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002104static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002105htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2106 xmlChar *ret = NULL;
2107
2108 if (CUR == '"') {
2109 NEXT;
2110 ret = htmlParseHTMLAttribute(ctxt, '"');
2111 if (CUR != '"') {
2112 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2113 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2114 ctxt->wellFormed = 0;
2115 } else
2116 NEXT;
2117 } else if (CUR == '\'') {
2118 NEXT;
2119 ret = htmlParseHTMLAttribute(ctxt, '\'');
2120 if (CUR != '\'') {
2121 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2122 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2123 ctxt->wellFormed = 0;
2124 } else
2125 NEXT;
2126 } else {
2127 /*
2128 * That's an HTMLism, the attribute value may not be quoted
2129 */
2130 ret = htmlParseHTMLAttribute(ctxt, 0);
2131 if (ret == NULL) {
2132 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2133 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2134 ctxt->wellFormed = 0;
2135 }
2136 }
2137 return(ret);
2138}
2139
2140/**
2141 * htmlParseSystemLiteral:
2142 * @ctxt: an HTML parser context
2143 *
2144 * parse an HTML Literal
2145 *
2146 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2147 *
2148 * Returns the SystemLiteral parsed or NULL
2149 */
2150
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002151static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002152htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2153 const xmlChar *q;
2154 xmlChar *ret = NULL;
2155
2156 if (CUR == '"') {
2157 NEXT;
2158 q = CUR_PTR;
2159 while ((IS_CHAR(CUR)) && (CUR != '"'))
2160 NEXT;
2161 if (!IS_CHAR(CUR)) {
2162 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2163 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2164 ctxt->wellFormed = 0;
2165 } else {
2166 ret = xmlStrndup(q, CUR_PTR - q);
2167 NEXT;
2168 }
2169 } else if (CUR == '\'') {
2170 NEXT;
2171 q = CUR_PTR;
2172 while ((IS_CHAR(CUR)) && (CUR != '\''))
2173 NEXT;
2174 if (!IS_CHAR(CUR)) {
2175 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2176 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2177 ctxt->wellFormed = 0;
2178 } else {
2179 ret = xmlStrndup(q, CUR_PTR - q);
2180 NEXT;
2181 }
2182 } else {
2183 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2184 ctxt->sax->error(ctxt->userData,
2185 "SystemLiteral \" or ' expected\n");
2186 ctxt->wellFormed = 0;
2187 }
2188
2189 return(ret);
2190}
2191
2192/**
2193 * htmlParsePubidLiteral:
2194 * @ctxt: an HTML parser context
2195 *
2196 * parse an HTML public literal
2197 *
2198 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2199 *
2200 * Returns the PubidLiteral parsed or NULL.
2201 */
2202
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002203static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002204htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2205 const xmlChar *q;
2206 xmlChar *ret = NULL;
2207 /*
2208 * Name ::= (Letter | '_') (NameChar)*
2209 */
2210 if (CUR == '"') {
2211 NEXT;
2212 q = CUR_PTR;
2213 while (IS_PUBIDCHAR(CUR)) NEXT;
2214 if (CUR != '"') {
2215 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2216 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2217 ctxt->wellFormed = 0;
2218 } else {
2219 ret = xmlStrndup(q, CUR_PTR - q);
2220 NEXT;
2221 }
2222 } else if (CUR == '\'') {
2223 NEXT;
2224 q = CUR_PTR;
2225 while ((IS_LETTER(CUR)) && (CUR != '\''))
2226 NEXT;
2227 if (!IS_LETTER(CUR)) {
2228 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2229 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2230 ctxt->wellFormed = 0;
2231 } else {
2232 ret = xmlStrndup(q, CUR_PTR - q);
2233 NEXT;
2234 }
2235 } else {
2236 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2237 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2238 ctxt->wellFormed = 0;
2239 }
2240
2241 return(ret);
2242}
2243
2244/**
2245 * htmlParseScript:
2246 * @ctxt: an HTML parser context
2247 *
2248 * parse the content of an HTML SCRIPT or STYLE element
2249 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2250 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2251 * http://www.w3.org/TR/html4/types.html#type-script
2252 * http://www.w3.org/TR/html4/types.html#h-6.15
2253 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2254 *
2255 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2256 * element and the value of intrinsic event attributes. User agents must
2257 * not evaluate script data as HTML markup but instead must pass it on as
2258 * data to a script engine.
2259 * NOTES:
2260 * - The content is passed like CDATA
2261 * - the attributes for style and scripting "onXXX" are also described
2262 * as CDATA but SGML allows entities references in attributes so their
2263 * processing is identical as other attributes
2264 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002265static void
Owen Taylor3473f882001-02-23 17:55:21 +00002266htmlParseScript(htmlParserCtxtPtr ctxt) {
2267 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2268 int nbchar = 0;
2269 xmlChar cur;
2270
2271 SHRINK;
2272 cur = CUR;
2273 while (IS_CHAR(cur)) {
2274 if ((cur == '<') && (NXT(1) == '/')) {
2275 /*
2276 * One should break here, the specification is clear:
2277 * Authors should therefore escape "</" within the content.
2278 * Escape mechanisms are specific to each scripting or
2279 * style sheet language.
2280 */
2281 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2282 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2283 break; /* while */
2284 }
2285 buf[nbchar++] = cur;
2286 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2287 if (ctxt->sax->cdataBlock!= NULL) {
2288 /*
2289 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2290 */
2291 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2292 }
2293 nbchar = 0;
2294 }
2295 NEXT;
2296 cur = CUR;
2297 }
2298 if (!(IS_CHAR(cur))) {
2299 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2300 ctxt->sax->error(ctxt->userData,
2301 "Invalid char in CDATA 0x%X\n", cur);
2302 ctxt->wellFormed = 0;
2303 NEXT;
2304 }
2305
2306 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2307 if (ctxt->sax->cdataBlock!= NULL) {
2308 /*
2309 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2310 */
2311 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2312 }
2313 }
2314}
2315
2316
2317/**
2318 * htmlParseCharData:
2319 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002320 *
2321 * parse a CharData section.
2322 * if we are within a CDATA section ']]>' marks an end of section.
2323 *
2324 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2325 */
2326
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002327static void
2328htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002329 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2330 int nbchar = 0;
2331 int cur, l;
2332
2333 SHRINK;
2334 cur = CUR_CHAR(l);
2335 while (((cur != '<') || (ctxt->token == '<')) &&
2336 ((cur != '&') || (ctxt->token == '&')) &&
2337 (IS_CHAR(cur))) {
2338 COPY_BUF(l,buf,nbchar,cur);
2339 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2340 /*
2341 * Ok the segment is to be consumed as chars.
2342 */
2343 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2344 if (areBlanks(ctxt, buf, nbchar)) {
2345 if (ctxt->sax->ignorableWhitespace != NULL)
2346 ctxt->sax->ignorableWhitespace(ctxt->userData,
2347 buf, nbchar);
2348 } else {
2349 htmlCheckParagraph(ctxt);
2350 if (ctxt->sax->characters != NULL)
2351 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2352 }
2353 }
2354 nbchar = 0;
2355 }
2356 NEXTL(l);
2357 cur = CUR_CHAR(l);
2358 }
2359 if (nbchar != 0) {
2360 /*
2361 * Ok the segment is to be consumed as chars.
2362 */
2363 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2364 if (areBlanks(ctxt, buf, nbchar)) {
2365 if (ctxt->sax->ignorableWhitespace != NULL)
2366 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2367 } else {
2368 htmlCheckParagraph(ctxt);
2369 if (ctxt->sax->characters != NULL)
2370 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2371 }
2372 }
2373 }
2374}
2375
2376/**
2377 * htmlParseExternalID:
2378 * @ctxt: an HTML parser context
2379 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002380 *
2381 * Parse an External ID or a Public ID
2382 *
Owen Taylor3473f882001-02-23 17:55:21 +00002383 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2384 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2385 *
2386 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2387 *
2388 * Returns the function returns SystemLiteral and in the second
2389 * case publicID receives PubidLiteral, is strict is off
2390 * it is possible to return NULL and have publicID set.
2391 */
2392
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002393static xmlChar *
2394htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002395 xmlChar *URI = NULL;
2396
2397 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2398 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2399 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2400 SKIP(6);
2401 if (!IS_BLANK(CUR)) {
2402 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2403 ctxt->sax->error(ctxt->userData,
2404 "Space required after 'SYSTEM'\n");
2405 ctxt->wellFormed = 0;
2406 }
2407 SKIP_BLANKS;
2408 URI = htmlParseSystemLiteral(ctxt);
2409 if (URI == NULL) {
2410 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2411 ctxt->sax->error(ctxt->userData,
2412 "htmlParseExternalID: SYSTEM, no URI\n");
2413 ctxt->wellFormed = 0;
2414 }
2415 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2416 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2417 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2418 SKIP(6);
2419 if (!IS_BLANK(CUR)) {
2420 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2421 ctxt->sax->error(ctxt->userData,
2422 "Space required after 'PUBLIC'\n");
2423 ctxt->wellFormed = 0;
2424 }
2425 SKIP_BLANKS;
2426 *publicID = htmlParsePubidLiteral(ctxt);
2427 if (*publicID == NULL) {
2428 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2429 ctxt->sax->error(ctxt->userData,
2430 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2431 ctxt->wellFormed = 0;
2432 }
2433 SKIP_BLANKS;
2434 if ((CUR == '"') || (CUR == '\'')) {
2435 URI = htmlParseSystemLiteral(ctxt);
2436 }
2437 }
2438 return(URI);
2439}
2440
2441/**
2442 * htmlParseComment:
2443 * @ctxt: an HTML parser context
2444 *
2445 * Parse an XML (SGML) comment <!-- .... -->
2446 *
2447 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2448 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002449static void
Owen Taylor3473f882001-02-23 17:55:21 +00002450htmlParseComment(htmlParserCtxtPtr ctxt) {
2451 xmlChar *buf = NULL;
2452 int len;
2453 int size = HTML_PARSER_BUFFER_SIZE;
2454 int q, ql;
2455 int r, rl;
2456 int cur, l;
2457 xmlParserInputState state;
2458
2459 /*
2460 * Check that there is a comment right here.
2461 */
2462 if ((RAW != '<') || (NXT(1) != '!') ||
2463 (NXT(2) != '-') || (NXT(3) != '-')) return;
2464
2465 state = ctxt->instate;
2466 ctxt->instate = XML_PARSER_COMMENT;
2467 SHRINK;
2468 SKIP(4);
2469 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2470 if (buf == NULL) {
2471 xmlGenericError(xmlGenericErrorContext,
2472 "malloc of %d byte failed\n", size);
2473 ctxt->instate = state;
2474 return;
2475 }
2476 q = CUR_CHAR(ql);
2477 NEXTL(ql);
2478 r = CUR_CHAR(rl);
2479 NEXTL(rl);
2480 cur = CUR_CHAR(l);
2481 len = 0;
2482 while (IS_CHAR(cur) &&
2483 ((cur != '>') ||
2484 (r != '-') || (q != '-'))) {
2485 if (len + 5 >= size) {
2486 size *= 2;
2487 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2488 if (buf == NULL) {
2489 xmlGenericError(xmlGenericErrorContext,
2490 "realloc of %d byte failed\n", size);
2491 ctxt->instate = state;
2492 return;
2493 }
2494 }
2495 COPY_BUF(ql,buf,len,q);
2496 q = r;
2497 ql = rl;
2498 r = cur;
2499 rl = l;
2500 NEXTL(l);
2501 cur = CUR_CHAR(l);
2502 if (cur == 0) {
2503 SHRINK;
2504 GROW;
2505 cur = CUR_CHAR(l);
2506 }
2507 }
2508 buf[len] = 0;
2509 if (!IS_CHAR(cur)) {
2510 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2511 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2512 ctxt->sax->error(ctxt->userData,
2513 "Comment not terminated \n<!--%.50s\n", buf);
2514 ctxt->wellFormed = 0;
2515 xmlFree(buf);
2516 } else {
2517 NEXT;
2518 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2519 (!ctxt->disableSAX))
2520 ctxt->sax->comment(ctxt->userData, buf);
2521 xmlFree(buf);
2522 }
2523 ctxt->instate = state;
2524}
2525
2526/**
2527 * htmlParseCharRef:
2528 * @ctxt: an HTML parser context
2529 *
2530 * parse Reference declarations
2531 *
2532 * [66] CharRef ::= '&#' [0-9]+ ';' |
2533 * '&#x' [0-9a-fA-F]+ ';'
2534 *
2535 * Returns the value parsed (as an int)
2536 */
2537int
2538htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2539 int val = 0;
2540
2541 if ((CUR == '&') && (NXT(1) == '#') &&
2542 (NXT(2) == 'x')) {
2543 SKIP(3);
2544 while (CUR != ';') {
2545 if ((CUR >= '0') && (CUR <= '9'))
2546 val = val * 16 + (CUR - '0');
2547 else if ((CUR >= 'a') && (CUR <= 'f'))
2548 val = val * 16 + (CUR - 'a') + 10;
2549 else if ((CUR >= 'A') && (CUR <= 'F'))
2550 val = val * 16 + (CUR - 'A') + 10;
2551 else {
2552 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2553 ctxt->sax->error(ctxt->userData,
2554 "htmlParseCharRef: invalid hexadecimal value\n");
2555 ctxt->wellFormed = 0;
2556 return(0);
2557 }
2558 NEXT;
2559 }
2560 if (CUR == ';')
2561 NEXT;
2562 } else if ((CUR == '&') && (NXT(1) == '#')) {
2563 SKIP(2);
2564 while (CUR != ';') {
2565 if ((CUR >= '0') && (CUR <= '9'))
2566 val = val * 10 + (CUR - '0');
2567 else {
2568 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2569 ctxt->sax->error(ctxt->userData,
2570 "htmlParseCharRef: invalid decimal value\n");
2571 ctxt->wellFormed = 0;
2572 return(0);
2573 }
2574 NEXT;
2575 }
2576 if (CUR == ';')
2577 NEXT;
2578 } else {
2579 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2580 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2581 ctxt->wellFormed = 0;
2582 }
2583 /*
2584 * Check the value IS_CHAR ...
2585 */
2586 if (IS_CHAR(val)) {
2587 return(val);
2588 } else {
2589 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2590 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2591 val);
2592 ctxt->wellFormed = 0;
2593 }
2594 return(0);
2595}
2596
2597
2598/**
2599 * htmlParseDocTypeDecl :
2600 * @ctxt: an HTML parser context
2601 *
2602 * parse a DOCTYPE declaration
2603 *
2604 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2605 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2606 */
2607
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002608static void
Owen Taylor3473f882001-02-23 17:55:21 +00002609htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2610 xmlChar *name;
2611 xmlChar *ExternalID = NULL;
2612 xmlChar *URI = NULL;
2613
2614 /*
2615 * We know that '<!DOCTYPE' has been detected.
2616 */
2617 SKIP(9);
2618
2619 SKIP_BLANKS;
2620
2621 /*
2622 * Parse the DOCTYPE name.
2623 */
2624 name = htmlParseName(ctxt);
2625 if (name == NULL) {
2626 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2627 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2628 ctxt->wellFormed = 0;
2629 }
2630 /*
2631 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2632 */
2633
2634 SKIP_BLANKS;
2635
2636 /*
2637 * Check for SystemID and ExternalID
2638 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002639 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002640 SKIP_BLANKS;
2641
2642 /*
2643 * We should be at the end of the DOCTYPE declaration.
2644 */
2645 if (CUR != '>') {
2646 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2647 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2648 ctxt->wellFormed = 0;
2649 /* We shouldn't try to resynchronize ... */
2650 }
2651 NEXT;
2652
2653 /*
2654 * Create or update the document accordingly to the DOCTYPE
2655 */
2656 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2657 (!ctxt->disableSAX))
2658 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2659
2660 /*
2661 * Cleanup, since we don't use all those identifiers
2662 */
2663 if (URI != NULL) xmlFree(URI);
2664 if (ExternalID != NULL) xmlFree(ExternalID);
2665 if (name != NULL) xmlFree(name);
2666}
2667
2668/**
2669 * htmlParseAttribute:
2670 * @ctxt: an HTML parser context
2671 * @value: a xmlChar ** used to store the value of the attribute
2672 *
2673 * parse an attribute
2674 *
2675 * [41] Attribute ::= Name Eq AttValue
2676 *
2677 * [25] Eq ::= S? '=' S?
2678 *
2679 * With namespace:
2680 *
2681 * [NS 11] Attribute ::= QName Eq AttValue
2682 *
2683 * Also the case QName == xmlns:??? is handled independently as a namespace
2684 * definition.
2685 *
2686 * Returns the attribute name, and the value in *value.
2687 */
2688
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002689static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002690htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2691 xmlChar *name, *val = NULL;
2692
2693 *value = NULL;
2694 name = htmlParseHTMLName(ctxt);
2695 if (name == NULL) {
2696 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2697 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2698 ctxt->wellFormed = 0;
2699 return(NULL);
2700 }
2701
2702 /*
2703 * read the value
2704 */
2705 SKIP_BLANKS;
2706 if (CUR == '=') {
2707 NEXT;
2708 SKIP_BLANKS;
2709 val = htmlParseAttValue(ctxt);
2710 /******
2711 } else {
2712 * TODO : some attribute must have values, some may not
2713 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2714 ctxt->sax->warning(ctxt->userData,
2715 "No value for attribute %s\n", name); */
2716 }
2717
2718 *value = val;
2719 return(name);
2720}
2721
2722/**
2723 * htmlCheckEncoding:
2724 * @ctxt: an HTML parser context
2725 * @attvalue: the attribute value
2726 *
2727 * Checks an http-equiv attribute from a Meta tag to detect
2728 * the encoding
2729 * If a new encoding is detected the parser is switched to decode
2730 * it and pass UTF8
2731 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002732static void
Owen Taylor3473f882001-02-23 17:55:21 +00002733htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2734 const xmlChar *encoding;
2735
2736 if ((ctxt == NULL) || (attvalue == NULL))
2737 return;
2738
2739 /* do not change encoding */
2740 if (ctxt->input->encoding != NULL)
2741 return;
2742
2743 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2744 if (encoding != NULL) {
2745 encoding += 8;
2746 } else {
2747 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2748 if (encoding != NULL)
2749 encoding += 9;
2750 }
2751 if (encoding != NULL) {
2752 xmlCharEncoding enc;
2753 xmlCharEncodingHandlerPtr handler;
2754
2755 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2756
2757 if (ctxt->input->encoding != NULL)
2758 xmlFree((xmlChar *) ctxt->input->encoding);
2759 ctxt->input->encoding = xmlStrdup(encoding);
2760
2761 enc = xmlParseCharEncoding((const char *) encoding);
2762 /*
2763 * registered set of known encodings
2764 */
2765 if (enc != XML_CHAR_ENCODING_ERROR) {
2766 xmlSwitchEncoding(ctxt, enc);
2767 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2768 } else {
2769 /*
2770 * fallback for unknown encodings
2771 */
2772 handler = xmlFindCharEncodingHandler((const char *) encoding);
2773 if (handler != NULL) {
2774 xmlSwitchToEncoding(ctxt, handler);
2775 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2776 } else {
2777 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2778 }
2779 }
2780
2781 if ((ctxt->input->buf != NULL) &&
2782 (ctxt->input->buf->encoder != NULL) &&
2783 (ctxt->input->buf->raw != NULL) &&
2784 (ctxt->input->buf->buffer != NULL)) {
2785 int nbchars;
2786 int processed;
2787
2788 /*
2789 * convert as much as possible to the parser reading buffer.
2790 */
2791 processed = ctxt->input->cur - ctxt->input->base;
2792 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2793 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2794 ctxt->input->buf->buffer,
2795 ctxt->input->buf->raw);
2796 if (nbchars < 0) {
2797 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2798 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2799 ctxt->sax->error(ctxt->userData,
2800 "htmlCheckEncoding: encoder error\n");
2801 }
2802 ctxt->input->base =
2803 ctxt->input->cur = ctxt->input->buf->buffer->content;
2804 }
2805 }
2806}
2807
2808/**
2809 * htmlCheckMeta:
2810 * @ctxt: an HTML parser context
2811 * @atts: the attributes values
2812 *
2813 * Checks an attributes from a Meta tag
2814 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002815static void
Owen Taylor3473f882001-02-23 17:55:21 +00002816htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2817 int i;
2818 const xmlChar *att, *value;
2819 int http = 0;
2820 const xmlChar *content = NULL;
2821
2822 if ((ctxt == NULL) || (atts == NULL))
2823 return;
2824
2825 i = 0;
2826 att = atts[i++];
2827 while (att != NULL) {
2828 value = atts[i++];
2829 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2830 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2831 http = 1;
2832 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2833 content = value;
2834 att = atts[i++];
2835 }
2836 if ((http) && (content != NULL))
2837 htmlCheckEncoding(ctxt, content);
2838
2839}
2840
2841/**
2842 * htmlParseStartTag:
2843 * @ctxt: an HTML parser context
2844 *
2845 * parse a start of tag either for rule element or
2846 * EmptyElement. In both case we don't parse the tag closing chars.
2847 *
2848 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2849 *
2850 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2851 *
2852 * With namespace:
2853 *
2854 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2855 *
2856 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2857 *
2858 */
2859
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002860static void
Owen Taylor3473f882001-02-23 17:55:21 +00002861htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2862 xmlChar *name;
2863 xmlChar *attname;
2864 xmlChar *attvalue;
2865 const xmlChar **atts = NULL;
2866 int nbatts = 0;
2867 int maxatts = 0;
2868 int meta = 0;
2869 int i;
2870
2871 if (CUR != '<') return;
2872 NEXT;
2873
2874 GROW;
2875 name = htmlParseHTMLName(ctxt);
2876 if (name == NULL) {
2877 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2878 ctxt->sax->error(ctxt->userData,
2879 "htmlParseStartTag: invalid element name\n");
2880 ctxt->wellFormed = 0;
2881 /* Dump the bogus tag like browsers do */
2882 while ((IS_CHAR(CUR)) && (CUR != '>'))
2883 NEXT;
2884 return;
2885 }
2886 if (xmlStrEqual(name, BAD_CAST"meta"))
2887 meta = 1;
2888
2889 /*
2890 * Check for auto-closure of HTML elements.
2891 */
2892 htmlAutoClose(ctxt, name);
2893
2894 /*
2895 * Check for implied HTML elements.
2896 */
2897 htmlCheckImplied(ctxt, name);
2898
2899 /*
2900 * Avoid html at any level > 0, head at any level != 1
2901 * or any attempt to recurse body
2902 */
2903 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2904 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2905 ctxt->sax->error(ctxt->userData,
2906 "htmlParseStartTag: misplaced <html> tag\n");
2907 ctxt->wellFormed = 0;
2908 xmlFree(name);
2909 return;
2910 }
2911 if ((ctxt->nameNr != 1) &&
2912 (xmlStrEqual(name, BAD_CAST"head"))) {
2913 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2914 ctxt->sax->error(ctxt->userData,
2915 "htmlParseStartTag: misplaced <head> tag\n");
2916 ctxt->wellFormed = 0;
2917 xmlFree(name);
2918 return;
2919 }
2920 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002921 int indx;
2922 for (indx = 0;indx < ctxt->nameNr;indx++) {
2923 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002924 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2925 ctxt->sax->error(ctxt->userData,
2926 "htmlParseStartTag: misplaced <body> tag\n");
2927 ctxt->wellFormed = 0;
2928 xmlFree(name);
2929 return;
2930 }
2931 }
2932 }
2933
2934 /*
2935 * Now parse the attributes, it ends up with the ending
2936 *
2937 * (S Attribute)* S?
2938 */
2939 SKIP_BLANKS;
2940 while ((IS_CHAR(CUR)) &&
2941 (CUR != '>') &&
2942 ((CUR != '/') || (NXT(1) != '>'))) {
2943 long cons = ctxt->nbChars;
2944
2945 GROW;
2946 attname = htmlParseAttribute(ctxt, &attvalue);
2947 if (attname != NULL) {
2948
2949 /*
2950 * Well formedness requires at most one declaration of an attribute
2951 */
2952 for (i = 0; i < nbatts;i += 2) {
2953 if (xmlStrEqual(atts[i], attname)) {
2954 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2955 ctxt->sax->error(ctxt->userData,
2956 "Attribute %s redefined\n",
2957 attname);
2958 ctxt->wellFormed = 0;
2959 xmlFree(attname);
2960 if (attvalue != NULL)
2961 xmlFree(attvalue);
2962 goto failed;
2963 }
2964 }
2965
2966 /*
2967 * Add the pair to atts
2968 */
2969 if (atts == NULL) {
2970 maxatts = 10;
2971 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2972 if (atts == NULL) {
2973 xmlGenericError(xmlGenericErrorContext,
2974 "malloc of %ld byte failed\n",
2975 maxatts * (long)sizeof(xmlChar *));
2976 if (name != NULL) xmlFree(name);
2977 return;
2978 }
2979 } else if (nbatts + 4 > maxatts) {
2980 maxatts *= 2;
2981 atts = (const xmlChar **) xmlRealloc((void *) atts,
2982 maxatts * sizeof(xmlChar *));
2983 if (atts == NULL) {
2984 xmlGenericError(xmlGenericErrorContext,
2985 "realloc of %ld byte failed\n",
2986 maxatts * (long)sizeof(xmlChar *));
2987 if (name != NULL) xmlFree(name);
2988 return;
2989 }
2990 }
2991 atts[nbatts++] = attname;
2992 atts[nbatts++] = attvalue;
2993 atts[nbatts] = NULL;
2994 atts[nbatts + 1] = NULL;
2995 }
2996 else {
2997 /* Dump the bogus attribute string up to the next blank or
2998 * the end of the tag. */
2999 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3000 && ((CUR != '/') || (NXT(1) != '>')))
3001 NEXT;
3002 }
3003
3004failed:
3005 SKIP_BLANKS;
3006 if (cons == ctxt->nbChars) {
3007 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3008 ctxt->sax->error(ctxt->userData,
3009 "htmlParseStartTag: problem parsing attributes\n");
3010 ctxt->wellFormed = 0;
3011 break;
3012 }
3013 }
3014
3015 /*
3016 * Handle specific association to the META tag
3017 */
3018 if (meta)
3019 htmlCheckMeta(ctxt, atts);
3020
3021 /*
3022 * SAX: Start of Element !
3023 */
3024 htmlnamePush(ctxt, xmlStrdup(name));
3025#ifdef DEBUG
3026 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3027#endif
3028 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3029 ctxt->sax->startElement(ctxt->userData, name, atts);
3030
3031 if (atts != NULL) {
3032 for (i = 0;i < nbatts;i++) {
3033 if (atts[i] != NULL)
3034 xmlFree((xmlChar *) atts[i]);
3035 }
3036 xmlFree((void *) atts);
3037 }
3038 if (name != NULL) xmlFree(name);
3039}
3040
3041/**
3042 * htmlParseEndTag:
3043 * @ctxt: an HTML parser context
3044 *
3045 * parse an end of tag
3046 *
3047 * [42] ETag ::= '</' Name S? '>'
3048 *
3049 * With namespace
3050 *
3051 * [NS 9] ETag ::= '</' QName S? '>'
3052 */
3053
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003054static void
Owen Taylor3473f882001-02-23 17:55:21 +00003055htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3056 xmlChar *name;
3057 xmlChar *oldname;
3058 int i;
3059
3060 if ((CUR != '<') || (NXT(1) != '/')) {
3061 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3062 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3063 ctxt->wellFormed = 0;
3064 return;
3065 }
3066 SKIP(2);
3067
3068 name = htmlParseHTMLName(ctxt);
3069 if (name == NULL) return;
3070
3071 /*
3072 * We should definitely be at the ending "S? '>'" part
3073 */
3074 SKIP_BLANKS;
3075 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3076 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3077 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3078 ctxt->wellFormed = 0;
3079 } else
3080 NEXT;
3081
3082 /*
3083 * If the name read is not one of the element in the parsing stack
3084 * then return, it's just an error.
3085 */
3086 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3087 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3088 }
3089 if (i < 0) {
3090 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3091 ctxt->sax->error(ctxt->userData,
3092 "Unexpected end tag : %s\n", name);
3093 xmlFree(name);
3094 ctxt->wellFormed = 0;
3095 return;
3096 }
3097
3098
3099 /*
3100 * Check for auto-closure of HTML elements.
3101 */
3102
3103 htmlAutoCloseOnClose(ctxt, name);
3104
3105 /*
3106 * Well formedness constraints, opening and closing must match.
3107 * With the exception that the autoclose may have popped stuff out
3108 * of the stack.
3109 */
3110 if (!xmlStrEqual(name, ctxt->name)) {
3111#ifdef DEBUG
3112 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3113#endif
3114 if ((ctxt->name != NULL) &&
3115 (!xmlStrEqual(ctxt->name, name))) {
3116 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3117 ctxt->sax->error(ctxt->userData,
3118 "Opening and ending tag mismatch: %s and %s\n",
3119 name, ctxt->name);
3120 ctxt->wellFormed = 0;
3121 }
3122 }
3123
3124 /*
3125 * SAX: End of Tag
3126 */
3127 oldname = ctxt->name;
3128 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3129 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3130 ctxt->sax->endElement(ctxt->userData, name);
3131 oldname = htmlnamePop(ctxt);
3132 if (oldname != NULL) {
3133#ifdef DEBUG
3134 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3135#endif
3136 xmlFree(oldname);
3137#ifdef DEBUG
3138 } else {
3139 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3140#endif
3141 }
3142 }
3143
3144 if (name != NULL)
3145 xmlFree(name);
3146
3147 return;
3148}
3149
3150
3151/**
3152 * htmlParseReference:
3153 * @ctxt: an HTML parser context
3154 *
3155 * parse and handle entity references in content,
3156 * this will end-up in a call to character() since this is either a
3157 * CharRef, or a predefined entity.
3158 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003159static void
Owen Taylor3473f882001-02-23 17:55:21 +00003160htmlParseReference(htmlParserCtxtPtr ctxt) {
3161 htmlEntityDescPtr ent;
3162 xmlChar out[6];
3163 xmlChar *name;
3164 if (CUR != '&') return;
3165
3166 if (NXT(1) == '#') {
3167 unsigned int c;
3168 int bits, i = 0;
3169
3170 c = htmlParseCharRef(ctxt);
3171 if (c == 0)
3172 return;
3173
3174 if (c < 0x80) { out[i++]= c; bits= -6; }
3175 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3176 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3177 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3178
3179 for ( ; bits >= 0; bits-= 6) {
3180 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3181 }
3182 out[i] = 0;
3183
3184 htmlCheckParagraph(ctxt);
3185 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3186 ctxt->sax->characters(ctxt->userData, out, i);
3187 } else {
3188 ent = htmlParseEntityRef(ctxt, &name);
3189 if (name == NULL) {
3190 htmlCheckParagraph(ctxt);
3191 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3192 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3193 return;
3194 }
3195 if ((ent == NULL) || (ent->value <= 0)) {
3196 htmlCheckParagraph(ctxt);
3197 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3198 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3199 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3200 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3201 }
3202 } else {
3203 unsigned int c;
3204 int bits, i = 0;
3205
3206 c = ent->value;
3207 if (c < 0x80)
3208 { out[i++]= c; bits= -6; }
3209 else if (c < 0x800)
3210 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3211 else if (c < 0x10000)
3212 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3213 else
3214 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3215
3216 for ( ; bits >= 0; bits-= 6) {
3217 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3218 }
3219 out[i] = 0;
3220
3221 htmlCheckParagraph(ctxt);
3222 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3223 ctxt->sax->characters(ctxt->userData, out, i);
3224 }
3225 xmlFree(name);
3226 }
3227}
3228
3229/**
3230 * htmlParseContent:
3231 * @ctxt: an HTML parser context
3232 * @name: the node name
3233 *
3234 * Parse a content: comment, sub-element, reference or text.
3235 *
3236 */
3237
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003238static void
Owen Taylor3473f882001-02-23 17:55:21 +00003239htmlParseContent(htmlParserCtxtPtr ctxt) {
3240 xmlChar *currentNode;
3241 int depth;
3242
3243 currentNode = xmlStrdup(ctxt->name);
3244 depth = ctxt->nameNr;
3245 while (1) {
3246 long cons = ctxt->nbChars;
3247
3248 GROW;
3249 /*
3250 * Our tag or one of it's parent or children is ending.
3251 */
3252 if ((CUR == '<') && (NXT(1) == '/')) {
3253 htmlParseEndTag(ctxt);
3254 if (currentNode != NULL) xmlFree(currentNode);
3255 return;
3256 }
3257
3258 /*
3259 * Has this node been popped out during parsing of
3260 * the next element
3261 */
3262 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
3263 (depth >= ctxt->nameNr)) {
3264 if (currentNode != NULL) xmlFree(currentNode);
3265 return;
3266 }
3267
Daniel Veillardf9533d12001-03-03 10:04:57 +00003268 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3269 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003270 /*
3271 * Handle SCRIPT/STYLE separately
3272 */
3273 htmlParseScript(ctxt);
3274 } else {
3275 /*
3276 * Sometimes DOCTYPE arrives in the middle of the document
3277 */
3278 if ((CUR == '<') && (NXT(1) == '!') &&
3279 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3280 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3281 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3282 (UPP(8) == 'E')) {
3283 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3284 ctxt->sax->error(ctxt->userData,
3285 "Misplaced DOCTYPE declaration\n");
3286 ctxt->wellFormed = 0;
3287 htmlParseDocTypeDecl(ctxt);
3288 }
3289
3290 /*
3291 * First case : a comment
3292 */
3293 if ((CUR == '<') && (NXT(1) == '!') &&
3294 (NXT(2) == '-') && (NXT(3) == '-')) {
3295 htmlParseComment(ctxt);
3296 }
3297
3298 /*
3299 * Second case : a sub-element.
3300 */
3301 else if (CUR == '<') {
3302 htmlParseElement(ctxt);
3303 }
3304
3305 /*
3306 * Third case : a reference. If if has not been resolved,
3307 * parsing returns it's Name, create the node
3308 */
3309 else if (CUR == '&') {
3310 htmlParseReference(ctxt);
3311 }
3312
3313 /*
3314 * Fourth : end of the resource
3315 */
3316 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003317 htmlAutoCloseOnEnd(ctxt);
3318 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003319 }
3320
3321 /*
3322 * Last case, text. Note that References are handled directly.
3323 */
3324 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003325 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003326 }
3327
3328 if (cons == ctxt->nbChars) {
3329 if (ctxt->node != NULL) {
3330 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3331 ctxt->sax->error(ctxt->userData,
3332 "detected an error in element content\n");
3333 ctxt->wellFormed = 0;
3334 }
3335 break;
3336 }
3337 }
3338 GROW;
3339 }
3340 if (currentNode != NULL) xmlFree(currentNode);
3341}
3342
3343/**
3344 * htmlParseElement:
3345 * @ctxt: an HTML parser context
3346 *
3347 * parse an HTML element, this is highly recursive
3348 *
3349 * [39] element ::= EmptyElemTag | STag content ETag
3350 *
3351 * [41] Attribute ::= Name Eq AttValue
3352 */
3353
3354void
3355htmlParseElement(htmlParserCtxtPtr ctxt) {
3356 xmlChar *name;
3357 xmlChar *currentNode = NULL;
3358 htmlElemDescPtr info;
3359 htmlParserNodeInfo node_info;
3360 xmlChar *oldname;
3361 int depth = ctxt->nameNr;
3362
3363 /* Capture start position */
3364 if (ctxt->record_info) {
3365 node_info.begin_pos = ctxt->input->consumed +
3366 (CUR_PTR - ctxt->input->base);
3367 node_info.begin_line = ctxt->input->line;
3368 }
3369
3370 oldname = xmlStrdup(ctxt->name);
3371 htmlParseStartTag(ctxt);
3372 name = ctxt->name;
3373#ifdef DEBUG
3374 if (oldname == NULL)
3375 xmlGenericError(xmlGenericErrorContext,
3376 "Start of element %s\n", name);
3377 else if (name == NULL)
3378 xmlGenericError(xmlGenericErrorContext,
3379 "Start of element failed, was %s\n", oldname);
3380 else
3381 xmlGenericError(xmlGenericErrorContext,
3382 "Start of element %s, was %s\n", name, oldname);
3383#endif
3384 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3385 (name == NULL)) {
3386 if (CUR == '>')
3387 NEXT;
3388 if (oldname != NULL)
3389 xmlFree(oldname);
3390 return;
3391 }
3392 if (oldname != NULL)
3393 xmlFree(oldname);
3394
3395 /*
3396 * Lookup the info for that element.
3397 */
3398 info = htmlTagLookup(name);
3399 if (info == NULL) {
3400 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3401 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3402 name);
3403 ctxt->wellFormed = 0;
3404 } else if (info->depr) {
3405/***************************
3406 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3407 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3408 name);
3409 ***************************/
3410 }
3411
3412 /*
3413 * Check for an Empty Element labelled the XML/SGML way
3414 */
3415 if ((CUR == '/') && (NXT(1) == '>')) {
3416 SKIP(2);
3417 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3418 ctxt->sax->endElement(ctxt->userData, name);
3419 oldname = htmlnamePop(ctxt);
3420#ifdef DEBUG
3421 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3422#endif
3423 if (oldname != NULL)
3424 xmlFree(oldname);
3425 return;
3426 }
3427
3428 if (CUR == '>') {
3429 NEXT;
3430 } else {
3431 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3432 ctxt->sax->error(ctxt->userData,
3433 "Couldn't find end of Start Tag %s\n",
3434 name);
3435 ctxt->wellFormed = 0;
3436
3437 /*
3438 * end of parsing of this node.
3439 */
3440 if (xmlStrEqual(name, ctxt->name)) {
3441 nodePop(ctxt);
3442 oldname = htmlnamePop(ctxt);
3443#ifdef DEBUG
3444 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3445#endif
3446 if (oldname != NULL)
3447 xmlFree(oldname);
3448 }
3449
3450 /*
3451 * Capture end position and add node
3452 */
3453 if ( currentNode != NULL && ctxt->record_info ) {
3454 node_info.end_pos = ctxt->input->consumed +
3455 (CUR_PTR - ctxt->input->base);
3456 node_info.end_line = ctxt->input->line;
3457 node_info.node = ctxt->node;
3458 xmlParserAddNodeInfo(ctxt, &node_info);
3459 }
3460 return;
3461 }
3462
3463 /*
3464 * Check for an Empty Element from DTD definition
3465 */
3466 if ((info != NULL) && (info->empty)) {
3467 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3468 ctxt->sax->endElement(ctxt->userData, name);
3469 oldname = htmlnamePop(ctxt);
3470#ifdef DEBUG
3471 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3472#endif
3473 if (oldname != NULL)
3474 xmlFree(oldname);
3475 return;
3476 }
3477
3478 /*
3479 * Parse the content of the element:
3480 */
3481 currentNode = xmlStrdup(ctxt->name);
3482 depth = ctxt->nameNr;
3483 while (IS_CHAR(CUR)) {
3484 htmlParseContent(ctxt);
3485 if (ctxt->nameNr < depth) break;
3486 }
3487
Owen Taylor3473f882001-02-23 17:55:21 +00003488 /*
3489 * Capture end position and add node
3490 */
3491 if ( currentNode != NULL && ctxt->record_info ) {
3492 node_info.end_pos = ctxt->input->consumed +
3493 (CUR_PTR - ctxt->input->base);
3494 node_info.end_line = ctxt->input->line;
3495 node_info.node = ctxt->node;
3496 xmlParserAddNodeInfo(ctxt, &node_info);
3497 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003498 if (!IS_CHAR(CUR)) {
3499 htmlAutoCloseOnEnd(ctxt);
3500 }
3501
Owen Taylor3473f882001-02-23 17:55:21 +00003502 if (currentNode != NULL)
3503 xmlFree(currentNode);
3504}
3505
3506/**
3507 * htmlParseDocument :
3508 * @ctxt: an HTML parser context
3509 *
3510 * parse an HTML document (and build a tree if using the standard SAX
3511 * interface).
3512 *
3513 * Returns 0, -1 in case of error. the parser context is augmented
3514 * as a result of the parsing.
3515 */
3516
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003517static int
Owen Taylor3473f882001-02-23 17:55:21 +00003518htmlParseDocument(htmlParserCtxtPtr ctxt) {
3519 xmlDtdPtr dtd;
3520
3521 htmlDefaultSAXHandlerInit();
3522 ctxt->html = 1;
3523
3524 GROW;
3525 /*
3526 * SAX: beginning of the document processing.
3527 */
3528 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3529 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3530
3531 /*
3532 * Wipe out everything which is before the first '<'
3533 */
3534 SKIP_BLANKS;
3535 if (CUR == 0) {
3536 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3537 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3538 ctxt->wellFormed = 0;
3539 }
3540
3541 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3542 ctxt->sax->startDocument(ctxt->userData);
3543
3544
3545 /*
3546 * Parse possible comments before any content
3547 */
3548 while ((CUR == '<') && (NXT(1) == '!') &&
3549 (NXT(2) == '-') && (NXT(3) == '-')) {
3550 htmlParseComment(ctxt);
3551 SKIP_BLANKS;
3552 }
3553
3554
3555 /*
3556 * Then possibly doc type declaration(s) and more Misc
3557 * (doctypedecl Misc*)?
3558 */
3559 if ((CUR == '<') && (NXT(1) == '!') &&
3560 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3561 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3562 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3563 (UPP(8) == 'E')) {
3564 htmlParseDocTypeDecl(ctxt);
3565 }
3566 SKIP_BLANKS;
3567
3568 /*
3569 * Parse possible comments before any content
3570 */
3571 while ((CUR == '<') && (NXT(1) == '!') &&
3572 (NXT(2) == '-') && (NXT(3) == '-')) {
3573 htmlParseComment(ctxt);
3574 SKIP_BLANKS;
3575 }
3576
3577 /*
3578 * Time to start parsing the tree itself
3579 */
3580 htmlParseContent(ctxt);
3581
3582 /*
3583 * autoclose
3584 */
3585 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003586 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003587
3588
3589 /*
3590 * SAX: end of the document processing.
3591 */
3592 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3593 ctxt->sax->endDocument(ctxt->userData);
3594
3595 if (ctxt->myDoc != NULL) {
3596 dtd = xmlGetIntSubset(ctxt->myDoc);
3597 if (dtd == NULL)
3598 ctxt->myDoc->intSubset =
3599 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3600 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3601 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3602 }
3603 if (! ctxt->wellFormed) return(-1);
3604 return(0);
3605}
3606
3607
3608/************************************************************************
3609 * *
3610 * Parser contexts handling *
3611 * *
3612 ************************************************************************/
3613
3614/**
3615 * xmlInitParserCtxt:
3616 * @ctxt: an HTML parser context
3617 *
3618 * Initialize a parser context
3619 */
3620
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003621static void
Owen Taylor3473f882001-02-23 17:55:21 +00003622htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3623{
3624 htmlSAXHandler *sax;
3625
3626 if (ctxt == NULL) return;
3627 memset(ctxt, 0, sizeof(htmlParserCtxt));
3628
3629 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3630 if (sax == NULL) {
3631 xmlGenericError(xmlGenericErrorContext,
3632 "htmlInitParserCtxt: out of memory\n");
3633 }
3634 else
3635 memset(sax, 0, sizeof(htmlSAXHandler));
3636
3637 /* Allocate the Input stack */
3638 ctxt->inputTab = (htmlParserInputPtr *)
3639 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3640 if (ctxt->inputTab == NULL) {
3641 xmlGenericError(xmlGenericErrorContext,
3642 "htmlInitParserCtxt: out of memory\n");
3643 ctxt->inputNr = 0;
3644 ctxt->inputMax = 0;
3645 ctxt->input = NULL;
3646 return;
3647 }
3648 ctxt->inputNr = 0;
3649 ctxt->inputMax = 5;
3650 ctxt->input = NULL;
3651 ctxt->version = NULL;
3652 ctxt->encoding = NULL;
3653 ctxt->standalone = -1;
3654 ctxt->instate = XML_PARSER_START;
3655
3656 /* Allocate the Node stack */
3657 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3658 if (ctxt->nodeTab == NULL) {
3659 xmlGenericError(xmlGenericErrorContext,
3660 "htmlInitParserCtxt: out of memory\n");
3661 ctxt->nodeNr = 0;
3662 ctxt->nodeMax = 0;
3663 ctxt->node = NULL;
3664 ctxt->inputNr = 0;
3665 ctxt->inputMax = 0;
3666 ctxt->input = NULL;
3667 return;
3668 }
3669 ctxt->nodeNr = 0;
3670 ctxt->nodeMax = 10;
3671 ctxt->node = NULL;
3672
3673 /* Allocate the Name stack */
3674 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3675 if (ctxt->nameTab == NULL) {
3676 xmlGenericError(xmlGenericErrorContext,
3677 "htmlInitParserCtxt: out of memory\n");
3678 ctxt->nameNr = 0;
3679 ctxt->nameMax = 10;
3680 ctxt->name = NULL;
3681 ctxt->nodeNr = 0;
3682 ctxt->nodeMax = 0;
3683 ctxt->node = NULL;
3684 ctxt->inputNr = 0;
3685 ctxt->inputMax = 0;
3686 ctxt->input = NULL;
3687 return;
3688 }
3689 ctxt->nameNr = 0;
3690 ctxt->nameMax = 10;
3691 ctxt->name = NULL;
3692
3693 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3694 else {
3695 ctxt->sax = sax;
3696 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3697 }
3698 ctxt->userData = ctxt;
3699 ctxt->myDoc = NULL;
3700 ctxt->wellFormed = 1;
3701 ctxt->replaceEntities = 0;
3702 ctxt->html = 1;
3703 ctxt->record_info = 0;
3704 ctxt->validate = 0;
3705 ctxt->nbChars = 0;
3706 ctxt->checkIndex = 0;
3707 xmlInitNodeInfoSeq(&ctxt->node_seq);
3708}
3709
3710/**
3711 * htmlFreeParserCtxt:
3712 * @ctxt: an HTML parser context
3713 *
3714 * Free all the memory used by a parser context. However the parsed
3715 * document in ctxt->myDoc is not freed.
3716 */
3717
3718void
3719htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3720{
3721 xmlFreeParserCtxt(ctxt);
3722}
3723
3724/**
3725 * htmlCreateDocParserCtxt :
3726 * @cur: a pointer to an array of xmlChar
3727 * @encoding: a free form C string describing the HTML document encoding, or NULL
3728 *
3729 * Create a parser context for an HTML document.
3730 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003731 * TODO: check the need to add encoding handling there
3732 *
Owen Taylor3473f882001-02-23 17:55:21 +00003733 * Returns the new parser context or NULL
3734 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003735static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003736htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003737 htmlParserCtxtPtr ctxt;
3738 htmlParserInputPtr input;
3739 /* htmlCharEncoding enc; */
3740
3741 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3742 if (ctxt == NULL) {
3743 perror("malloc");
3744 return(NULL);
3745 }
3746 htmlInitParserCtxt(ctxt);
3747 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3748 if (input == NULL) {
3749 perror("malloc");
3750 xmlFree(ctxt);
3751 return(NULL);
3752 }
3753 memset(input, 0, sizeof(htmlParserInput));
3754
3755 input->line = 1;
3756 input->col = 1;
3757 input->base = cur;
3758 input->cur = cur;
3759
3760 inputPush(ctxt, input);
3761 return(ctxt);
3762}
3763
3764/************************************************************************
3765 * *
3766 * Progressive parsing interfaces *
3767 * *
3768 ************************************************************************/
3769
3770/**
3771 * htmlParseLookupSequence:
3772 * @ctxt: an HTML parser context
3773 * @first: the first char to lookup
3774 * @next: the next char to lookup or zero
3775 * @third: the next char to lookup or zero
3776 *
3777 * Try to find if a sequence (first, next, third) or just (first next) or
3778 * (first) is available in the input stream.
3779 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3780 * to avoid rescanning sequences of bytes, it DOES change the state of the
3781 * parser, do not use liberally.
3782 * This is basically similar to xmlParseLookupSequence()
3783 *
3784 * Returns the index to the current parsing point if the full sequence
3785 * is available, -1 otherwise.
3786 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003787static int
Owen Taylor3473f882001-02-23 17:55:21 +00003788htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3789 xmlChar next, xmlChar third) {
3790 int base, len;
3791 htmlParserInputPtr in;
3792 const xmlChar *buf;
3793
3794 in = ctxt->input;
3795 if (in == NULL) return(-1);
3796 base = in->cur - in->base;
3797 if (base < 0) return(-1);
3798 if (ctxt->checkIndex > base)
3799 base = ctxt->checkIndex;
3800 if (in->buf == NULL) {
3801 buf = in->base;
3802 len = in->length;
3803 } else {
3804 buf = in->buf->buffer->content;
3805 len = in->buf->buffer->use;
3806 }
3807 /* take into account the sequence length */
3808 if (third) len -= 2;
3809 else if (next) len --;
3810 for (;base < len;base++) {
3811 if (buf[base] == first) {
3812 if (third != 0) {
3813 if ((buf[base + 1] != next) ||
3814 (buf[base + 2] != third)) continue;
3815 } else if (next != 0) {
3816 if (buf[base + 1] != next) continue;
3817 }
3818 ctxt->checkIndex = 0;
3819#ifdef DEBUG_PUSH
3820 if (next == 0)
3821 xmlGenericError(xmlGenericErrorContext,
3822 "HPP: lookup '%c' found at %d\n",
3823 first, base);
3824 else if (third == 0)
3825 xmlGenericError(xmlGenericErrorContext,
3826 "HPP: lookup '%c%c' found at %d\n",
3827 first, next, base);
3828 else
3829 xmlGenericError(xmlGenericErrorContext,
3830 "HPP: lookup '%c%c%c' found at %d\n",
3831 first, next, third, base);
3832#endif
3833 return(base - (in->cur - in->base));
3834 }
3835 }
3836 ctxt->checkIndex = base;
3837#ifdef DEBUG_PUSH
3838 if (next == 0)
3839 xmlGenericError(xmlGenericErrorContext,
3840 "HPP: lookup '%c' failed\n", first);
3841 else if (third == 0)
3842 xmlGenericError(xmlGenericErrorContext,
3843 "HPP: lookup '%c%c' failed\n", first, next);
3844 else
3845 xmlGenericError(xmlGenericErrorContext,
3846 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3847#endif
3848 return(-1);
3849}
3850
3851/**
3852 * htmlParseTryOrFinish:
3853 * @ctxt: an HTML parser context
3854 * @terminate: last chunk indicator
3855 *
3856 * Try to progress on parsing
3857 *
3858 * Returns zero if no parsing was possible
3859 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003860static int
Owen Taylor3473f882001-02-23 17:55:21 +00003861htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3862 int ret = 0;
3863 htmlParserInputPtr in;
3864 int avail = 0;
3865 xmlChar cur, next;
3866
3867#ifdef DEBUG_PUSH
3868 switch (ctxt->instate) {
3869 case XML_PARSER_EOF:
3870 xmlGenericError(xmlGenericErrorContext,
3871 "HPP: try EOF\n"); break;
3872 case XML_PARSER_START:
3873 xmlGenericError(xmlGenericErrorContext,
3874 "HPP: try START\n"); break;
3875 case XML_PARSER_MISC:
3876 xmlGenericError(xmlGenericErrorContext,
3877 "HPP: try MISC\n");break;
3878 case XML_PARSER_COMMENT:
3879 xmlGenericError(xmlGenericErrorContext,
3880 "HPP: try COMMENT\n");break;
3881 case XML_PARSER_PROLOG:
3882 xmlGenericError(xmlGenericErrorContext,
3883 "HPP: try PROLOG\n");break;
3884 case XML_PARSER_START_TAG:
3885 xmlGenericError(xmlGenericErrorContext,
3886 "HPP: try START_TAG\n");break;
3887 case XML_PARSER_CONTENT:
3888 xmlGenericError(xmlGenericErrorContext,
3889 "HPP: try CONTENT\n");break;
3890 case XML_PARSER_CDATA_SECTION:
3891 xmlGenericError(xmlGenericErrorContext,
3892 "HPP: try CDATA_SECTION\n");break;
3893 case XML_PARSER_END_TAG:
3894 xmlGenericError(xmlGenericErrorContext,
3895 "HPP: try END_TAG\n");break;
3896 case XML_PARSER_ENTITY_DECL:
3897 xmlGenericError(xmlGenericErrorContext,
3898 "HPP: try ENTITY_DECL\n");break;
3899 case XML_PARSER_ENTITY_VALUE:
3900 xmlGenericError(xmlGenericErrorContext,
3901 "HPP: try ENTITY_VALUE\n");break;
3902 case XML_PARSER_ATTRIBUTE_VALUE:
3903 xmlGenericError(xmlGenericErrorContext,
3904 "HPP: try ATTRIBUTE_VALUE\n");break;
3905 case XML_PARSER_DTD:
3906 xmlGenericError(xmlGenericErrorContext,
3907 "HPP: try DTD\n");break;
3908 case XML_PARSER_EPILOG:
3909 xmlGenericError(xmlGenericErrorContext,
3910 "HPP: try EPILOG\n");break;
3911 case XML_PARSER_PI:
3912 xmlGenericError(xmlGenericErrorContext,
3913 "HPP: try PI\n");break;
3914 case XML_PARSER_SYSTEM_LITERAL:
3915 xmlGenericError(xmlGenericErrorContext,
3916 "HPP: try SYSTEM_LITERAL\n");break;
3917 }
3918#endif
3919
3920 while (1) {
3921
3922 in = ctxt->input;
3923 if (in == NULL) break;
3924 if (in->buf == NULL)
3925 avail = in->length - (in->cur - in->base);
3926 else
3927 avail = in->buf->buffer->use - (in->cur - in->base);
3928 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003929 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003930 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3931 /*
3932 * SAX: end of the document processing.
3933 */
3934 ctxt->instate = XML_PARSER_EOF;
3935 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3936 ctxt->sax->endDocument(ctxt->userData);
3937 }
3938 }
3939 if (avail < 1)
3940 goto done;
3941 switch (ctxt->instate) {
3942 case XML_PARSER_EOF:
3943 /*
3944 * Document parsing is done !
3945 */
3946 goto done;
3947 case XML_PARSER_START:
3948 /*
3949 * Very first chars read from the document flow.
3950 */
3951 cur = in->cur[0];
3952 if (IS_BLANK(cur)) {
3953 SKIP_BLANKS;
3954 if (in->buf == NULL)
3955 avail = in->length - (in->cur - in->base);
3956 else
3957 avail = in->buf->buffer->use - (in->cur - in->base);
3958 }
3959 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3960 ctxt->sax->setDocumentLocator(ctxt->userData,
3961 &xmlDefaultSAXLocator);
3962 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3963 (!ctxt->disableSAX))
3964 ctxt->sax->startDocument(ctxt->userData);
3965
3966 cur = in->cur[0];
3967 next = in->cur[1];
3968 if ((cur == '<') && (next == '!') &&
3969 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3970 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3971 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3972 (UPP(8) == 'E')) {
3973 if ((!terminate) &&
3974 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
3975 goto done;
3976#ifdef DEBUG_PUSH
3977 xmlGenericError(xmlGenericErrorContext,
3978 "HPP: Parsing internal subset\n");
3979#endif
3980 htmlParseDocTypeDecl(ctxt);
3981 ctxt->instate = XML_PARSER_PROLOG;
3982#ifdef DEBUG_PUSH
3983 xmlGenericError(xmlGenericErrorContext,
3984 "HPP: entering PROLOG\n");
3985#endif
3986 } else {
3987 ctxt->instate = XML_PARSER_MISC;
3988 }
3989#ifdef DEBUG_PUSH
3990 xmlGenericError(xmlGenericErrorContext,
3991 "HPP: entering MISC\n");
3992#endif
3993 break;
3994 case XML_PARSER_MISC:
3995 SKIP_BLANKS;
3996 if (in->buf == NULL)
3997 avail = in->length - (in->cur - in->base);
3998 else
3999 avail = in->buf->buffer->use - (in->cur - in->base);
4000 if (avail < 2)
4001 goto done;
4002 cur = in->cur[0];
4003 next = in->cur[1];
4004 if ((cur == '<') && (next == '!') &&
4005 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4006 if ((!terminate) &&
4007 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4008 goto done;
4009#ifdef DEBUG_PUSH
4010 xmlGenericError(xmlGenericErrorContext,
4011 "HPP: Parsing Comment\n");
4012#endif
4013 htmlParseComment(ctxt);
4014 ctxt->instate = XML_PARSER_MISC;
4015 } else if ((cur == '<') && (next == '!') &&
4016 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4017 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4018 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4019 (UPP(8) == 'E')) {
4020 if ((!terminate) &&
4021 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4022 goto done;
4023#ifdef DEBUG_PUSH
4024 xmlGenericError(xmlGenericErrorContext,
4025 "HPP: Parsing internal subset\n");
4026#endif
4027 htmlParseDocTypeDecl(ctxt);
4028 ctxt->instate = XML_PARSER_PROLOG;
4029#ifdef DEBUG_PUSH
4030 xmlGenericError(xmlGenericErrorContext,
4031 "HPP: entering PROLOG\n");
4032#endif
4033 } else if ((cur == '<') && (next == '!') &&
4034 (avail < 9)) {
4035 goto done;
4036 } else {
4037 ctxt->instate = XML_PARSER_START_TAG;
4038#ifdef DEBUG_PUSH
4039 xmlGenericError(xmlGenericErrorContext,
4040 "HPP: entering START_TAG\n");
4041#endif
4042 }
4043 break;
4044 case XML_PARSER_PROLOG:
4045 SKIP_BLANKS;
4046 if (in->buf == NULL)
4047 avail = in->length - (in->cur - in->base);
4048 else
4049 avail = in->buf->buffer->use - (in->cur - in->base);
4050 if (avail < 2)
4051 goto done;
4052 cur = in->cur[0];
4053 next = in->cur[1];
4054 if ((cur == '<') && (next == '!') &&
4055 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4056 if ((!terminate) &&
4057 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4058 goto done;
4059#ifdef DEBUG_PUSH
4060 xmlGenericError(xmlGenericErrorContext,
4061 "HPP: Parsing Comment\n");
4062#endif
4063 htmlParseComment(ctxt);
4064 ctxt->instate = XML_PARSER_PROLOG;
4065 } else if ((cur == '<') && (next == '!') &&
4066 (avail < 4)) {
4067 goto done;
4068 } else {
4069 ctxt->instate = XML_PARSER_START_TAG;
4070#ifdef DEBUG_PUSH
4071 xmlGenericError(xmlGenericErrorContext,
4072 "HPP: entering START_TAG\n");
4073#endif
4074 }
4075 break;
4076 case XML_PARSER_EPILOG:
4077 if (in->buf == NULL)
4078 avail = in->length - (in->cur - in->base);
4079 else
4080 avail = in->buf->buffer->use - (in->cur - in->base);
4081 if (avail < 1)
4082 goto done;
4083 cur = in->cur[0];
4084 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004085 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004086 goto done;
4087 }
4088 if (avail < 2)
4089 goto done;
4090 next = in->cur[1];
4091 if ((cur == '<') && (next == '!') &&
4092 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4093 if ((!terminate) &&
4094 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4095 goto done;
4096#ifdef DEBUG_PUSH
4097 xmlGenericError(xmlGenericErrorContext,
4098 "HPP: Parsing Comment\n");
4099#endif
4100 htmlParseComment(ctxt);
4101 ctxt->instate = XML_PARSER_EPILOG;
4102 } else if ((cur == '<') && (next == '!') &&
4103 (avail < 4)) {
4104 goto done;
4105 } else {
4106 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004107 ctxt->wellFormed = 0;
4108 ctxt->instate = XML_PARSER_EOF;
4109#ifdef DEBUG_PUSH
4110 xmlGenericError(xmlGenericErrorContext,
4111 "HPP: entering EOF\n");
4112#endif
4113 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4114 ctxt->sax->endDocument(ctxt->userData);
4115 goto done;
4116 }
4117 break;
4118 case XML_PARSER_START_TAG: {
4119 xmlChar *name, *oldname;
4120 int depth = ctxt->nameNr;
4121 htmlElemDescPtr info;
4122
4123 if (avail < 2)
4124 goto done;
4125 cur = in->cur[0];
4126 if (cur != '<') {
4127 ctxt->instate = XML_PARSER_CONTENT;
4128#ifdef DEBUG_PUSH
4129 xmlGenericError(xmlGenericErrorContext,
4130 "HPP: entering CONTENT\n");
4131#endif
4132 break;
4133 }
4134 if ((!terminate) &&
4135 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4136 goto done;
4137
4138 oldname = xmlStrdup(ctxt->name);
4139 htmlParseStartTag(ctxt);
4140 name = ctxt->name;
4141#ifdef DEBUG
4142 if (oldname == NULL)
4143 xmlGenericError(xmlGenericErrorContext,
4144 "Start of element %s\n", name);
4145 else if (name == NULL)
4146 xmlGenericError(xmlGenericErrorContext,
4147 "Start of element failed, was %s\n",
4148 oldname);
4149 else
4150 xmlGenericError(xmlGenericErrorContext,
4151 "Start of element %s, was %s\n",
4152 name, oldname);
4153#endif
4154 if (((depth == ctxt->nameNr) &&
4155 (xmlStrEqual(oldname, ctxt->name))) ||
4156 (name == NULL)) {
4157 if (CUR == '>')
4158 NEXT;
4159 if (oldname != NULL)
4160 xmlFree(oldname);
4161 break;
4162 }
4163 if (oldname != NULL)
4164 xmlFree(oldname);
4165
4166 /*
4167 * Lookup the info for that element.
4168 */
4169 info = htmlTagLookup(name);
4170 if (info == NULL) {
4171 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4172 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4173 name);
4174 ctxt->wellFormed = 0;
4175 } else if (info->depr) {
4176 /***************************
4177 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4178 ctxt->sax->warning(ctxt->userData,
4179 "Tag %s is deprecated\n",
4180 name);
4181 ***************************/
4182 }
4183
4184 /*
4185 * Check for an Empty Element labelled the XML/SGML way
4186 */
4187 if ((CUR == '/') && (NXT(1) == '>')) {
4188 SKIP(2);
4189 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4190 ctxt->sax->endElement(ctxt->userData, name);
4191 oldname = htmlnamePop(ctxt);
4192#ifdef DEBUG
4193 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4194 oldname);
4195#endif
4196 if (oldname != NULL)
4197 xmlFree(oldname);
4198 ctxt->instate = XML_PARSER_CONTENT;
4199#ifdef DEBUG_PUSH
4200 xmlGenericError(xmlGenericErrorContext,
4201 "HPP: entering CONTENT\n");
4202#endif
4203 break;
4204 }
4205
4206 if (CUR == '>') {
4207 NEXT;
4208 } else {
4209 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4210 ctxt->sax->error(ctxt->userData,
4211 "Couldn't find end of Start Tag %s\n",
4212 name);
4213 ctxt->wellFormed = 0;
4214
4215 /*
4216 * end of parsing of this node.
4217 */
4218 if (xmlStrEqual(name, ctxt->name)) {
4219 nodePop(ctxt);
4220 oldname = htmlnamePop(ctxt);
4221#ifdef DEBUG
4222 xmlGenericError(xmlGenericErrorContext,
4223 "End of start tag problem: popping out %s\n", oldname);
4224#endif
4225 if (oldname != NULL)
4226 xmlFree(oldname);
4227 }
4228
4229 ctxt->instate = XML_PARSER_CONTENT;
4230#ifdef DEBUG_PUSH
4231 xmlGenericError(xmlGenericErrorContext,
4232 "HPP: entering CONTENT\n");
4233#endif
4234 break;
4235 }
4236
4237 /*
4238 * Check for an Empty Element from DTD definition
4239 */
4240 if ((info != NULL) && (info->empty)) {
4241 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4242 ctxt->sax->endElement(ctxt->userData, name);
4243 oldname = htmlnamePop(ctxt);
4244#ifdef DEBUG
4245 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4246#endif
4247 if (oldname != NULL)
4248 xmlFree(oldname);
4249 }
4250 ctxt->instate = XML_PARSER_CONTENT;
4251#ifdef DEBUG_PUSH
4252 xmlGenericError(xmlGenericErrorContext,
4253 "HPP: entering CONTENT\n");
4254#endif
4255 break;
4256 }
4257 case XML_PARSER_CONTENT: {
4258 long cons;
4259 /*
4260 * Handle preparsed entities and charRef
4261 */
4262 if (ctxt->token != 0) {
4263 xmlChar chr[2] = { 0 , 0 } ;
4264
4265 chr[0] = (xmlChar) ctxt->token;
4266 htmlCheckParagraph(ctxt);
4267 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4268 ctxt->sax->characters(ctxt->userData, chr, 1);
4269 ctxt->token = 0;
4270 ctxt->checkIndex = 0;
4271 }
4272 if ((avail == 1) && (terminate)) {
4273 cur = in->cur[0];
4274 if ((cur != '<') && (cur != '&')) {
4275 if (ctxt->sax != NULL) {
4276 if (IS_BLANK(cur)) {
4277 if (ctxt->sax->ignorableWhitespace != NULL)
4278 ctxt->sax->ignorableWhitespace(
4279 ctxt->userData, &cur, 1);
4280 } else {
4281 htmlCheckParagraph(ctxt);
4282 if (ctxt->sax->characters != NULL)
4283 ctxt->sax->characters(
4284 ctxt->userData, &cur, 1);
4285 }
4286 }
4287 ctxt->token = 0;
4288 ctxt->checkIndex = 0;
4289 NEXT;
4290 }
4291 break;
4292 }
4293 if (avail < 2)
4294 goto done;
4295 cur = in->cur[0];
4296 next = in->cur[1];
4297 cons = ctxt->nbChars;
4298 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4299 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4300 /*
4301 * Handle SCRIPT/STYLE separately
4302 */
4303 if ((!terminate) &&
4304 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4305 goto done;
4306 htmlParseScript(ctxt);
4307 if ((cur == '<') && (next == '/')) {
4308 ctxt->instate = XML_PARSER_END_TAG;
4309 ctxt->checkIndex = 0;
4310#ifdef DEBUG_PUSH
4311 xmlGenericError(xmlGenericErrorContext,
4312 "HPP: entering END_TAG\n");
4313#endif
4314 break;
4315 }
4316 } else {
4317 /*
4318 * Sometimes DOCTYPE arrives in the middle of the document
4319 */
4320 if ((cur == '<') && (next == '!') &&
4321 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4322 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4323 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4324 (UPP(8) == 'E')) {
4325 if ((!terminate) &&
4326 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4327 goto done;
4328 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4329 ctxt->sax->error(ctxt->userData,
4330 "Misplaced DOCTYPE declaration\n");
4331 ctxt->wellFormed = 0;
4332 htmlParseDocTypeDecl(ctxt);
4333 } else if ((cur == '<') && (next == '!') &&
4334 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4335 if ((!terminate) &&
4336 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4337 goto done;
4338#ifdef DEBUG_PUSH
4339 xmlGenericError(xmlGenericErrorContext,
4340 "HPP: Parsing Comment\n");
4341#endif
4342 htmlParseComment(ctxt);
4343 ctxt->instate = XML_PARSER_CONTENT;
4344 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4345 goto done;
4346 } else if ((cur == '<') && (next == '/')) {
4347 ctxt->instate = XML_PARSER_END_TAG;
4348 ctxt->checkIndex = 0;
4349#ifdef DEBUG_PUSH
4350 xmlGenericError(xmlGenericErrorContext,
4351 "HPP: entering END_TAG\n");
4352#endif
4353 break;
4354 } else if (cur == '<') {
4355 ctxt->instate = XML_PARSER_START_TAG;
4356 ctxt->checkIndex = 0;
4357#ifdef DEBUG_PUSH
4358 xmlGenericError(xmlGenericErrorContext,
4359 "HPP: entering START_TAG\n");
4360#endif
4361 break;
4362 } else if (cur == '&') {
4363 if ((!terminate) &&
4364 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4365 goto done;
4366#ifdef DEBUG_PUSH
4367 xmlGenericError(xmlGenericErrorContext,
4368 "HPP: Parsing Reference\n");
4369#endif
4370 /* TODO: check generation of subtrees if noent !!! */
4371 htmlParseReference(ctxt);
4372 } else {
4373 /* TODO Avoid the extra copy, handle directly !!!!!! */
4374 /*
4375 * Goal of the following test is :
4376 * - minimize calls to the SAX 'character' callback
4377 * when they are mergeable
4378 */
4379 if ((ctxt->inputNr == 1) &&
4380 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4381 if ((!terminate) &&
4382 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4383 goto done;
4384 }
4385 ctxt->checkIndex = 0;
4386#ifdef DEBUG_PUSH
4387 xmlGenericError(xmlGenericErrorContext,
4388 "HPP: Parsing char data\n");
4389#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004390 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004391 }
4392 }
4393 if (cons == ctxt->nbChars) {
4394 if (ctxt->node != NULL) {
4395 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4396 ctxt->sax->error(ctxt->userData,
4397 "detected an error in element content\n");
4398 ctxt->wellFormed = 0;
4399 }
4400 NEXT;
4401 break;
4402 }
4403
4404 break;
4405 }
4406 case XML_PARSER_END_TAG:
4407 if (avail < 2)
4408 goto done;
4409 if ((!terminate) &&
4410 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4411 goto done;
4412 htmlParseEndTag(ctxt);
4413 if (ctxt->nameNr == 0) {
4414 ctxt->instate = XML_PARSER_EPILOG;
4415 } else {
4416 ctxt->instate = XML_PARSER_CONTENT;
4417 }
4418 ctxt->checkIndex = 0;
4419#ifdef DEBUG_PUSH
4420 xmlGenericError(xmlGenericErrorContext,
4421 "HPP: entering CONTENT\n");
4422#endif
4423 break;
4424 case XML_PARSER_CDATA_SECTION:
4425 xmlGenericError(xmlGenericErrorContext,
4426 "HPP: internal error, state == CDATA\n");
4427 ctxt->instate = XML_PARSER_CONTENT;
4428 ctxt->checkIndex = 0;
4429#ifdef DEBUG_PUSH
4430 xmlGenericError(xmlGenericErrorContext,
4431 "HPP: entering CONTENT\n");
4432#endif
4433 break;
4434 case XML_PARSER_DTD:
4435 xmlGenericError(xmlGenericErrorContext,
4436 "HPP: internal error, state == DTD\n");
4437 ctxt->instate = XML_PARSER_CONTENT;
4438 ctxt->checkIndex = 0;
4439#ifdef DEBUG_PUSH
4440 xmlGenericError(xmlGenericErrorContext,
4441 "HPP: entering CONTENT\n");
4442#endif
4443 break;
4444 case XML_PARSER_COMMENT:
4445 xmlGenericError(xmlGenericErrorContext,
4446 "HPP: internal error, state == COMMENT\n");
4447 ctxt->instate = XML_PARSER_CONTENT;
4448 ctxt->checkIndex = 0;
4449#ifdef DEBUG_PUSH
4450 xmlGenericError(xmlGenericErrorContext,
4451 "HPP: entering CONTENT\n");
4452#endif
4453 break;
4454 case XML_PARSER_PI:
4455 xmlGenericError(xmlGenericErrorContext,
4456 "HPP: internal error, state == PI\n");
4457 ctxt->instate = XML_PARSER_CONTENT;
4458 ctxt->checkIndex = 0;
4459#ifdef DEBUG_PUSH
4460 xmlGenericError(xmlGenericErrorContext,
4461 "HPP: entering CONTENT\n");
4462#endif
4463 break;
4464 case XML_PARSER_ENTITY_DECL:
4465 xmlGenericError(xmlGenericErrorContext,
4466 "HPP: internal error, state == ENTITY_DECL\n");
4467 ctxt->instate = XML_PARSER_CONTENT;
4468 ctxt->checkIndex = 0;
4469#ifdef DEBUG_PUSH
4470 xmlGenericError(xmlGenericErrorContext,
4471 "HPP: entering CONTENT\n");
4472#endif
4473 break;
4474 case XML_PARSER_ENTITY_VALUE:
4475 xmlGenericError(xmlGenericErrorContext,
4476 "HPP: internal error, state == ENTITY_VALUE\n");
4477 ctxt->instate = XML_PARSER_CONTENT;
4478 ctxt->checkIndex = 0;
4479#ifdef DEBUG_PUSH
4480 xmlGenericError(xmlGenericErrorContext,
4481 "HPP: entering DTD\n");
4482#endif
4483 break;
4484 case XML_PARSER_ATTRIBUTE_VALUE:
4485 xmlGenericError(xmlGenericErrorContext,
4486 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4487 ctxt->instate = XML_PARSER_START_TAG;
4488 ctxt->checkIndex = 0;
4489#ifdef DEBUG_PUSH
4490 xmlGenericError(xmlGenericErrorContext,
4491 "HPP: entering START_TAG\n");
4492#endif
4493 break;
4494 case XML_PARSER_SYSTEM_LITERAL:
4495 xmlGenericError(xmlGenericErrorContext,
4496 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4497 ctxt->instate = XML_PARSER_CONTENT;
4498 ctxt->checkIndex = 0;
4499#ifdef DEBUG_PUSH
4500 xmlGenericError(xmlGenericErrorContext,
4501 "HPP: entering CONTENT\n");
4502#endif
4503 break;
4504 case XML_PARSER_IGNORE:
4505 xmlGenericError(xmlGenericErrorContext,
4506 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4507 ctxt->instate = XML_PARSER_CONTENT;
4508 ctxt->checkIndex = 0;
4509#ifdef DEBUG_PUSH
4510 xmlGenericError(xmlGenericErrorContext,
4511 "HPP: entering CONTENT\n");
4512#endif
4513 break;
4514 }
4515 }
4516done:
4517 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004518 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004519 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4520 /*
4521 * SAX: end of the document processing.
4522 */
4523 ctxt->instate = XML_PARSER_EOF;
4524 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4525 ctxt->sax->endDocument(ctxt->userData);
4526 }
4527 }
4528 if ((ctxt->myDoc != NULL) &&
4529 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4530 (ctxt->instate == XML_PARSER_EPILOG))) {
4531 xmlDtdPtr dtd;
4532 dtd = xmlGetIntSubset(ctxt->myDoc);
4533 if (dtd == NULL)
4534 ctxt->myDoc->intSubset =
4535 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4536 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4537 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4538 }
4539#ifdef DEBUG_PUSH
4540 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4541#endif
4542 return(ret);
4543}
4544
4545/**
Owen Taylor3473f882001-02-23 17:55:21 +00004546 * htmlParseChunk:
4547 * @ctxt: an XML parser context
4548 * @chunk: an char array
4549 * @size: the size in byte of the chunk
4550 * @terminate: last chunk indicator
4551 *
4552 * Parse a Chunk of memory
4553 *
4554 * Returns zero if no error, the xmlParserErrors otherwise.
4555 */
4556int
4557htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4558 int terminate) {
4559 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4560 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4561 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4562 int cur = ctxt->input->cur - ctxt->input->base;
4563
4564 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4565 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4566 ctxt->input->cur = ctxt->input->base + cur;
4567#ifdef DEBUG_PUSH
4568 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4569#endif
4570
4571 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4572 htmlParseTryOrFinish(ctxt, terminate);
4573 } else if (ctxt->instate != XML_PARSER_EOF) {
4574 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4575 htmlParseTryOrFinish(ctxt, terminate);
4576 }
4577 if (terminate) {
4578 if ((ctxt->instate != XML_PARSER_EOF) &&
4579 (ctxt->instate != XML_PARSER_EPILOG) &&
4580 (ctxt->instate != XML_PARSER_MISC)) {
4581 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004582 ctxt->wellFormed = 0;
4583 }
4584 if (ctxt->instate != XML_PARSER_EOF) {
4585 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4586 ctxt->sax->endDocument(ctxt->userData);
4587 }
4588 ctxt->instate = XML_PARSER_EOF;
4589 }
4590 return((xmlParserErrors) ctxt->errNo);
4591}
4592
4593/************************************************************************
4594 * *
4595 * User entry points *
4596 * *
4597 ************************************************************************/
4598
4599/**
4600 * htmlCreatePushParserCtxt :
4601 * @sax: a SAX handler
4602 * @user_data: The user data returned on SAX callbacks
4603 * @chunk: a pointer to an array of chars
4604 * @size: number of chars in the array
4605 * @filename: an optional file name or URI
4606 * @enc: an optional encoding
4607 *
4608 * Create a parser context for using the HTML parser in push mode
4609 * To allow content encoding detection, @size should be >= 4
4610 * The value of @filename is used for fetching external entities
4611 * and error/warning reports.
4612 *
4613 * Returns the new parser context or NULL
4614 */
4615htmlParserCtxtPtr
4616htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4617 const char *chunk, int size, const char *filename,
4618 xmlCharEncoding enc) {
4619 htmlParserCtxtPtr ctxt;
4620 htmlParserInputPtr inputStream;
4621 xmlParserInputBufferPtr buf;
4622
4623 buf = xmlAllocParserInputBuffer(enc);
4624 if (buf == NULL) return(NULL);
4625
4626 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4627 if (ctxt == NULL) {
4628 xmlFree(buf);
4629 return(NULL);
4630 }
4631 memset(ctxt, 0, sizeof(htmlParserCtxt));
4632 htmlInitParserCtxt(ctxt);
4633 if (sax != NULL) {
4634 if (ctxt->sax != &htmlDefaultSAXHandler)
4635 xmlFree(ctxt->sax);
4636 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4637 if (ctxt->sax == NULL) {
4638 xmlFree(buf);
4639 xmlFree(ctxt);
4640 return(NULL);
4641 }
4642 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4643 if (user_data != NULL)
4644 ctxt->userData = user_data;
4645 }
4646 if (filename == NULL) {
4647 ctxt->directory = NULL;
4648 } else {
4649 ctxt->directory = xmlParserGetDirectory(filename);
4650 }
4651
4652 inputStream = htmlNewInputStream(ctxt);
4653 if (inputStream == NULL) {
4654 xmlFreeParserCtxt(ctxt);
4655 return(NULL);
4656 }
4657
4658 if (filename == NULL)
4659 inputStream->filename = NULL;
4660 else
4661 inputStream->filename = xmlMemStrdup(filename);
4662 inputStream->buf = buf;
4663 inputStream->base = inputStream->buf->buffer->content;
4664 inputStream->cur = inputStream->buf->buffer->content;
4665
4666 inputPush(ctxt, inputStream);
4667
4668 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4669 (ctxt->input->buf != NULL)) {
4670 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4671#ifdef DEBUG_PUSH
4672 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4673#endif
4674 }
4675
4676 return(ctxt);
4677}
4678
4679/**
4680 * htmlSAXParseDoc :
4681 * @cur: a pointer to an array of xmlChar
4682 * @encoding: a free form C string describing the HTML document encoding, or NULL
4683 * @sax: the SAX handler block
4684 * @userData: if using SAX, this pointer will be provided on callbacks.
4685 *
4686 * parse an HTML in-memory document and build a tree.
4687 * It use the given SAX function block to handle the parsing callback.
4688 * If sax is NULL, fallback to the default DOM tree building routines.
4689 *
4690 * Returns the resulting document tree
4691 */
4692
4693htmlDocPtr
4694htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4695 htmlDocPtr ret;
4696 htmlParserCtxtPtr ctxt;
4697
4698 if (cur == NULL) return(NULL);
4699
4700
4701 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4702 if (ctxt == NULL) return(NULL);
4703 if (sax != NULL) {
4704 ctxt->sax = sax;
4705 ctxt->userData = userData;
4706 }
4707
4708 htmlParseDocument(ctxt);
4709 ret = ctxt->myDoc;
4710 if (sax != NULL) {
4711 ctxt->sax = NULL;
4712 ctxt->userData = NULL;
4713 }
4714 htmlFreeParserCtxt(ctxt);
4715
4716 return(ret);
4717}
4718
4719/**
4720 * htmlParseDoc :
4721 * @cur: a pointer to an array of xmlChar
4722 * @encoding: a free form C string describing the HTML document encoding, or NULL
4723 *
4724 * parse an HTML in-memory document and build a tree.
4725 *
4726 * Returns the resulting document tree
4727 */
4728
4729htmlDocPtr
4730htmlParseDoc(xmlChar *cur, const char *encoding) {
4731 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4732}
4733
4734
4735/**
4736 * htmlCreateFileParserCtxt :
4737 * @filename: the filename
4738 * @encoding: a free form C string describing the HTML document encoding, or NULL
4739 *
4740 * Create a parser context for a file content.
4741 * Automatic support for ZLIB/Compress compressed document is provided
4742 * by default if found at compile-time.
4743 *
4744 * Returns the new parser context or NULL
4745 */
4746htmlParserCtxtPtr
4747htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4748{
4749 htmlParserCtxtPtr ctxt;
4750 htmlParserInputPtr inputStream;
4751 xmlParserInputBufferPtr buf;
4752 /* htmlCharEncoding enc; */
4753 xmlChar *content, *content_line = (xmlChar *) "charset=";
4754
4755 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4756 if (buf == NULL) return(NULL);
4757
4758 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4759 if (ctxt == NULL) {
4760 perror("malloc");
4761 return(NULL);
4762 }
4763 memset(ctxt, 0, sizeof(htmlParserCtxt));
4764 htmlInitParserCtxt(ctxt);
4765 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4766 if (inputStream == NULL) {
4767 perror("malloc");
4768 xmlFree(ctxt);
4769 return(NULL);
4770 }
4771 memset(inputStream, 0, sizeof(htmlParserInput));
4772
4773 inputStream->filename = xmlMemStrdup(filename);
4774 inputStream->line = 1;
4775 inputStream->col = 1;
4776 inputStream->buf = buf;
4777 inputStream->directory = NULL;
4778
4779 inputStream->base = inputStream->buf->buffer->content;
4780 inputStream->cur = inputStream->buf->buffer->content;
4781 inputStream->free = NULL;
4782
4783 inputPush(ctxt, inputStream);
4784
4785 /* set encoding */
4786 if (encoding) {
4787 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4788 if (content) {
4789 strcpy ((char *)content, (char *)content_line);
4790 strcat ((char *)content, (char *)encoding);
4791 htmlCheckEncoding (ctxt, content);
4792 xmlFree (content);
4793 }
4794 }
4795
4796 return(ctxt);
4797}
4798
4799/**
4800 * htmlSAXParseFile :
4801 * @filename: the filename
4802 * @encoding: a free form C string describing the HTML document encoding, or NULL
4803 * @sax: the SAX handler block
4804 * @userData: if using SAX, this pointer will be provided on callbacks.
4805 *
4806 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4807 * compressed document is provided by default if found at compile-time.
4808 * It use the given SAX function block to handle the parsing callback.
4809 * If sax is NULL, fallback to the default DOM tree building routines.
4810 *
4811 * Returns the resulting document tree
4812 */
4813
4814htmlDocPtr
4815htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4816 void *userData) {
4817 htmlDocPtr ret;
4818 htmlParserCtxtPtr ctxt;
4819 htmlSAXHandlerPtr oldsax = NULL;
4820
4821 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4822 if (ctxt == NULL) return(NULL);
4823 if (sax != NULL) {
4824 oldsax = ctxt->sax;
4825 ctxt->sax = sax;
4826 ctxt->userData = userData;
4827 }
4828
4829 htmlParseDocument(ctxt);
4830
4831 ret = ctxt->myDoc;
4832 if (sax != NULL) {
4833 ctxt->sax = oldsax;
4834 ctxt->userData = NULL;
4835 }
4836 htmlFreeParserCtxt(ctxt);
4837
4838 return(ret);
4839}
4840
4841/**
4842 * htmlParseFile :
4843 * @filename: the filename
4844 * @encoding: a free form C string describing the HTML document encoding, or NULL
4845 *
4846 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4847 * compressed document is provided by default if found at compile-time.
4848 *
4849 * Returns the resulting document tree
4850 */
4851
4852htmlDocPtr
4853htmlParseFile(const char *filename, const char *encoding) {
4854 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4855}
4856
4857/**
4858 * htmlHandleOmittedElem:
4859 * @val: int 0 or 1
4860 *
4861 * Set and return the previous value for handling HTML omitted tags.
4862 *
4863 * Returns the last value for 0 for no handling, 1 for auto insertion.
4864 */
4865
4866int
4867htmlHandleOmittedElem(int val) {
4868 int old = htmlOmittedDefaultValue;
4869
4870 htmlOmittedDefaultValue = val;
4871 return(old);
4872}
4873
4874#endif /* LIBXML_HTML_ENABLED */