blob: ace49d9fcb952d202291092f6ec3badb7e28d3e0 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
43
44#define HTML_MAX_NAMELEN 1000
45#define HTML_PARSER_BIG_BUFFER_SIZE 1000
46#define HTML_PARSER_BUFFER_SIZE 100
47
48/* #define DEBUG */
49/* #define DEBUG_PUSH */
50
51int htmlOmittedDefaultValue = 1;
52
Daniel Veillard56a4cb82001-03-24 17:00:36 +000053xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
54 xmlChar end, xmlChar end2, xmlChar end3);
55
56/************************************************************************
57 * *
Owen Taylor3473f882001-02-23 17:55:21 +000058 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
66#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
68 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
70 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
71 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
73 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
75 return(0); \
76 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
82scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
83 type ret; \
84 if (ctxt->name##Nr < 0) return(0); \
85 ctxt->name##Nr--; \
86 if (ctxt->name##Nr < 0) return(0); \
87 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillard56a4cb82001-03-24 17:00:36 +000096/* PUSH_AND_POP(static, xmlNodePtr, node) */
97PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000098
/*
 * Macros for accessing the input. They are reserved to the parser, are not
 * exported, and all expect a parser context variable named ctxt in scope.
 *
 * Byte oriented macros; use them only to compare against ASCII values,
 * they look at a single xmlChar and would break on multi-byte characters:
 *
 *   CUR_PTR    the pointer to the current xmlChar of the input
 *   CUR        the current xmlChar, as an int
 *   CURRENT    same value as CUR
 *   RAW        -1 while ctxt->token is set, the current xmlChar otherwise
 *   NXT(n)     the xmlChar n positions after the current one
 *   UPPER      the current xmlChar converted to upper case
 *   UPP(n)     NXT(n) converted to upper case
 *   SKIP(n)    move n xmlChar forward, adding n to ctxt->nbChars
 *
 * Character oriented macros:
 *
 *   NEXT            advance with xmlNextChar() and count one char
 *   NEXTL(l)        update line/col, clear ctxt->token, advance l bytes
 *                   and count one char
 *   CUR_CHAR(l)     htmlCurrentChar() at the input position, length in l
 *   CUR_SCHAR(s, l) xmlStringCurrentChar() on string s, length in l
 *   COPY_BUF(l,b,i,v) store char v of length l at b[i] and advance i
 *
 * Buffer management:
 *
 *   SHRINK, GROW    xmlParserInputShrink() / xmlParserInputGrow() on the
 *                   current input
 *   SKIP_BLANKS     htmlSkipBlankChars() on the context
 */

/* Position and single xmlChar access */

#define CUR_PTR ctxt->input->cur

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Moving forward */

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

/* Imported from XML */
#define NEXT xmlNextChar(ctxt),ctxt->nbChars++

#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
  } while (0)

/*
 * Disabled tail of NEXTL, kept for reference:
 *     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);
 *     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 */

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Input buffer management */

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Full character access */

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Beware: expands to a bare if/else statement, so it must not be used as
 * the unbraced body of another if.
 */
#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v)
175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
187 * Returns the current char value and its lenght
188 */
189
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000190static int
Owen Taylor3473f882001-02-23 17:55:21 +0000191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
269 * Assume it's a fixed lenght encoding (1) with
270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
306/**
Owen Taylor3473f882001-02-23 17:55:21 +0000307 * htmlSkipBlankChars:
308 * @ctxt: the HTML parser context
309 *
310 * skip all blanks character found at that point in the input streams.
311 *
312 * Returns the number of space chars skipped
313 */
314
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000315static int
Owen Taylor3473f882001-02-23 17:55:21 +0000316htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
317 int res = 0;
318
319 while (IS_BLANK(*(ctxt->input->cur))) {
320 if ((*ctxt->input->cur == 0) &&
321 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
322 xmlPopInput(ctxt);
323 } else {
324 if (*(ctxt->input->cur) == '\n') {
325 ctxt->input->line++; ctxt->input->col = 1;
326 } else ctxt->input->col++;
327 ctxt->input->cur++;
328 ctxt->nbChars++;
329 if (*ctxt->input->cur == 0)
330 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
331 }
332 res++;
333 }
334 return(res);
335}
336
337
338
339/************************************************************************
340 * *
341 * The list of HTML elements and their properties *
342 * *
343 ************************************************************************/
344
345/*
346 * Start Tag: 1 means the start tag can be ommited
347 * End Tag: 1 means the end tag can be ommited
348 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000349 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000350 * Depr: this element is deprecated
351 * DTD: 1 means that this element is valid only in the Loose DTD
352 * 2 means that this element is valid only in the Frameset DTD
353 *
354 * Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
355 */
356htmlElemDesc html40ElementTable[] = {
357{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
358{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
359{ "acronym", 0, 0, 0, 0, 0, 0, "" },
360{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
361{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
362{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000363{ "b", 0, 3, 0, 0, 0, 0, "bold text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000364{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
365{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
366{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000367{ "big", 0, 3, 0, 0, 0, 0, "large text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000368{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
369{ "body", 1, 1, 0, 0, 0, 0, "document body " },
370{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
371{ "button", 0, 0, 0, 0, 0, 0, "push button " },
372{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000373{ "center", 0, 3, 0, 0, 1, 1, "shorthand for div align=center " },
Owen Taylor3473f882001-02-23 17:55:21 +0000374{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
375{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
376{ "col", 0, 2, 2, 1, 0, 0, "table column " },
377{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
378{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
379{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
380{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
381{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
382{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
383{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
384{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000385{ "em", 0, 3, 0, 0, 0, 0, "emphasis" },
Owen Taylor3473f882001-02-23 17:55:21 +0000386{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000387{ "font", 0, 3, 0, 0, 1, 1, "local change to font " },
Owen Taylor3473f882001-02-23 17:55:21 +0000388{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
389{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
390{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
391{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
392{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
393{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
394{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
395{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
396{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
397{ "head", 1, 1, 0, 0, 0, 0, "document head " },
398{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
399{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000400{ "i", 0, 3, 0, 0, 0, 0, "italic text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000401{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
402{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
403{ "input", 0, 2, 2, 1, 0, 0, "form control " },
404{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
405{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
406{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
407{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
408{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
409{ "li", 0, 1, 1, 0, 0, 0, "list item " },
410{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
411{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
412{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
413{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
414{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
415{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
416{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
417{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
418{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
419{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
420{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
421{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
422{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
423{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000424{ "s", 0, 3, 0, 0, 1, 1, "strike-through text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000425{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
426{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
427{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000428{ "small", 0, 3, 0, 0, 0, 0, "small text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000429{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000430{ "strike", 0, 3, 0, 0, 1, 1, "strike-through text" },
431{ "strong", 0, 3, 0, 0, 0, 0, "strong emphasis" },
Owen Taylor3473f882001-02-23 17:55:21 +0000432{ "style", 0, 0, 0, 0, 0, 0, "style info " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000433{ "sub", 0, 3, 0, 0, 0, 0, "subscript" },
434{ "sup", 0, 3, 0, 0, 0, 0, "superscript " },
Owen Taylor3473f882001-02-23 17:55:21 +0000435{ "table", 0, 0, 0, 0, 0, 0, "&#160;" },
436{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
437{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
438{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
439{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
440{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
441{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
442{ "title", 0, 0, 0, 0, 0, 0, "document title " },
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000443{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
Daniel Veillard56098d42001-04-24 12:51:09 +0000444{ "tt", 0, 3, 0, 0, 0, 0, "teletype or monospaced text style" },
445{ "u", 0, 3, 0, 0, 1, 1, "underlined text style" },
Owen Taylor3473f882001-02-23 17:55:21 +0000446{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
447{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
448};
449
/*
 * Start tags that imply the end of a current element.
 * The table is a sequence of NULL-terminated groups closed by an extra
 * NULL: any tag of a group implies the end of the current element when
 * that element belongs to the same group.
 *
 * According to the HTML DTD, HR should be added to the headings group, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
const char *htmlEquEnd[] = {
    /* list and option items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of table */
    NULL
};
466
/*
 * Start tags that imply the end of the current element.
 * The table is a sequence of NULL-terminated groups closed by an extra
 * NULL. The first name of a group is the new start tag, the names that
 * follow are the open elements this tag closes (see htmlCheckAutoClose()).
 */
const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    /* end of table */
    NULL
};
526
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
540
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event attributes of HTML 4).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * NOTE: the array is not NULL-terminated, iterate using its size.
 */
static const char *htmlScriptAttributes[] = {
    /* mouse events */
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    /* keyboard events */
    "onkeypress",
    "onkeydown",
    "onkeyup",
    /* document and form events */
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",          /* was misspelled "onrest" */
    "onchange",
    "onselect"
};
566
567
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000568static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000569static int htmlStartCloseIndexinitialized = 0;
570
571/************************************************************************
572 * *
573 * functions to handle HTML specific data *
574 * *
575 ************************************************************************/
576
577/**
578 * htmlInitAutoClose:
579 *
580 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
581 * This is not reentrant. Call xmlInitParser() once before processing in
582 * case of use in multithreaded programs.
583 */
584void
585htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000586 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000587
588 if (htmlStartCloseIndexinitialized) return;
589
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000590 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
591 indx = 0;
592 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
593 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000594 while (htmlStartClose[i] != NULL) i++;
595 i++;
596 }
597 htmlStartCloseIndexinitialized = 1;
598}
599
600/**
601 * htmlTagLookup:
602 * @tag: The tag name in lowercase
603 *
604 * Lookup the HTML tag in the ElementTable
605 *
606 * Returns the related htmlElemDescPtr or NULL if not found.
607 */
608htmlElemDescPtr
609htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000610 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000611
612 for (i = 0; i < (sizeof(html40ElementTable) /
613 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000614 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Owen Taylor3473f882001-02-23 17:55:21 +0000615 return(&html40ElementTable[i]);
616 }
617 return(NULL);
618}
619
620/**
621 * htmlCheckAutoClose:
622 * @newtag: The new tag name
623 * @oldtag: The old tag name
624 *
625 * Checks wether the new tag is one of the registered valid tags for closing old.
626 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
627 *
628 * Returns 0 if no, 1 if yes.
629 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000630static int
Owen Taylor3473f882001-02-23 17:55:21 +0000631htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000632 int i, indx;
633 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000634
635 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
636
637 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000638 for (indx = 0; indx < 100;indx++) {
639 closed = htmlStartCloseIndex[indx];
640 if (closed == NULL) return(0);
641 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000642 }
643
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000644 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000645 i++;
646 while (htmlStartClose[i] != NULL) {
647 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
648 return(1);
649 }
650 i++;
651 }
652 return(0);
653}
654
655/**
656 * htmlAutoCloseOnClose:
657 * @ctxt: an HTML parser context
658 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000659 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000660 *
661 * The HTmL DtD allows an ending tag to implicitely close other tags.
662 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000663static void
Owen Taylor3473f882001-02-23 17:55:21 +0000664htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
665 htmlElemDescPtr info;
666 xmlChar *oldname;
667 int i;
668
669#ifdef DEBUG
670 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
671 for (i = 0;i < ctxt->nameNr;i++)
672 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
673#endif
674
675 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
676 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
677 }
678 if (i < 0) return;
679
680 while (!xmlStrEqual(newtag, ctxt->name)) {
681 info = htmlTagLookup(ctxt->name);
682 if ((info == NULL) || (info->endTag == 1)) {
683#ifdef DEBUG
684 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
685#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000686 } else if (info->endTag == 3) {
687#ifdef DEBUG
688 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
689#endif
690 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
691 ctxt->sax->error(ctxt->userData,
692 "Opening and ending tag mismatch: %s and %s\n",
693 newtag, ctxt->name);
694 ctxt->wellFormed = 0;
695 } else {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000696 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000697 }
698 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
699 ctxt->sax->endElement(ctxt->userData, ctxt->name);
700 oldname = htmlnamePop(ctxt);
701 if (oldname != NULL) {
702#ifdef DEBUG
703 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
704#endif
705 xmlFree(oldname);
706 }
707 }
708}
709
710/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000711 * htmlAutoCloseOnEnd:
712 * @ctxt: an HTML parser context
713 *
714 * Close all remaining tags at the end of the stream
715 */
716static void
717htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
718 xmlChar *oldname;
719 int i;
720
721 if (ctxt->nameNr == 0)
722 return;
723#ifdef DEBUG
724 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
725#endif
726
727 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
728#ifdef DEBUG
729 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
730#endif
731 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
732 ctxt->sax->endElement(ctxt->userData, ctxt->name);
733 oldname = htmlnamePop(ctxt);
734 if (oldname != NULL) {
735#ifdef DEBUG
736 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
737#endif
738 xmlFree(oldname);
739 }
740 }
741}
742
743/**
Owen Taylor3473f882001-02-23 17:55:21 +0000744 * htmlAutoClose:
745 * @ctxt: an HTML parser context
746 * @newtag: The new tag name or NULL
747 *
748 * The HTmL DtD allows a tag to implicitely close other tags.
749 * The list is kept in htmlStartClose array. This function is
750 * called when a new tag has been detected and generates the
751 * appropriates closes if possible/needed.
752 * If newtag is NULL this mean we are at the end of the resource
753 * and we should check
754 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000755static void
Owen Taylor3473f882001-02-23 17:55:21 +0000756htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
757 xmlChar *oldname;
758 while ((newtag != NULL) && (ctxt->name != NULL) &&
759 (htmlCheckAutoClose(newtag, ctxt->name))) {
760#ifdef DEBUG
761 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
762#endif
763 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
764 ctxt->sax->endElement(ctxt->userData, ctxt->name);
765 oldname = htmlnamePop(ctxt);
766 if (oldname != NULL) {
767#ifdef DEBUG
768 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
769#endif
770 xmlFree(oldname);
771 }
772 }
773 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000774 htmlAutoCloseOnEnd(ctxt);
775 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000776 }
777 while ((newtag == NULL) && (ctxt->name != NULL) &&
778 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
779 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
780 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
781#ifdef DEBUG
782 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
783#endif
784 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
785 ctxt->sax->endElement(ctxt->userData, ctxt->name);
786 oldname = htmlnamePop(ctxt);
787 if (oldname != NULL) {
788#ifdef DEBUG
789 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
790#endif
791 xmlFree(oldname);
792 }
793 }
794
795}
796
797/**
798 * htmlAutoCloseTag:
799 * @doc: the HTML document
800 * @name: The tag name
801 * @elem: the HTML element
802 *
803 * The HTmL DtD allows a tag to implicitely close other tags.
804 * The list is kept in htmlStartClose array. This function checks
805 * if the element or one of it's children would autoclose the
806 * given tag.
807 *
808 * Returns 1 if autoclose, 0 otherwise
809 */
810int
811htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
812 htmlNodePtr child;
813
814 if (elem == NULL) return(1);
815 if (xmlStrEqual(name, elem->name)) return(0);
816 if (htmlCheckAutoClose(elem->name, name)) return(1);
817 child = elem->children;
818 while (child != NULL) {
819 if (htmlAutoCloseTag(doc, name, child)) return(1);
820 child = child->next;
821 }
822 return(0);
823}
824
825/**
826 * htmlIsAutoClosed:
827 * @doc: the HTML document
828 * @elem: the HTML element
829 *
830 * The HTmL DtD allows a tag to implicitely close other tags.
831 * The list is kept in htmlStartClose array. This function checks
832 * if a tag is autoclosed by one of it's child
833 *
834 * Returns 1 if autoclosed, 0 otherwise
835 */
836int
837htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
838 htmlNodePtr child;
839
840 if (elem == NULL) return(1);
841 child = elem->children;
842 while (child != NULL) {
843 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
844 child = child->next;
845 }
846 return(0);
847}
848
849/**
850 * htmlCheckImplied:
851 * @ctxt: an HTML parser context
852 * @newtag: The new tag name
853 *
854 * The HTML DtD allows a tag to exists only implicitely
855 * called when a new tag has been detected and generates the
856 * appropriates implicit tags if missing
857 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000858static void
Owen Taylor3473f882001-02-23 17:55:21 +0000859htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
860 if (!htmlOmittedDefaultValue)
861 return;
862 if (xmlStrEqual(newtag, BAD_CAST"html"))
863 return;
864 if (ctxt->nameNr <= 0) {
865#ifdef DEBUG
866 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
867#endif
868 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
869 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
870 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
871 }
872 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
873 return;
874 if ((ctxt->nameNr <= 1) &&
875 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
876 (xmlStrEqual(newtag, BAD_CAST"style")) ||
877 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
878 (xmlStrEqual(newtag, BAD_CAST"link")) ||
879 (xmlStrEqual(newtag, BAD_CAST"title")) ||
880 (xmlStrEqual(newtag, BAD_CAST"base")))) {
881 /*
882 * dropped OBJECT ... i you put it first BODY will be
883 * assumed !
884 */
885#ifdef DEBUG
886 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
887#endif
888 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
889 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
890 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
891 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
892 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
893 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
894 int i;
895 for (i = 0;i < ctxt->nameNr;i++) {
896 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
897 return;
898 }
899 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
900 return;
901 }
902 }
903
904#ifdef DEBUG
905 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
906#endif
907 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
908 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
909 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
910 }
911}
912
913/**
914 * htmlCheckParagraph
915 * @ctxt: an HTML parser context
916 *
917 * Check whether a p element need to be implied before inserting
918 * characters in the current element.
919 *
920 * Returns 1 if a paragraph has been inserted, 0 if not and -1
921 * in case of error.
922 */
923
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000924static int
Owen Taylor3473f882001-02-23 17:55:21 +0000925htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
926 const xmlChar *tag;
927 int i;
928
929 if (ctxt == NULL)
930 return(-1);
931 tag = ctxt->name;
932 if (tag == NULL) {
933 htmlAutoClose(ctxt, BAD_CAST"p");
934 htmlCheckImplied(ctxt, BAD_CAST"p");
935 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
936 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
937 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
938 return(1);
939 }
940 if (!htmlOmittedDefaultValue)
941 return(0);
942 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
943 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
944#ifdef DEBUG
945 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
946#endif
947 htmlAutoClose(ctxt, BAD_CAST"p");
948 htmlCheckImplied(ctxt, BAD_CAST"p");
949 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
950 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
951 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
952 return(1);
953 }
954 }
955 return(0);
956}
957
958/**
959 * htmlIsScriptAttribute:
960 * @name: an attribute name
961 *
962 * Check if an attribute is of content type Script
963 *
964 * Returns 1 is the attribute is a script 0 otherwise
965 */
966int
967htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000968 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000969
970 if (name == NULL)
971 return(0);
972 /*
973 * all script attributes start with 'on'
974 */
975 if ((name[0] != 'o') || (name[1] != 'n'))
976 return(0);
977 for (i = 0;
978 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
979 i++) {
980 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
981 return(1);
982 }
983 return(0);
984}
985
986/************************************************************************
987 * *
988 * The list of HTML predefined entities *
989 * *
990 ************************************************************************/
991
992
/*
 * Table of the predefined HTML 4.0 character entities. Each entry holds
 * the numeric value of the character, the entity name (written without
 * the leading '&' and trailing ';') and a descriptive string.
 *
 * The entries must stay sorted by increasing value: htmlEntityValueLookup()
 * stops scanning at the first entry whose value exceeds the one searched.
 */
htmlEntityDesc html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Whether to replace them really depends on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1275
1276/************************************************************************
1277 * *
1278 * Commodity functions to handle entities *
1279 * *
1280 ************************************************************************/
1281
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size. The
 * result of xmlRealloc() goes through a temporary so that the original
 * block is not lost when the reallocation fails: in that case the error
 * is reported, the old buffer is released and the ENCLOSING function
 * returns NULL. The macro can therefore only be used in functions
 * returning a pointer, and @buffer must be owned by that function.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                 buffer##_size * sizeof(xmlChar));	\
    if (growBuffer_tmp == NULL) {					\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1293
1294/**
1295 * htmlEntityLookup:
1296 * @name: the entity name
1297 *
1298 * Lookup the given entity in EntitiesTable
1299 *
1300 * TODO: the linear scan is really ugly, an hash table is really needed.
1301 *
1302 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1303 */
1304htmlEntityDescPtr
1305htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001306 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001307
1308 for (i = 0;i < (sizeof(html40EntitiesTable)/
1309 sizeof(html40EntitiesTable[0]));i++) {
1310 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1311#ifdef DEBUG
1312 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1313#endif
1314 return(&html40EntitiesTable[i]);
1315 }
1316 }
1317 return(NULL);
1318}
1319
1320/**
1321 * htmlEntityValueLookup:
1322 * @value: the entity's unicode value
1323 *
1324 * Lookup the given entity in EntitiesTable
1325 *
1326 * TODO: the linear scan is really ugly, an hash table is really needed.
1327 *
1328 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1329 */
1330htmlEntityDescPtr
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001331htmlEntityValueLookup(unsigned int value) {
1332 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001333#ifdef DEBUG
1334 int lv = 0;
1335#endif
1336
1337 for (i = 0;i < (sizeof(html40EntitiesTable)/
1338 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001339 if (html40EntitiesTable[i].value >= value) {
1340 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001341 break;
1342#ifdef DEBUG
1343 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1344#endif
1345 return(&html40EntitiesTable[i]);
1346 }
1347#ifdef DEBUG
1348 if (lv > html40EntitiesTable[i].value) {
1349 xmlGenericError(xmlGenericErrorContext,
1350 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1351 lv, html40EntitiesTable[i].value);
1352 }
1353 lv = html40EntitiesTable[i].value;
1354#endif
1355 }
1356 return(NULL);
1357}
1358
1359/**
1360 * UTF8ToHtml:
1361 * @out: a pointer to an array of bytes to store the result
1362 * @outlen: the length of @out
1363 * @in: a pointer to an array of UTF-8 chars
1364 * @inlen: the length of @in
1365 *
1366 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1367 * plus HTML entities block of chars out.
1368 *
1369 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1370 * The value of @inlen after return is the number of octets consumed
1371 * as the return value is positive, else unpredictiable.
1372 * The value of @outlen after return is the number of octets consumed.
1373 */
1374int
1375UTF8ToHtml(unsigned char* out, int *outlen,
1376 const unsigned char* in, int *inlen) {
1377 const unsigned char* processed = in;
1378 const unsigned char* outend;
1379 const unsigned char* outstart = out;
1380 const unsigned char* instart = in;
1381 const unsigned char* inend;
1382 unsigned int c, d;
1383 int trailing;
1384
1385 if (in == NULL) {
1386 /*
1387 * initialization nothing to do
1388 */
1389 *outlen = 0;
1390 *inlen = 0;
1391 return(0);
1392 }
1393 inend = in + (*inlen);
1394 outend = out + (*outlen);
1395 while (in < inend) {
1396 d = *in++;
1397 if (d < 0x80) { c= d; trailing= 0; }
1398 else if (d < 0xC0) {
1399 /* trailing byte in leading position */
1400 *outlen = out - outstart;
1401 *inlen = processed - instart;
1402 return(-2);
1403 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1404 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1405 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1406 else {
1407 /* no chance for this in Ascii */
1408 *outlen = out - outstart;
1409 *inlen = processed - instart;
1410 return(-2);
1411 }
1412
1413 if (inend - in < trailing) {
1414 break;
1415 }
1416
1417 for ( ; trailing; trailing--) {
1418 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1419 break;
1420 c <<= 6;
1421 c |= d & 0x3F;
1422 }
1423
1424 /* assertion: c is a single UTF-4 value */
1425 if (c < 0x80) {
1426 if (out + 1 >= outend)
1427 break;
1428 *out++ = c;
1429 } else {
1430 int len;
1431 htmlEntityDescPtr ent;
1432
1433 /*
1434 * Try to lookup a predefined HTML entity for it
1435 */
1436
1437 ent = htmlEntityValueLookup(c);
1438 if (ent == NULL) {
1439 /* no chance for this in Ascii */
1440 *outlen = out - outstart;
1441 *inlen = processed - instart;
1442 return(-2);
1443 }
1444 len = strlen(ent->name);
1445 if (out + 2 + len >= outend)
1446 break;
1447 *out++ = '&';
1448 memcpy(out, ent->name, len);
1449 out += len;
1450 *out++ = ';';
1451 }
1452 processed = in;
1453 }
1454 *outlen = out - outstart;
1455 *inlen = processed - instart;
1456 return(0);
1457}
1458
1459/**
1460 * htmlEncodeEntities:
1461 * @out: a pointer to an array of bytes to store the result
1462 * @outlen: the length of @out
1463 * @in: a pointer to an array of UTF-8 chars
1464 * @inlen: the length of @in
1465 * @quoteChar: the quote character to escape (' or ") or zero.
1466 *
1467 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1468 * plus HTML entities block of chars out.
1469 *
1470 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1471 * The value of @inlen after return is the number of octets consumed
1472 * as the return value is positive, else unpredictiable.
1473 * The value of @outlen after return is the number of octets consumed.
1474 */
1475int
1476htmlEncodeEntities(unsigned char* out, int *outlen,
1477 const unsigned char* in, int *inlen, int quoteChar) {
1478 const unsigned char* processed = in;
1479 const unsigned char* outend = out + (*outlen);
1480 const unsigned char* outstart = out;
1481 const unsigned char* instart = in;
1482 const unsigned char* inend = in + (*inlen);
1483 unsigned int c, d;
1484 int trailing;
1485
1486 while (in < inend) {
1487 d = *in++;
1488 if (d < 0x80) { c= d; trailing= 0; }
1489 else if (d < 0xC0) {
1490 /* trailing byte in leading position */
1491 *outlen = out - outstart;
1492 *inlen = processed - instart;
1493 return(-2);
1494 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1495 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1496 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1497 else {
1498 /* no chance for this in Ascii */
1499 *outlen = out - outstart;
1500 *inlen = processed - instart;
1501 return(-2);
1502 }
1503
1504 if (inend - in < trailing)
1505 break;
1506
1507 while (trailing--) {
1508 if (((d= *in++) & 0xC0) != 0x80) {
1509 *outlen = out - outstart;
1510 *inlen = processed - instart;
1511 return(-2);
1512 }
1513 c <<= 6;
1514 c |= d & 0x3F;
1515 }
1516
1517 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001518 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1519 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001520 if (out >= outend)
1521 break;
1522 *out++ = c;
1523 } else {
1524 htmlEntityDescPtr ent;
1525 const char *cp;
1526 char nbuf[16];
1527 int len;
1528
1529 /*
1530 * Try to lookup a predefined HTML entity for it
1531 */
1532 ent = htmlEntityValueLookup(c);
1533 if (ent == NULL) {
1534 sprintf(nbuf, "#%u", c);
1535 cp = nbuf;
1536 }
1537 else
1538 cp = ent->name;
1539 len = strlen(cp);
1540 if (out + 2 + len > outend)
1541 break;
1542 *out++ = '&';
1543 memcpy(out, cp, len);
1544 out += len;
1545 *out++ = ';';
1546 }
1547 processed = in;
1548 }
1549 *outlen = out - outstart;
1550 *inlen = processed - instart;
1551 return(0);
1552}
1553
1554/**
1555 * htmlDecodeEntities:
1556 * @ctxt: the parser context
1557 * @len: the len to decode (in bytes !), -1 for no size limit
1558 * @end: an end marker xmlChar, 0 if none
1559 * @end2: an end marker xmlChar, 0 if none
1560 * @end3: an end marker xmlChar, 0 if none
1561 *
1562 * Subtitute the HTML entities by their value
1563 *
1564 * DEPRECATED !!!!
1565 *
1566 * Returns A newly allocated string with the substitution done. The caller
1567 * must deallocate it !
1568 */
1569xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001570htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1571 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001572 static int deprecated = 0;
1573 if (!deprecated) {
1574 xmlGenericError(xmlGenericErrorContext,
1575 "htmlDecodeEntities() deprecated function reached\n");
1576 deprecated = 1;
1577 }
1578 return(NULL);
1579#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001580 xmlChar *name = NULL;
1581 xmlChar *buffer = NULL;
1582 unsigned int buffer_size = 0;
1583 unsigned int nbchars = 0;
1584 htmlEntityDescPtr ent;
1585 unsigned int max = (unsigned int) len;
1586 int c,l;
1587
1588 if (ctxt->depth > 40) {
1589 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1590 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1591 ctxt->sax->error(ctxt->userData,
1592 "Detected entity reference loop\n");
1593 ctxt->wellFormed = 0;
1594 ctxt->disableSAX = 1;
1595 return(NULL);
1596 }
1597
1598 /*
1599 * allocate a translation buffer.
1600 */
1601 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1602 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1603 if (buffer == NULL) {
1604 perror("xmlDecodeEntities: malloc failed");
1605 return(NULL);
1606 }
1607
1608 /*
1609 * Ok loop until we reach one of the ending char or a size limit.
1610 */
1611 c = CUR_CHAR(l);
1612 while ((nbchars < max) && (c != end) &&
1613 (c != end2) && (c != end3)) {
1614
1615 if (c == 0) break;
1616 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1617 int val = htmlParseCharRef(ctxt);
1618 COPY_BUF(0,buffer,nbchars,val);
1619 NEXTL(l);
1620 } else if ((c == '&') && (ctxt->token != '&')) {
1621 ent = htmlParseEntityRef(ctxt, &name);
1622 if (name != NULL) {
1623 if (ent != NULL) {
1624 int val = ent->value;
1625 COPY_BUF(0,buffer,nbchars,val);
1626 NEXTL(l);
1627 } else {
1628 const xmlChar *cur = name;
1629
1630 buffer[nbchars++] = '&';
1631 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1632 growBuffer(buffer);
1633 }
1634 while (*cur != 0) {
1635 buffer[nbchars++] = *cur++;
1636 }
1637 buffer[nbchars++] = ';';
1638 }
1639 }
1640 } else {
1641 COPY_BUF(l,buffer,nbchars,c);
1642 NEXTL(l);
1643 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1644 growBuffer(buffer);
1645 }
1646 }
1647 c = CUR_CHAR(l);
1648 }
1649 buffer[nbchars++] = 0;
1650 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001651#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001652}
1653
1654/************************************************************************
1655 * *
1656 * Commodity functions to handle streams *
1657 * *
1658 ************************************************************************/
1659
1660/**
Owen Taylor3473f882001-02-23 17:55:21 +00001661 * htmlNewInputStream:
1662 * @ctxt: an HTML parser context
1663 *
1664 * Create a new input stream structure
1665 * Returns the new input stream or NULL
1666 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001667static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001668htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1669 htmlParserInputPtr input;
1670
1671 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1672 if (input == NULL) {
1673 ctxt->errNo = XML_ERR_NO_MEMORY;
1674 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1675 ctxt->sax->error(ctxt->userData,
1676 "malloc: couldn't allocate a new input stream\n");
1677 return(NULL);
1678 }
1679 memset(input, 0, sizeof(htmlParserInput));
1680 input->filename = NULL;
1681 input->directory = NULL;
1682 input->base = NULL;
1683 input->cur = NULL;
1684 input->buf = NULL;
1685 input->line = 1;
1686 input->col = 1;
1687 input->buf = NULL;
1688 input->free = NULL;
1689 input->version = NULL;
1690 input->consumed = 0;
1691 input->length = 0;
1692 return(input);
1693}
1694
1695
1696/************************************************************************
1697 * *
1698 * Commodity functions, cleanup needed ? *
1699 * *
1700 ************************************************************************/
1701
1702/**
1703 * areBlanks:
1704 * @ctxt: an HTML parser context
1705 * @str: a xmlChar *
1706 * @len: the size of @str
1707 *
1708 * Is this a sequence of blank chars that one can ignore ?
1709 *
1710 * Returns 1 if ignorable 0 otherwise.
1711 */
1712
1713static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1714 int i;
1715 xmlNodePtr lastChild;
1716
1717 for (i = 0;i < len;i++)
1718 if (!(IS_BLANK(str[i]))) return(0);
1719
1720 if (CUR == 0) return(1);
1721 if (CUR != '<') return(0);
1722 if (ctxt->name == NULL)
1723 return(1);
1724 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1725 return(1);
1726 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1727 return(1);
1728 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1729 return(1);
1730 if (ctxt->node == NULL) return(0);
1731 lastChild = xmlGetLastChild(ctxt->node);
1732 if (lastChild == NULL) {
1733 if (ctxt->node->content != NULL) return(0);
1734 } else if (xmlNodeIsText(lastChild)) {
1735 return(0);
1736 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1737 return(0);
1738 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1739 return(0);
1740 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1741 return(0);
1742 }
1743 return(1);
1744}
1745
1746/**
Owen Taylor3473f882001-02-23 17:55:21 +00001747 * htmlNewDocNoDtD:
1748 * @URI: URI for the dtd, or NULL
1749 * @ExternalID: the external ID of the DTD, or NULL
1750 *
1751 * Returns a new document, do not intialize the DTD if not provided
1752 */
1753htmlDocPtr
1754htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1755 xmlDocPtr cur;
1756
1757 /*
1758 * Allocate a new document and fill the fields.
1759 */
1760 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1761 if (cur == NULL) {
1762 xmlGenericError(xmlGenericErrorContext,
1763 "xmlNewDoc : malloc failed\n");
1764 return(NULL);
1765 }
1766 memset(cur, 0, sizeof(xmlDoc));
1767
1768 cur->type = XML_HTML_DOCUMENT_NODE;
1769 cur->version = NULL;
1770 cur->intSubset = NULL;
1771 if ((ExternalID != NULL) ||
1772 (URI != NULL))
1773 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1774 cur->doc = cur;
1775 cur->name = NULL;
1776 cur->children = NULL;
1777 cur->extSubset = NULL;
1778 cur->oldNs = NULL;
1779 cur->encoding = NULL;
1780 cur->standalone = 1;
1781 cur->compression = 0;
1782 cur->ids = NULL;
1783 cur->refs = NULL;
1784#ifndef XML_WITHOUT_CORBA
1785 cur->_private = NULL;
1786#endif
1787 return(cur);
1788}
1789
1790/**
1791 * htmlNewDoc:
1792 * @URI: URI for the dtd, or NULL
1793 * @ExternalID: the external ID of the DTD, or NULL
1794 *
1795 * Returns a new document
1796 */
1797htmlDocPtr
1798htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1799 if ((URI == NULL) && (ExternalID == NULL))
1800 return(htmlNewDocNoDtD(
1801 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1802 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1803
1804 return(htmlNewDocNoDtD(URI, ExternalID));
1805}
1806
1807
1808/************************************************************************
1809 * *
1810 * The parser itself *
1811 * Relates to http://www.w3.org/TR/html40 *
1812 * *
1813 ************************************************************************/
1814
1815/************************************************************************
1816 * *
1817 * The parser itself *
1818 * *
1819 ************************************************************************/
1820
1821/**
1822 * htmlParseHTMLName:
1823 * @ctxt: an HTML parser context
1824 *
1825 * parse an HTML tag or attribute name, note that we convert it to lowercase
1826 * since HTML names are not case-sensitive.
1827 *
1828 * Returns the Tag Name parsed or NULL
1829 */
1830
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001831static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001832htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1833 xmlChar *ret = NULL;
1834 int i = 0;
1835 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1836
1837 if (!IS_LETTER(CUR) && (CUR != '_') &&
1838 (CUR != ':')) return(NULL);
1839
1840 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1841 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1842 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1843 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1844 else loc[i] = CUR;
1845 i++;
1846
1847 NEXT;
1848 }
1849
1850 ret = xmlStrndup(loc, i);
1851
1852 return(ret);
1853}
1854
1855/**
1856 * htmlParseName:
1857 * @ctxt: an HTML parser context
1858 *
1859 * parse an HTML name, this routine is case sensistive.
1860 *
1861 * Returns the Name parsed or NULL
1862 */
1863
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001864static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001865htmlParseName(htmlParserCtxtPtr ctxt) {
1866 xmlChar buf[HTML_MAX_NAMELEN];
1867 int len = 0;
1868
1869 GROW;
1870 if (!IS_LETTER(CUR) && (CUR != '_')) {
1871 return(NULL);
1872 }
1873
1874 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1875 (CUR == '.') || (CUR == '-') ||
1876 (CUR == '_') || (CUR == ':') ||
1877 (IS_COMBINING(CUR)) ||
1878 (IS_EXTENDER(CUR))) {
1879 buf[len++] = CUR;
1880 NEXT;
1881 if (len >= HTML_MAX_NAMELEN) {
1882 xmlGenericError(xmlGenericErrorContext,
1883 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1884 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1885 (CUR == '.') || (CUR == '-') ||
1886 (CUR == '_') || (CUR == ':') ||
1887 (IS_COMBINING(CUR)) ||
1888 (IS_EXTENDER(CUR)))
1889 NEXT;
1890 break;
1891 }
1892 }
1893 return(xmlStrndup(buf, len));
1894}
1895
1896/**
1897 * htmlParseHTMLAttribute:
1898 * @ctxt: an HTML parser context
1899 * @stop: a char stop value
1900 *
1901 * parse an HTML attribute value till the stop (quote), if
1902 * stop is 0 then it stops at the first space
1903 *
1904 * Returns the attribute parsed or NULL
1905 */
1906
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001907static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001908htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1909 xmlChar *buffer = NULL;
1910 int buffer_size = 0;
1911 xmlChar *out = NULL;
1912 xmlChar *name = NULL;
1913
1914 xmlChar *cur = NULL;
1915 htmlEntityDescPtr ent;
1916
1917 /*
1918 * allocate a translation buffer.
1919 */
1920 buffer_size = HTML_PARSER_BUFFER_SIZE;
1921 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1922 if (buffer == NULL) {
1923 perror("htmlParseHTMLAttribute: malloc failed");
1924 return(NULL);
1925 }
1926 out = buffer;
1927
1928 /*
1929 * Ok loop until we reach one of the ending chars
1930 */
1931 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1932 if ((stop == 0) && (IS_BLANK(CUR))) break;
1933 if (CUR == '&') {
1934 if (NXT(1) == '#') {
1935 unsigned int c;
1936 int bits;
1937
1938 c = htmlParseCharRef(ctxt);
1939 if (c < 0x80)
1940 { *out++ = c; bits= -6; }
1941 else if (c < 0x800)
1942 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1943 else if (c < 0x10000)
1944 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1945 else
1946 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1947
1948 for ( ; bits >= 0; bits-= 6) {
1949 *out++ = ((c >> bits) & 0x3F) | 0x80;
1950 }
1951 } else {
1952 ent = htmlParseEntityRef(ctxt, &name);
1953 if (name == NULL) {
1954 *out++ = '&';
1955 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001956 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001957
1958 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001959 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001960 }
1961 } else if (ent == NULL) {
1962 *out++ = '&';
1963 cur = name;
1964 while (*cur != 0) {
1965 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001966 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001967
1968 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001969 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001970 }
1971 *out++ = *cur++;
1972 }
1973 xmlFree(name);
1974 } else {
1975 unsigned int c;
1976 int bits;
1977
1978 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001979 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00001980
1981 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001982 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00001983 }
1984 c = (xmlChar)ent->value;
1985 if (c < 0x80)
1986 { *out++ = c; bits= -6; }
1987 else if (c < 0x800)
1988 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1989 else if (c < 0x10000)
1990 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1991 else
1992 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1993
1994 for ( ; bits >= 0; bits-= 6) {
1995 *out++ = ((c >> bits) & 0x3F) | 0x80;
1996 }
1997 xmlFree(name);
1998 }
1999 }
2000 } else {
2001 unsigned int c;
2002 int bits, l;
2003
2004 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002005 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002006
2007 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002008 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002009 }
2010 c = CUR_CHAR(l);
2011 if (c < 0x80)
2012 { *out++ = c; bits= -6; }
2013 else if (c < 0x800)
2014 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2015 else if (c < 0x10000)
2016 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2017 else
2018 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2019
2020 for ( ; bits >= 0; bits-= 6) {
2021 *out++ = ((c >> bits) & 0x3F) | 0x80;
2022 }
2023 NEXT;
2024 }
2025 }
2026 *out++ = 0;
2027 return(buffer);
2028}
2029
2030/**
Owen Taylor3473f882001-02-23 17:55:21 +00002031 * htmlParseEntityRef:
2032 * @ctxt: an HTML parser context
2033 * @str: location to store the entity name
2034 *
2035 * parse an HTML ENTITY references
2036 *
2037 * [68] EntityRef ::= '&' Name ';'
2038 *
2039 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2040 * if non-NULL *str will have to be freed by the caller.
2041 */
2042htmlEntityDescPtr
2043htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2044 xmlChar *name;
2045 htmlEntityDescPtr ent = NULL;
2046 *str = NULL;
2047
2048 if (CUR == '&') {
2049 NEXT;
2050 name = htmlParseName(ctxt);
2051 if (name == NULL) {
2052 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2053 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2054 ctxt->wellFormed = 0;
2055 } else {
2056 GROW;
2057 if (CUR == ';') {
2058 *str = name;
2059
2060 /*
2061 * Lookup the entity in the table.
2062 */
2063 ent = htmlEntityLookup(name);
2064 if (ent != NULL) /* OK that's ugly !!! */
2065 NEXT;
2066 } else {
2067 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2068 ctxt->sax->error(ctxt->userData,
2069 "htmlParseEntityRef: expecting ';'\n");
2070 *str = name;
2071 }
2072 }
2073 }
2074 return(ent);
2075}
2076
2077/**
2078 * htmlParseAttValue:
2079 * @ctxt: an HTML parser context
2080 *
2081 * parse a value for an attribute
2082 * Note: the parser won't do substitution of entities here, this
2083 * will be handled later in xmlStringGetNodeList, unless it was
2084 * asked for ctxt->replaceEntities != 0
2085 *
2086 * Returns the AttValue parsed or NULL.
2087 */
2088
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002089static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002090htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2091 xmlChar *ret = NULL;
2092
2093 if (CUR == '"') {
2094 NEXT;
2095 ret = htmlParseHTMLAttribute(ctxt, '"');
2096 if (CUR != '"') {
2097 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2098 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2099 ctxt->wellFormed = 0;
2100 } else
2101 NEXT;
2102 } else if (CUR == '\'') {
2103 NEXT;
2104 ret = htmlParseHTMLAttribute(ctxt, '\'');
2105 if (CUR != '\'') {
2106 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2107 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2108 ctxt->wellFormed = 0;
2109 } else
2110 NEXT;
2111 } else {
2112 /*
2113 * That's an HTMLism, the attribute value may not be quoted
2114 */
2115 ret = htmlParseHTMLAttribute(ctxt, 0);
2116 if (ret == NULL) {
2117 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2118 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2119 ctxt->wellFormed = 0;
2120 }
2121 }
2122 return(ret);
2123}
2124
2125/**
2126 * htmlParseSystemLiteral:
2127 * @ctxt: an HTML parser context
2128 *
2129 * parse an HTML Literal
2130 *
2131 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2132 *
2133 * Returns the SystemLiteral parsed or NULL
2134 */
2135
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002136static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002137htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2138 const xmlChar *q;
2139 xmlChar *ret = NULL;
2140
2141 if (CUR == '"') {
2142 NEXT;
2143 q = CUR_PTR;
2144 while ((IS_CHAR(CUR)) && (CUR != '"'))
2145 NEXT;
2146 if (!IS_CHAR(CUR)) {
2147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2148 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2149 ctxt->wellFormed = 0;
2150 } else {
2151 ret = xmlStrndup(q, CUR_PTR - q);
2152 NEXT;
2153 }
2154 } else if (CUR == '\'') {
2155 NEXT;
2156 q = CUR_PTR;
2157 while ((IS_CHAR(CUR)) && (CUR != '\''))
2158 NEXT;
2159 if (!IS_CHAR(CUR)) {
2160 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2161 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2162 ctxt->wellFormed = 0;
2163 } else {
2164 ret = xmlStrndup(q, CUR_PTR - q);
2165 NEXT;
2166 }
2167 } else {
2168 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2169 ctxt->sax->error(ctxt->userData,
2170 "SystemLiteral \" or ' expected\n");
2171 ctxt->wellFormed = 0;
2172 }
2173
2174 return(ret);
2175}
2176
2177/**
2178 * htmlParsePubidLiteral:
2179 * @ctxt: an HTML parser context
2180 *
2181 * parse an HTML public literal
2182 *
2183 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2184 *
2185 * Returns the PubidLiteral parsed or NULL.
2186 */
2187
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002188static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002189htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2190 const xmlChar *q;
2191 xmlChar *ret = NULL;
2192 /*
2193 * Name ::= (Letter | '_') (NameChar)*
2194 */
2195 if (CUR == '"') {
2196 NEXT;
2197 q = CUR_PTR;
2198 while (IS_PUBIDCHAR(CUR)) NEXT;
2199 if (CUR != '"') {
2200 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2201 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2202 ctxt->wellFormed = 0;
2203 } else {
2204 ret = xmlStrndup(q, CUR_PTR - q);
2205 NEXT;
2206 }
2207 } else if (CUR == '\'') {
2208 NEXT;
2209 q = CUR_PTR;
2210 while ((IS_LETTER(CUR)) && (CUR != '\''))
2211 NEXT;
2212 if (!IS_LETTER(CUR)) {
2213 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2214 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2215 ctxt->wellFormed = 0;
2216 } else {
2217 ret = xmlStrndup(q, CUR_PTR - q);
2218 NEXT;
2219 }
2220 } else {
2221 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2222 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2223 ctxt->wellFormed = 0;
2224 }
2225
2226 return(ret);
2227}
2228
2229/**
2230 * htmlParseScript:
2231 * @ctxt: an HTML parser context
2232 *
2233 * parse the content of an HTML SCRIPT or STYLE element
2234 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2235 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2236 * http://www.w3.org/TR/html4/types.html#type-script
2237 * http://www.w3.org/TR/html4/types.html#h-6.15
2238 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2239 *
2240 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2241 * element and the value of intrinsic event attributes. User agents must
2242 * not evaluate script data as HTML markup but instead must pass it on as
2243 * data to a script engine.
2244 * NOTES:
2245 * - The content is passed like CDATA
2246 * - the attributes for style and scripting "onXXX" are also described
2247 * as CDATA but SGML allows entities references in attributes so their
2248 * processing is identical as other attributes
2249 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002250static void
Owen Taylor3473f882001-02-23 17:55:21 +00002251htmlParseScript(htmlParserCtxtPtr ctxt) {
2252 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2253 int nbchar = 0;
2254 xmlChar cur;
2255
2256 SHRINK;
2257 cur = CUR;
2258 while (IS_CHAR(cur)) {
2259 if ((cur == '<') && (NXT(1) == '/')) {
2260 /*
2261 * One should break here, the specification is clear:
2262 * Authors should therefore escape "</" within the content.
2263 * Escape mechanisms are specific to each scripting or
2264 * style sheet language.
2265 */
2266 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2267 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2268 break; /* while */
2269 }
2270 buf[nbchar++] = cur;
2271 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2272 if (ctxt->sax->cdataBlock!= NULL) {
2273 /*
2274 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2275 */
2276 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2277 }
2278 nbchar = 0;
2279 }
2280 NEXT;
2281 cur = CUR;
2282 }
2283 if (!(IS_CHAR(cur))) {
2284 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2285 ctxt->sax->error(ctxt->userData,
2286 "Invalid char in CDATA 0x%X\n", cur);
2287 ctxt->wellFormed = 0;
2288 NEXT;
2289 }
2290
2291 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2292 if (ctxt->sax->cdataBlock!= NULL) {
2293 /*
2294 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2295 */
2296 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2297 }
2298 }
2299}
2300
2301
2302/**
2303 * htmlParseCharData:
2304 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002305 *
2306 * parse a CharData section.
2307 * if we are within a CDATA section ']]>' marks an end of section.
2308 *
2309 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2310 */
2311
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002312static void
2313htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002314 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2315 int nbchar = 0;
2316 int cur, l;
2317
2318 SHRINK;
2319 cur = CUR_CHAR(l);
2320 while (((cur != '<') || (ctxt->token == '<')) &&
2321 ((cur != '&') || (ctxt->token == '&')) &&
2322 (IS_CHAR(cur))) {
2323 COPY_BUF(l,buf,nbchar,cur);
2324 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2325 /*
2326 * Ok the segment is to be consumed as chars.
2327 */
2328 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2329 if (areBlanks(ctxt, buf, nbchar)) {
2330 if (ctxt->sax->ignorableWhitespace != NULL)
2331 ctxt->sax->ignorableWhitespace(ctxt->userData,
2332 buf, nbchar);
2333 } else {
2334 htmlCheckParagraph(ctxt);
2335 if (ctxt->sax->characters != NULL)
2336 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2337 }
2338 }
2339 nbchar = 0;
2340 }
2341 NEXTL(l);
2342 cur = CUR_CHAR(l);
2343 }
2344 if (nbchar != 0) {
2345 /*
2346 * Ok the segment is to be consumed as chars.
2347 */
2348 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2349 if (areBlanks(ctxt, buf, nbchar)) {
2350 if (ctxt->sax->ignorableWhitespace != NULL)
2351 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2352 } else {
2353 htmlCheckParagraph(ctxt);
2354 if (ctxt->sax->characters != NULL)
2355 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2356 }
2357 }
2358 }
2359}
2360
2361/**
2362 * htmlParseExternalID:
2363 * @ctxt: an HTML parser context
2364 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002365 *
2366 * Parse an External ID or a Public ID
2367 *
Owen Taylor3473f882001-02-23 17:55:21 +00002368 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2369 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2370 *
2371 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2372 *
2373 * Returns the function returns SystemLiteral and in the second
2374 * case publicID receives PubidLiteral, is strict is off
2375 * it is possible to return NULL and have publicID set.
2376 */
2377
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002378static xmlChar *
2379htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002380 xmlChar *URI = NULL;
2381
2382 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2383 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2384 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2385 SKIP(6);
2386 if (!IS_BLANK(CUR)) {
2387 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2388 ctxt->sax->error(ctxt->userData,
2389 "Space required after 'SYSTEM'\n");
2390 ctxt->wellFormed = 0;
2391 }
2392 SKIP_BLANKS;
2393 URI = htmlParseSystemLiteral(ctxt);
2394 if (URI == NULL) {
2395 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2396 ctxt->sax->error(ctxt->userData,
2397 "htmlParseExternalID: SYSTEM, no URI\n");
2398 ctxt->wellFormed = 0;
2399 }
2400 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2401 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2402 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2403 SKIP(6);
2404 if (!IS_BLANK(CUR)) {
2405 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2406 ctxt->sax->error(ctxt->userData,
2407 "Space required after 'PUBLIC'\n");
2408 ctxt->wellFormed = 0;
2409 }
2410 SKIP_BLANKS;
2411 *publicID = htmlParsePubidLiteral(ctxt);
2412 if (*publicID == NULL) {
2413 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2414 ctxt->sax->error(ctxt->userData,
2415 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2416 ctxt->wellFormed = 0;
2417 }
2418 SKIP_BLANKS;
2419 if ((CUR == '"') || (CUR == '\'')) {
2420 URI = htmlParseSystemLiteral(ctxt);
2421 }
2422 }
2423 return(URI);
2424}
2425
2426/**
2427 * htmlParseComment:
2428 * @ctxt: an HTML parser context
2429 *
2430 * Parse an XML (SGML) comment <!-- .... -->
2431 *
2432 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2433 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002434static void
Owen Taylor3473f882001-02-23 17:55:21 +00002435htmlParseComment(htmlParserCtxtPtr ctxt) {
2436 xmlChar *buf = NULL;
2437 int len;
2438 int size = HTML_PARSER_BUFFER_SIZE;
2439 int q, ql;
2440 int r, rl;
2441 int cur, l;
2442 xmlParserInputState state;
2443
2444 /*
2445 * Check that there is a comment right here.
2446 */
2447 if ((RAW != '<') || (NXT(1) != '!') ||
2448 (NXT(2) != '-') || (NXT(3) != '-')) return;
2449
2450 state = ctxt->instate;
2451 ctxt->instate = XML_PARSER_COMMENT;
2452 SHRINK;
2453 SKIP(4);
2454 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2455 if (buf == NULL) {
2456 xmlGenericError(xmlGenericErrorContext,
2457 "malloc of %d byte failed\n", size);
2458 ctxt->instate = state;
2459 return;
2460 }
2461 q = CUR_CHAR(ql);
2462 NEXTL(ql);
2463 r = CUR_CHAR(rl);
2464 NEXTL(rl);
2465 cur = CUR_CHAR(l);
2466 len = 0;
2467 while (IS_CHAR(cur) &&
2468 ((cur != '>') ||
2469 (r != '-') || (q != '-'))) {
2470 if (len + 5 >= size) {
2471 size *= 2;
2472 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2473 if (buf == NULL) {
2474 xmlGenericError(xmlGenericErrorContext,
2475 "realloc of %d byte failed\n", size);
2476 ctxt->instate = state;
2477 return;
2478 }
2479 }
2480 COPY_BUF(ql,buf,len,q);
2481 q = r;
2482 ql = rl;
2483 r = cur;
2484 rl = l;
2485 NEXTL(l);
2486 cur = CUR_CHAR(l);
2487 if (cur == 0) {
2488 SHRINK;
2489 GROW;
2490 cur = CUR_CHAR(l);
2491 }
2492 }
2493 buf[len] = 0;
2494 if (!IS_CHAR(cur)) {
2495 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2496 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2497 ctxt->sax->error(ctxt->userData,
2498 "Comment not terminated \n<!--%.50s\n", buf);
2499 ctxt->wellFormed = 0;
2500 xmlFree(buf);
2501 } else {
2502 NEXT;
2503 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2504 (!ctxt->disableSAX))
2505 ctxt->sax->comment(ctxt->userData, buf);
2506 xmlFree(buf);
2507 }
2508 ctxt->instate = state;
2509}
2510
2511/**
2512 * htmlParseCharRef:
2513 * @ctxt: an HTML parser context
2514 *
2515 * parse Reference declarations
2516 *
2517 * [66] CharRef ::= '&#' [0-9]+ ';' |
2518 * '&#x' [0-9a-fA-F]+ ';'
2519 *
2520 * Returns the value parsed (as an int)
2521 */
2522int
2523htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2524 int val = 0;
2525
2526 if ((CUR == '&') && (NXT(1) == '#') &&
2527 (NXT(2) == 'x')) {
2528 SKIP(3);
2529 while (CUR != ';') {
2530 if ((CUR >= '0') && (CUR <= '9'))
2531 val = val * 16 + (CUR - '0');
2532 else if ((CUR >= 'a') && (CUR <= 'f'))
2533 val = val * 16 + (CUR - 'a') + 10;
2534 else if ((CUR >= 'A') && (CUR <= 'F'))
2535 val = val * 16 + (CUR - 'A') + 10;
2536 else {
2537 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2538 ctxt->sax->error(ctxt->userData,
2539 "htmlParseCharRef: invalid hexadecimal value\n");
2540 ctxt->wellFormed = 0;
2541 return(0);
2542 }
2543 NEXT;
2544 }
2545 if (CUR == ';')
2546 NEXT;
2547 } else if ((CUR == '&') && (NXT(1) == '#')) {
2548 SKIP(2);
2549 while (CUR != ';') {
2550 if ((CUR >= '0') && (CUR <= '9'))
2551 val = val * 10 + (CUR - '0');
2552 else {
2553 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2554 ctxt->sax->error(ctxt->userData,
2555 "htmlParseCharRef: invalid decimal value\n");
2556 ctxt->wellFormed = 0;
2557 return(0);
2558 }
2559 NEXT;
2560 }
2561 if (CUR == ';')
2562 NEXT;
2563 } else {
2564 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2565 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2566 ctxt->wellFormed = 0;
2567 }
2568 /*
2569 * Check the value IS_CHAR ...
2570 */
2571 if (IS_CHAR(val)) {
2572 return(val);
2573 } else {
2574 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2575 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2576 val);
2577 ctxt->wellFormed = 0;
2578 }
2579 return(0);
2580}
2581
2582
2583/**
2584 * htmlParseDocTypeDecl :
2585 * @ctxt: an HTML parser context
2586 *
2587 * parse a DOCTYPE declaration
2588 *
2589 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2590 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2591 */
2592
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002593static void
Owen Taylor3473f882001-02-23 17:55:21 +00002594htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2595 xmlChar *name;
2596 xmlChar *ExternalID = NULL;
2597 xmlChar *URI = NULL;
2598
2599 /*
2600 * We know that '<!DOCTYPE' has been detected.
2601 */
2602 SKIP(9);
2603
2604 SKIP_BLANKS;
2605
2606 /*
2607 * Parse the DOCTYPE name.
2608 */
2609 name = htmlParseName(ctxt);
2610 if (name == NULL) {
2611 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2612 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2613 ctxt->wellFormed = 0;
2614 }
2615 /*
2616 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2617 */
2618
2619 SKIP_BLANKS;
2620
2621 /*
2622 * Check for SystemID and ExternalID
2623 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002624 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002625 SKIP_BLANKS;
2626
2627 /*
2628 * We should be at the end of the DOCTYPE declaration.
2629 */
2630 if (CUR != '>') {
2631 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2632 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2633 ctxt->wellFormed = 0;
2634 /* We shouldn't try to resynchronize ... */
2635 }
2636 NEXT;
2637
2638 /*
2639 * Create or update the document accordingly to the DOCTYPE
2640 */
2641 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2642 (!ctxt->disableSAX))
2643 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2644
2645 /*
2646 * Cleanup, since we don't use all those identifiers
2647 */
2648 if (URI != NULL) xmlFree(URI);
2649 if (ExternalID != NULL) xmlFree(ExternalID);
2650 if (name != NULL) xmlFree(name);
2651}
2652
2653/**
2654 * htmlParseAttribute:
2655 * @ctxt: an HTML parser context
2656 * @value: a xmlChar ** used to store the value of the attribute
2657 *
2658 * parse an attribute
2659 *
2660 * [41] Attribute ::= Name Eq AttValue
2661 *
2662 * [25] Eq ::= S? '=' S?
2663 *
2664 * With namespace:
2665 *
2666 * [NS 11] Attribute ::= QName Eq AttValue
2667 *
2668 * Also the case QName == xmlns:??? is handled independently as a namespace
2669 * definition.
2670 *
2671 * Returns the attribute name, and the value in *value.
2672 */
2673
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002674static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002675htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2676 xmlChar *name, *val = NULL;
2677
2678 *value = NULL;
2679 name = htmlParseHTMLName(ctxt);
2680 if (name == NULL) {
2681 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2682 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2683 ctxt->wellFormed = 0;
2684 return(NULL);
2685 }
2686
2687 /*
2688 * read the value
2689 */
2690 SKIP_BLANKS;
2691 if (CUR == '=') {
2692 NEXT;
2693 SKIP_BLANKS;
2694 val = htmlParseAttValue(ctxt);
2695 /******
2696 } else {
2697 * TODO : some attribute must have values, some may not
2698 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2699 ctxt->sax->warning(ctxt->userData,
2700 "No value for attribute %s\n", name); */
2701 }
2702
2703 *value = val;
2704 return(name);
2705}
2706
2707/**
2708 * htmlCheckEncoding:
2709 * @ctxt: an HTML parser context
2710 * @attvalue: the attribute value
2711 *
2712 * Checks an http-equiv attribute from a Meta tag to detect
2713 * the encoding
2714 * If a new encoding is detected the parser is switched to decode
2715 * it and pass UTF8
2716 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002717static void
Owen Taylor3473f882001-02-23 17:55:21 +00002718htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2719 const xmlChar *encoding;
2720
2721 if ((ctxt == NULL) || (attvalue == NULL))
2722 return;
2723
2724 /* do not change encoding */
2725 if (ctxt->input->encoding != NULL)
2726 return;
2727
2728 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2729 if (encoding != NULL) {
2730 encoding += 8;
2731 } else {
2732 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2733 if (encoding != NULL)
2734 encoding += 9;
2735 }
2736 if (encoding != NULL) {
2737 xmlCharEncoding enc;
2738 xmlCharEncodingHandlerPtr handler;
2739
2740 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2741
2742 if (ctxt->input->encoding != NULL)
2743 xmlFree((xmlChar *) ctxt->input->encoding);
2744 ctxt->input->encoding = xmlStrdup(encoding);
2745
2746 enc = xmlParseCharEncoding((const char *) encoding);
2747 /*
2748 * registered set of known encodings
2749 */
2750 if (enc != XML_CHAR_ENCODING_ERROR) {
2751 xmlSwitchEncoding(ctxt, enc);
2752 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2753 } else {
2754 /*
2755 * fallback for unknown encodings
2756 */
2757 handler = xmlFindCharEncodingHandler((const char *) encoding);
2758 if (handler != NULL) {
2759 xmlSwitchToEncoding(ctxt, handler);
2760 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2761 } else {
2762 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2763 }
2764 }
2765
2766 if ((ctxt->input->buf != NULL) &&
2767 (ctxt->input->buf->encoder != NULL) &&
2768 (ctxt->input->buf->raw != NULL) &&
2769 (ctxt->input->buf->buffer != NULL)) {
2770 int nbchars;
2771 int processed;
2772
2773 /*
2774 * convert as much as possible to the parser reading buffer.
2775 */
2776 processed = ctxt->input->cur - ctxt->input->base;
2777 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2778 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2779 ctxt->input->buf->buffer,
2780 ctxt->input->buf->raw);
2781 if (nbchars < 0) {
2782 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2783 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2784 ctxt->sax->error(ctxt->userData,
2785 "htmlCheckEncoding: encoder error\n");
2786 }
2787 ctxt->input->base =
2788 ctxt->input->cur = ctxt->input->buf->buffer->content;
2789 }
2790 }
2791}
2792
2793/**
2794 * htmlCheckMeta:
2795 * @ctxt: an HTML parser context
2796 * @atts: the attributes values
2797 *
2798 * Checks an attributes from a Meta tag
2799 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002800static void
Owen Taylor3473f882001-02-23 17:55:21 +00002801htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2802 int i;
2803 const xmlChar *att, *value;
2804 int http = 0;
2805 const xmlChar *content = NULL;
2806
2807 if ((ctxt == NULL) || (atts == NULL))
2808 return;
2809
2810 i = 0;
2811 att = atts[i++];
2812 while (att != NULL) {
2813 value = atts[i++];
2814 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2815 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2816 http = 1;
2817 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2818 content = value;
2819 att = atts[i++];
2820 }
2821 if ((http) && (content != NULL))
2822 htmlCheckEncoding(ctxt, content);
2823
2824}
2825
2826/**
2827 * htmlParseStartTag:
2828 * @ctxt: an HTML parser context
2829 *
2830 * parse a start of tag either for rule element or
2831 * EmptyElement. In both case we don't parse the tag closing chars.
2832 *
2833 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2834 *
2835 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2836 *
2837 * With namespace:
2838 *
2839 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2840 *
2841 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2842 *
2843 */
2844
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002845static void
Owen Taylor3473f882001-02-23 17:55:21 +00002846htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2847 xmlChar *name;
2848 xmlChar *attname;
2849 xmlChar *attvalue;
2850 const xmlChar **atts = NULL;
2851 int nbatts = 0;
2852 int maxatts = 0;
2853 int meta = 0;
2854 int i;
2855
2856 if (CUR != '<') return;
2857 NEXT;
2858
2859 GROW;
2860 name = htmlParseHTMLName(ctxt);
2861 if (name == NULL) {
2862 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2863 ctxt->sax->error(ctxt->userData,
2864 "htmlParseStartTag: invalid element name\n");
2865 ctxt->wellFormed = 0;
2866 /* Dump the bogus tag like browsers do */
2867 while ((IS_CHAR(CUR)) && (CUR != '>'))
2868 NEXT;
2869 return;
2870 }
2871 if (xmlStrEqual(name, BAD_CAST"meta"))
2872 meta = 1;
2873
2874 /*
2875 * Check for auto-closure of HTML elements.
2876 */
2877 htmlAutoClose(ctxt, name);
2878
2879 /*
2880 * Check for implied HTML elements.
2881 */
2882 htmlCheckImplied(ctxt, name);
2883
2884 /*
2885 * Avoid html at any level > 0, head at any level != 1
2886 * or any attempt to recurse body
2887 */
2888 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2889 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2890 ctxt->sax->error(ctxt->userData,
2891 "htmlParseStartTag: misplaced <html> tag\n");
2892 ctxt->wellFormed = 0;
2893 xmlFree(name);
2894 return;
2895 }
2896 if ((ctxt->nameNr != 1) &&
2897 (xmlStrEqual(name, BAD_CAST"head"))) {
2898 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2899 ctxt->sax->error(ctxt->userData,
2900 "htmlParseStartTag: misplaced <head> tag\n");
2901 ctxt->wellFormed = 0;
2902 xmlFree(name);
2903 return;
2904 }
2905 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002906 int indx;
2907 for (indx = 0;indx < ctxt->nameNr;indx++) {
2908 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002909 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2910 ctxt->sax->error(ctxt->userData,
2911 "htmlParseStartTag: misplaced <body> tag\n");
2912 ctxt->wellFormed = 0;
2913 xmlFree(name);
2914 return;
2915 }
2916 }
2917 }
2918
2919 /*
2920 * Now parse the attributes, it ends up with the ending
2921 *
2922 * (S Attribute)* S?
2923 */
2924 SKIP_BLANKS;
2925 while ((IS_CHAR(CUR)) &&
2926 (CUR != '>') &&
2927 ((CUR != '/') || (NXT(1) != '>'))) {
2928 long cons = ctxt->nbChars;
2929
2930 GROW;
2931 attname = htmlParseAttribute(ctxt, &attvalue);
2932 if (attname != NULL) {
2933
2934 /*
2935 * Well formedness requires at most one declaration of an attribute
2936 */
2937 for (i = 0; i < nbatts;i += 2) {
2938 if (xmlStrEqual(atts[i], attname)) {
2939 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2940 ctxt->sax->error(ctxt->userData,
2941 "Attribute %s redefined\n",
2942 attname);
2943 ctxt->wellFormed = 0;
2944 xmlFree(attname);
2945 if (attvalue != NULL)
2946 xmlFree(attvalue);
2947 goto failed;
2948 }
2949 }
2950
2951 /*
2952 * Add the pair to atts
2953 */
2954 if (atts == NULL) {
2955 maxatts = 10;
2956 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2957 if (atts == NULL) {
2958 xmlGenericError(xmlGenericErrorContext,
2959 "malloc of %ld byte failed\n",
2960 maxatts * (long)sizeof(xmlChar *));
2961 if (name != NULL) xmlFree(name);
2962 return;
2963 }
2964 } else if (nbatts + 4 > maxatts) {
2965 maxatts *= 2;
2966 atts = (const xmlChar **) xmlRealloc((void *) atts,
2967 maxatts * sizeof(xmlChar *));
2968 if (atts == NULL) {
2969 xmlGenericError(xmlGenericErrorContext,
2970 "realloc of %ld byte failed\n",
2971 maxatts * (long)sizeof(xmlChar *));
2972 if (name != NULL) xmlFree(name);
2973 return;
2974 }
2975 }
2976 atts[nbatts++] = attname;
2977 atts[nbatts++] = attvalue;
2978 atts[nbatts] = NULL;
2979 atts[nbatts + 1] = NULL;
2980 }
2981 else {
2982 /* Dump the bogus attribute string up to the next blank or
2983 * the end of the tag. */
2984 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
2985 && ((CUR != '/') || (NXT(1) != '>')))
2986 NEXT;
2987 }
2988
2989failed:
2990 SKIP_BLANKS;
2991 if (cons == ctxt->nbChars) {
2992 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2993 ctxt->sax->error(ctxt->userData,
2994 "htmlParseStartTag: problem parsing attributes\n");
2995 ctxt->wellFormed = 0;
2996 break;
2997 }
2998 }
2999
3000 /*
3001 * Handle specific association to the META tag
3002 */
3003 if (meta)
3004 htmlCheckMeta(ctxt, atts);
3005
3006 /*
3007 * SAX: Start of Element !
3008 */
3009 htmlnamePush(ctxt, xmlStrdup(name));
3010#ifdef DEBUG
3011 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3012#endif
3013 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3014 ctxt->sax->startElement(ctxt->userData, name, atts);
3015
3016 if (atts != NULL) {
3017 for (i = 0;i < nbatts;i++) {
3018 if (atts[i] != NULL)
3019 xmlFree((xmlChar *) atts[i]);
3020 }
3021 xmlFree((void *) atts);
3022 }
3023 if (name != NULL) xmlFree(name);
3024}
3025
3026/**
3027 * htmlParseEndTag:
3028 * @ctxt: an HTML parser context
3029 *
3030 * parse an end of tag
3031 *
3032 * [42] ETag ::= '</' Name S? '>'
3033 *
3034 * With namespace
3035 *
3036 * [NS 9] ETag ::= '</' QName S? '>'
3037 */
3038
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003039static void
Owen Taylor3473f882001-02-23 17:55:21 +00003040htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3041 xmlChar *name;
3042 xmlChar *oldname;
3043 int i;
3044
3045 if ((CUR != '<') || (NXT(1) != '/')) {
3046 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3047 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3048 ctxt->wellFormed = 0;
3049 return;
3050 }
3051 SKIP(2);
3052
3053 name = htmlParseHTMLName(ctxt);
3054 if (name == NULL) return;
3055
3056 /*
3057 * We should definitely be at the ending "S? '>'" part
3058 */
3059 SKIP_BLANKS;
3060 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3061 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3062 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3063 ctxt->wellFormed = 0;
3064 } else
3065 NEXT;
3066
3067 /*
3068 * If the name read is not one of the element in the parsing stack
3069 * then return, it's just an error.
3070 */
3071 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3072 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3073 }
3074 if (i < 0) {
3075 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3076 ctxt->sax->error(ctxt->userData,
3077 "Unexpected end tag : %s\n", name);
3078 xmlFree(name);
3079 ctxt->wellFormed = 0;
3080 return;
3081 }
3082
3083
3084 /*
3085 * Check for auto-closure of HTML elements.
3086 */
3087
3088 htmlAutoCloseOnClose(ctxt, name);
3089
3090 /*
3091 * Well formedness constraints, opening and closing must match.
3092 * With the exception that the autoclose may have popped stuff out
3093 * of the stack.
3094 */
3095 if (!xmlStrEqual(name, ctxt->name)) {
3096#ifdef DEBUG
3097 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3098#endif
3099 if ((ctxt->name != NULL) &&
3100 (!xmlStrEqual(ctxt->name, name))) {
3101 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3102 ctxt->sax->error(ctxt->userData,
3103 "Opening and ending tag mismatch: %s and %s\n",
3104 name, ctxt->name);
3105 ctxt->wellFormed = 0;
3106 }
3107 }
3108
3109 /*
3110 * SAX: End of Tag
3111 */
3112 oldname = ctxt->name;
3113 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3114 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3115 ctxt->sax->endElement(ctxt->userData, name);
3116 oldname = htmlnamePop(ctxt);
3117 if (oldname != NULL) {
3118#ifdef DEBUG
3119 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3120#endif
3121 xmlFree(oldname);
3122#ifdef DEBUG
3123 } else {
3124 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3125#endif
3126 }
3127 }
3128
3129 if (name != NULL)
3130 xmlFree(name);
3131
3132 return;
3133}
3134
3135
3136/**
3137 * htmlParseReference:
3138 * @ctxt: an HTML parser context
3139 *
3140 * parse and handle entity references in content,
3141 * this will end-up in a call to character() since this is either a
3142 * CharRef, or a predefined entity.
3143 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003144static void
Owen Taylor3473f882001-02-23 17:55:21 +00003145htmlParseReference(htmlParserCtxtPtr ctxt) {
3146 htmlEntityDescPtr ent;
3147 xmlChar out[6];
3148 xmlChar *name;
3149 if (CUR != '&') return;
3150
3151 if (NXT(1) == '#') {
3152 unsigned int c;
3153 int bits, i = 0;
3154
3155 c = htmlParseCharRef(ctxt);
3156 if (c == 0)
3157 return;
3158
3159 if (c < 0x80) { out[i++]= c; bits= -6; }
3160 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3161 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3162 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3163
3164 for ( ; bits >= 0; bits-= 6) {
3165 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3166 }
3167 out[i] = 0;
3168
3169 htmlCheckParagraph(ctxt);
3170 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3171 ctxt->sax->characters(ctxt->userData, out, i);
3172 } else {
3173 ent = htmlParseEntityRef(ctxt, &name);
3174 if (name == NULL) {
3175 htmlCheckParagraph(ctxt);
3176 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3177 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3178 return;
3179 }
3180 if ((ent == NULL) || (ent->value <= 0)) {
3181 htmlCheckParagraph(ctxt);
3182 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3183 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3184 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3185 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3186 }
3187 } else {
3188 unsigned int c;
3189 int bits, i = 0;
3190
3191 c = ent->value;
3192 if (c < 0x80)
3193 { out[i++]= c; bits= -6; }
3194 else if (c < 0x800)
3195 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3196 else if (c < 0x10000)
3197 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3198 else
3199 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3200
3201 for ( ; bits >= 0; bits-= 6) {
3202 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3203 }
3204 out[i] = 0;
3205
3206 htmlCheckParagraph(ctxt);
3207 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3208 ctxt->sax->characters(ctxt->userData, out, i);
3209 }
3210 xmlFree(name);
3211 }
3212}
3213
3214/**
3215 * htmlParseContent:
3216 * @ctxt: an HTML parser context
3217 * @name: the node name
3218 *
3219 * Parse a content: comment, sub-element, reference or text.
3220 *
3221 */
3222
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003223static void
Owen Taylor3473f882001-02-23 17:55:21 +00003224htmlParseContent(htmlParserCtxtPtr ctxt) {
3225 xmlChar *currentNode;
3226 int depth;
3227
3228 currentNode = xmlStrdup(ctxt->name);
3229 depth = ctxt->nameNr;
3230 while (1) {
3231 long cons = ctxt->nbChars;
3232
3233 GROW;
3234 /*
3235 * Our tag or one of it's parent or children is ending.
3236 */
3237 if ((CUR == '<') && (NXT(1) == '/')) {
3238 htmlParseEndTag(ctxt);
3239 if (currentNode != NULL) xmlFree(currentNode);
3240 return;
3241 }
3242
3243 /*
3244 * Has this node been popped out during parsing of
3245 * the next element
3246 */
3247 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
3248 (depth >= ctxt->nameNr)) {
3249 if (currentNode != NULL) xmlFree(currentNode);
3250 return;
3251 }
3252
Daniel Veillardf9533d12001-03-03 10:04:57 +00003253 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3254 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003255 /*
3256 * Handle SCRIPT/STYLE separately
3257 */
3258 htmlParseScript(ctxt);
3259 } else {
3260 /*
3261 * Sometimes DOCTYPE arrives in the middle of the document
3262 */
3263 if ((CUR == '<') && (NXT(1) == '!') &&
3264 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3265 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3266 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3267 (UPP(8) == 'E')) {
3268 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3269 ctxt->sax->error(ctxt->userData,
3270 "Misplaced DOCTYPE declaration\n");
3271 ctxt->wellFormed = 0;
3272 htmlParseDocTypeDecl(ctxt);
3273 }
3274
3275 /*
3276 * First case : a comment
3277 */
3278 if ((CUR == '<') && (NXT(1) == '!') &&
3279 (NXT(2) == '-') && (NXT(3) == '-')) {
3280 htmlParseComment(ctxt);
3281 }
3282
3283 /*
3284 * Second case : a sub-element.
3285 */
3286 else if (CUR == '<') {
3287 htmlParseElement(ctxt);
3288 }
3289
3290 /*
3291 * Third case : a reference. If if has not been resolved,
3292 * parsing returns it's Name, create the node
3293 */
3294 else if (CUR == '&') {
3295 htmlParseReference(ctxt);
3296 }
3297
3298 /*
3299 * Fourth : end of the resource
3300 */
3301 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003302 htmlAutoCloseOnEnd(ctxt);
3303 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003304 }
3305
3306 /*
3307 * Last case, text. Note that References are handled directly.
3308 */
3309 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003310 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003311 }
3312
3313 if (cons == ctxt->nbChars) {
3314 if (ctxt->node != NULL) {
3315 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3316 ctxt->sax->error(ctxt->userData,
3317 "detected an error in element content\n");
3318 ctxt->wellFormed = 0;
3319 }
3320 break;
3321 }
3322 }
3323 GROW;
3324 }
3325 if (currentNode != NULL) xmlFree(currentNode);
3326}
3327
3328/**
3329 * htmlParseElement:
3330 * @ctxt: an HTML parser context
3331 *
3332 * parse an HTML element, this is highly recursive
3333 *
3334 * [39] element ::= EmptyElemTag | STag content ETag
3335 *
3336 * [41] Attribute ::= Name Eq AttValue
3337 */
3338
3339void
3340htmlParseElement(htmlParserCtxtPtr ctxt) {
3341 xmlChar *name;
3342 xmlChar *currentNode = NULL;
3343 htmlElemDescPtr info;
3344 htmlParserNodeInfo node_info;
3345 xmlChar *oldname;
3346 int depth = ctxt->nameNr;
3347
3348 /* Capture start position */
3349 if (ctxt->record_info) {
3350 node_info.begin_pos = ctxt->input->consumed +
3351 (CUR_PTR - ctxt->input->base);
3352 node_info.begin_line = ctxt->input->line;
3353 }
3354
3355 oldname = xmlStrdup(ctxt->name);
3356 htmlParseStartTag(ctxt);
3357 name = ctxt->name;
3358#ifdef DEBUG
3359 if (oldname == NULL)
3360 xmlGenericError(xmlGenericErrorContext,
3361 "Start of element %s\n", name);
3362 else if (name == NULL)
3363 xmlGenericError(xmlGenericErrorContext,
3364 "Start of element failed, was %s\n", oldname);
3365 else
3366 xmlGenericError(xmlGenericErrorContext,
3367 "Start of element %s, was %s\n", name, oldname);
3368#endif
3369 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3370 (name == NULL)) {
3371 if (CUR == '>')
3372 NEXT;
3373 if (oldname != NULL)
3374 xmlFree(oldname);
3375 return;
3376 }
3377 if (oldname != NULL)
3378 xmlFree(oldname);
3379
3380 /*
3381 * Lookup the info for that element.
3382 */
3383 info = htmlTagLookup(name);
3384 if (info == NULL) {
3385 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3386 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3387 name);
3388 ctxt->wellFormed = 0;
3389 } else if (info->depr) {
3390/***************************
3391 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3392 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3393 name);
3394 ***************************/
3395 }
3396
3397 /*
3398 * Check for an Empty Element labelled the XML/SGML way
3399 */
3400 if ((CUR == '/') && (NXT(1) == '>')) {
3401 SKIP(2);
3402 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3403 ctxt->sax->endElement(ctxt->userData, name);
3404 oldname = htmlnamePop(ctxt);
3405#ifdef DEBUG
3406 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3407#endif
3408 if (oldname != NULL)
3409 xmlFree(oldname);
3410 return;
3411 }
3412
3413 if (CUR == '>') {
3414 NEXT;
3415 } else {
3416 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3417 ctxt->sax->error(ctxt->userData,
3418 "Couldn't find end of Start Tag %s\n",
3419 name);
3420 ctxt->wellFormed = 0;
3421
3422 /*
3423 * end of parsing of this node.
3424 */
3425 if (xmlStrEqual(name, ctxt->name)) {
3426 nodePop(ctxt);
3427 oldname = htmlnamePop(ctxt);
3428#ifdef DEBUG
3429 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3430#endif
3431 if (oldname != NULL)
3432 xmlFree(oldname);
3433 }
3434
3435 /*
3436 * Capture end position and add node
3437 */
3438 if ( currentNode != NULL && ctxt->record_info ) {
3439 node_info.end_pos = ctxt->input->consumed +
3440 (CUR_PTR - ctxt->input->base);
3441 node_info.end_line = ctxt->input->line;
3442 node_info.node = ctxt->node;
3443 xmlParserAddNodeInfo(ctxt, &node_info);
3444 }
3445 return;
3446 }
3447
3448 /*
3449 * Check for an Empty Element from DTD definition
3450 */
3451 if ((info != NULL) && (info->empty)) {
3452 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3453 ctxt->sax->endElement(ctxt->userData, name);
3454 oldname = htmlnamePop(ctxt);
3455#ifdef DEBUG
3456 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3457#endif
3458 if (oldname != NULL)
3459 xmlFree(oldname);
3460 return;
3461 }
3462
3463 /*
3464 * Parse the content of the element:
3465 */
3466 currentNode = xmlStrdup(ctxt->name);
3467 depth = ctxt->nameNr;
3468 while (IS_CHAR(CUR)) {
3469 htmlParseContent(ctxt);
3470 if (ctxt->nameNr < depth) break;
3471 }
3472
Owen Taylor3473f882001-02-23 17:55:21 +00003473 /*
3474 * Capture end position and add node
3475 */
3476 if ( currentNode != NULL && ctxt->record_info ) {
3477 node_info.end_pos = ctxt->input->consumed +
3478 (CUR_PTR - ctxt->input->base);
3479 node_info.end_line = ctxt->input->line;
3480 node_info.node = ctxt->node;
3481 xmlParserAddNodeInfo(ctxt, &node_info);
3482 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003483 if (!IS_CHAR(CUR)) {
3484 htmlAutoCloseOnEnd(ctxt);
3485 }
3486
Owen Taylor3473f882001-02-23 17:55:21 +00003487 if (currentNode != NULL)
3488 xmlFree(currentNode);
3489}
3490
3491/**
3492 * htmlParseDocument :
3493 * @ctxt: an HTML parser context
3494 *
3495 * parse an HTML document (and build a tree if using the standard SAX
3496 * interface).
3497 *
3498 * Returns 0, -1 in case of error. the parser context is augmented
3499 * as a result of the parsing.
3500 */
3501
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003502static int
Owen Taylor3473f882001-02-23 17:55:21 +00003503htmlParseDocument(htmlParserCtxtPtr ctxt) {
3504 xmlDtdPtr dtd;
3505
3506 htmlDefaultSAXHandlerInit();
3507 ctxt->html = 1;
3508
3509 GROW;
3510 /*
3511 * SAX: beginning of the document processing.
3512 */
3513 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3514 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3515
3516 /*
3517 * Wipe out everything which is before the first '<'
3518 */
3519 SKIP_BLANKS;
3520 if (CUR == 0) {
3521 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3522 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3523 ctxt->wellFormed = 0;
3524 }
3525
3526 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3527 ctxt->sax->startDocument(ctxt->userData);
3528
3529
3530 /*
3531 * Parse possible comments before any content
3532 */
3533 while ((CUR == '<') && (NXT(1) == '!') &&
3534 (NXT(2) == '-') && (NXT(3) == '-')) {
3535 htmlParseComment(ctxt);
3536 SKIP_BLANKS;
3537 }
3538
3539
3540 /*
3541 * Then possibly doc type declaration(s) and more Misc
3542 * (doctypedecl Misc*)?
3543 */
3544 if ((CUR == '<') && (NXT(1) == '!') &&
3545 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3546 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3547 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3548 (UPP(8) == 'E')) {
3549 htmlParseDocTypeDecl(ctxt);
3550 }
3551 SKIP_BLANKS;
3552
3553 /*
3554 * Parse possible comments before any content
3555 */
3556 while ((CUR == '<') && (NXT(1) == '!') &&
3557 (NXT(2) == '-') && (NXT(3) == '-')) {
3558 htmlParseComment(ctxt);
3559 SKIP_BLANKS;
3560 }
3561
3562 /*
3563 * Time to start parsing the tree itself
3564 */
3565 htmlParseContent(ctxt);
3566
3567 /*
3568 * autoclose
3569 */
3570 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003571 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003572
3573
3574 /*
3575 * SAX: end of the document processing.
3576 */
3577 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3578 ctxt->sax->endDocument(ctxt->userData);
3579
3580 if (ctxt->myDoc != NULL) {
3581 dtd = xmlGetIntSubset(ctxt->myDoc);
3582 if (dtd == NULL)
3583 ctxt->myDoc->intSubset =
3584 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3585 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3586 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3587 }
3588 if (! ctxt->wellFormed) return(-1);
3589 return(0);
3590}
3591
3592
3593/************************************************************************
3594 * *
3595 * Parser contexts handling *
3596 * *
3597 ************************************************************************/
3598
3599/**
3600 * xmlInitParserCtxt:
3601 * @ctxt: an HTML parser context
3602 *
3603 * Initialize a parser context
3604 */
3605
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003606static void
Owen Taylor3473f882001-02-23 17:55:21 +00003607htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3608{
3609 htmlSAXHandler *sax;
3610
3611 if (ctxt == NULL) return;
3612 memset(ctxt, 0, sizeof(htmlParserCtxt));
3613
3614 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3615 if (sax == NULL) {
3616 xmlGenericError(xmlGenericErrorContext,
3617 "htmlInitParserCtxt: out of memory\n");
3618 }
3619 else
3620 memset(sax, 0, sizeof(htmlSAXHandler));
3621
3622 /* Allocate the Input stack */
3623 ctxt->inputTab = (htmlParserInputPtr *)
3624 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3625 if (ctxt->inputTab == NULL) {
3626 xmlGenericError(xmlGenericErrorContext,
3627 "htmlInitParserCtxt: out of memory\n");
3628 ctxt->inputNr = 0;
3629 ctxt->inputMax = 0;
3630 ctxt->input = NULL;
3631 return;
3632 }
3633 ctxt->inputNr = 0;
3634 ctxt->inputMax = 5;
3635 ctxt->input = NULL;
3636 ctxt->version = NULL;
3637 ctxt->encoding = NULL;
3638 ctxt->standalone = -1;
3639 ctxt->instate = XML_PARSER_START;
3640
3641 /* Allocate the Node stack */
3642 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3643 if (ctxt->nodeTab == NULL) {
3644 xmlGenericError(xmlGenericErrorContext,
3645 "htmlInitParserCtxt: out of memory\n");
3646 ctxt->nodeNr = 0;
3647 ctxt->nodeMax = 0;
3648 ctxt->node = NULL;
3649 ctxt->inputNr = 0;
3650 ctxt->inputMax = 0;
3651 ctxt->input = NULL;
3652 return;
3653 }
3654 ctxt->nodeNr = 0;
3655 ctxt->nodeMax = 10;
3656 ctxt->node = NULL;
3657
3658 /* Allocate the Name stack */
3659 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3660 if (ctxt->nameTab == NULL) {
3661 xmlGenericError(xmlGenericErrorContext,
3662 "htmlInitParserCtxt: out of memory\n");
3663 ctxt->nameNr = 0;
3664 ctxt->nameMax = 10;
3665 ctxt->name = NULL;
3666 ctxt->nodeNr = 0;
3667 ctxt->nodeMax = 0;
3668 ctxt->node = NULL;
3669 ctxt->inputNr = 0;
3670 ctxt->inputMax = 0;
3671 ctxt->input = NULL;
3672 return;
3673 }
3674 ctxt->nameNr = 0;
3675 ctxt->nameMax = 10;
3676 ctxt->name = NULL;
3677
3678 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3679 else {
3680 ctxt->sax = sax;
3681 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3682 }
3683 ctxt->userData = ctxt;
3684 ctxt->myDoc = NULL;
3685 ctxt->wellFormed = 1;
3686 ctxt->replaceEntities = 0;
3687 ctxt->html = 1;
3688 ctxt->record_info = 0;
3689 ctxt->validate = 0;
3690 ctxt->nbChars = 0;
3691 ctxt->checkIndex = 0;
3692 xmlInitNodeInfoSeq(&ctxt->node_seq);
3693}
3694
3695/**
3696 * htmlFreeParserCtxt:
3697 * @ctxt: an HTML parser context
3698 *
3699 * Free all the memory used by a parser context. However the parsed
3700 * document in ctxt->myDoc is not freed.
3701 */
3702
3703void
3704htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3705{
3706 xmlFreeParserCtxt(ctxt);
3707}
3708
3709/**
3710 * htmlCreateDocParserCtxt :
3711 * @cur: a pointer to an array of xmlChar
3712 * @encoding: a free form C string describing the HTML document encoding, or NULL
3713 *
3714 * Create a parser context for an HTML document.
3715 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003716 * TODO: check the need to add encoding handling there
3717 *
Owen Taylor3473f882001-02-23 17:55:21 +00003718 * Returns the new parser context or NULL
3719 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003720static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003721htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003722 htmlParserCtxtPtr ctxt;
3723 htmlParserInputPtr input;
3724 /* htmlCharEncoding enc; */
3725
3726 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3727 if (ctxt == NULL) {
3728 perror("malloc");
3729 return(NULL);
3730 }
3731 htmlInitParserCtxt(ctxt);
3732 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3733 if (input == NULL) {
3734 perror("malloc");
3735 xmlFree(ctxt);
3736 return(NULL);
3737 }
3738 memset(input, 0, sizeof(htmlParserInput));
3739
3740 input->line = 1;
3741 input->col = 1;
3742 input->base = cur;
3743 input->cur = cur;
3744
3745 inputPush(ctxt, input);
3746 return(ctxt);
3747}
3748
3749/************************************************************************
3750 * *
3751 * Progressive parsing interfaces *
3752 * *
3753 ************************************************************************/
3754
3755/**
3756 * htmlParseLookupSequence:
3757 * @ctxt: an HTML parser context
3758 * @first: the first char to lookup
3759 * @next: the next char to lookup or zero
3760 * @third: the next char to lookup or zero
3761 *
3762 * Try to find if a sequence (first, next, third) or just (first next) or
3763 * (first) is available in the input stream.
3764 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3765 * to avoid rescanning sequences of bytes, it DOES change the state of the
3766 * parser, do not use liberally.
3767 * This is basically similar to xmlParseLookupSequence()
3768 *
3769 * Returns the index to the current parsing point if the full sequence
3770 * is available, -1 otherwise.
3771 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003772static int
Owen Taylor3473f882001-02-23 17:55:21 +00003773htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3774 xmlChar next, xmlChar third) {
3775 int base, len;
3776 htmlParserInputPtr in;
3777 const xmlChar *buf;
3778
3779 in = ctxt->input;
3780 if (in == NULL) return(-1);
3781 base = in->cur - in->base;
3782 if (base < 0) return(-1);
3783 if (ctxt->checkIndex > base)
3784 base = ctxt->checkIndex;
3785 if (in->buf == NULL) {
3786 buf = in->base;
3787 len = in->length;
3788 } else {
3789 buf = in->buf->buffer->content;
3790 len = in->buf->buffer->use;
3791 }
3792 /* take into account the sequence length */
3793 if (third) len -= 2;
3794 else if (next) len --;
3795 for (;base < len;base++) {
3796 if (buf[base] == first) {
3797 if (third != 0) {
3798 if ((buf[base + 1] != next) ||
3799 (buf[base + 2] != third)) continue;
3800 } else if (next != 0) {
3801 if (buf[base + 1] != next) continue;
3802 }
3803 ctxt->checkIndex = 0;
3804#ifdef DEBUG_PUSH
3805 if (next == 0)
3806 xmlGenericError(xmlGenericErrorContext,
3807 "HPP: lookup '%c' found at %d\n",
3808 first, base);
3809 else if (third == 0)
3810 xmlGenericError(xmlGenericErrorContext,
3811 "HPP: lookup '%c%c' found at %d\n",
3812 first, next, base);
3813 else
3814 xmlGenericError(xmlGenericErrorContext,
3815 "HPP: lookup '%c%c%c' found at %d\n",
3816 first, next, third, base);
3817#endif
3818 return(base - (in->cur - in->base));
3819 }
3820 }
3821 ctxt->checkIndex = base;
3822#ifdef DEBUG_PUSH
3823 if (next == 0)
3824 xmlGenericError(xmlGenericErrorContext,
3825 "HPP: lookup '%c' failed\n", first);
3826 else if (third == 0)
3827 xmlGenericError(xmlGenericErrorContext,
3828 "HPP: lookup '%c%c' failed\n", first, next);
3829 else
3830 xmlGenericError(xmlGenericErrorContext,
3831 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3832#endif
3833 return(-1);
3834}
3835
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing.
 *
 * Runs the push-parser state machine driven by ctxt->instate: each loop
 * iteration recomputes the number of bytes available after the cursor and
 * dispatches on the current state.  When @terminate is zero, most states
 * first check (via htmlParseLookupSequence()) that the closing delimiter
 * of the construct is already buffered and otherwise jump to "done" to
 * wait for more data.
 *
 * Returns zero if no parsing was possible
 * (NOTE: as written, the local "ret" is never modified, so the function
 * always returns 0.)
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /* Number of bytes between the cursor and the end of the data. */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        /* Everything consumed and no more chunks will come: close the
         * still-open elements and signal the end of the document. */
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    SKIP_BLANKS;
                    /* the cursor moved, refresh the available count */
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                /* "<!DOCTYPE" compared through UPP() */
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    /* wait until the closing '>' has been received */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
                }
                /* NOTE(review): this trace is emitted on both branches
                 * above, including the one that entered PROLOG. */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering MISC\n");
#endif
                break;
            case XML_PARSER_MISC:
                /* Before any DOCTYPE: comments, a DOCTYPE, or the first
                 * start tag. */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    /* comment: wait for the terminating "-->" */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but fewer than 9 bytes buffered, too few
                     * to hold "<!DOCTYPE": wait for more data */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /* After the DOCTYPE: comments or the first start tag. */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    /* "<!" seen but fewer than 4 bytes buffered, too few
                     * to hold "<!--": wait for more data */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /* Entered from END_TAG once the element name stack is
                 * empty: blanks and comments are handled here; anything
                 * else ends the document with an error. */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK(cur)) {
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                xmlChar *name, *oldname;
                int depth = ctxt->nameNr;
                htmlElemDescPtr info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                /* wait until the whole start tag has been received */
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;

                /* Keep a copy of the current element name to detect,
                 * together with the stack depth, whether parsing the
                 * start tag actually pushed a new element. */
                oldname = xmlStrdup(ctxt->name);
                htmlParseStartTag(ctxt);
                name = ctxt->name;
#ifdef DEBUG
                if (oldname == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s\n", name);
                else if (name == NULL)
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element failed, was %s\n",
                            oldname);
                else
                    xmlGenericError(xmlGenericErrorContext,
                            "Start of element %s, was %s\n",
                            name, oldname);
#endif
                /* Same depth and same name (or no name at all): no new
                 * element was opened; step over a pending '>' and retry. */
                if (((depth == ctxt->nameNr) &&
                     (xmlStrEqual(oldname, ctxt->name))) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    if (oldname != NULL)
                        xmlFree(oldname);
                    break;
                }
                if (oldname != NULL)
                    xmlFree(oldname);

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
                                         name);
                    ctxt->wellFormed = 0;
                } else if (info->depr) {
                    /***************************
                    if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
                        ctxt->sax->warning(ctxt->userData,
                                           "Tag %s is deprecated\n",
                                           name);
                     ***************************/
                }

                /*
                 * Check for an Empty Element labelled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
                            oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData,
                                         "Couldn't find end of Start Tag %s\n",
                                         name);
                    ctxt->wellFormed = 0;

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                        xmlGenericError(xmlGenericErrorContext,
                         "End of start tag problem: popping out %s\n", oldname);
#endif
                        if (oldname != NULL)
                            xmlFree(oldname);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    oldname = htmlnamePop(ctxt);
#ifdef DEBUG
                    xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
#endif
                    if (oldname != NULL)
                        xmlFree(oldname);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                long cons;
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /* Last chunk with a single byte left: deliver it directly
                 * unless it starts markup ('<') or a reference ('&'). */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        NEXT;
                    }
                    break;
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                /* Snapshot of nbChars, compared again after the parsing
                 * step below to detect a step that changed nothing. */
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
                        goto done;
                    htmlParseScript(ctxt);
                    /* cur/next were sampled before htmlParseScript() ran */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                            goto done;
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "Misplaced DOCTYPE declaration\n");
                        ctxt->wellFormed = 0;
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        /* reference: wait for the terminating ';' */
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /* TODO Avoid the extra copy, handle directly !!!!!! */
                        /*
                         * Goal of the following test is :
                         *  - minimize calls to the SAX 'character' callback
                         *    when they are mergeable
                         */
                        if ((ctxt->inputNr == 1) &&
                            (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
                            if ((!terminate) &&
                                (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
                                goto done;
                        }
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt);
                    }
                }
                /* nbChars unchanged by the step above: report an error
                 * (when a node is open) and execute NEXT -- presumably
                 * to skip one character so the loop cannot stall. */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                            ctxt->sax->error(ctxt->userData,
                                 "detected an error in element content\n");
                        ctxt->wellFormed = 0;
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* no element left open: only the epilog may follow */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The remaining states are reported as internal errors in
             * this function; each one logs the problem and resumes in
             * CONTENT (or START_TAG for ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == CDATA\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == DTD\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == COMMENT\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == PI\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_DECL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ENTITY_VALUE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
                /* NOTE(review): the trace below says DTD although the
                 * state was just set to CONTENT. */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n");
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                xmlGenericError(xmlGenericErrorContext,
                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: internal error, state == XML_PARSER_IGNORE\n");
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
        }
    }
done:
    /* Same end-of-input handling as at the top of the loop; "avail" holds
     * the value computed most recently before jumping here. */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /* When a document tree exists and parsing is finishing, give it the
     * HTML 4.0 Transitional internal subset if it has none yet. */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}
4529
4530/**
Owen Taylor3473f882001-02-23 17:55:21 +00004531 * htmlParseChunk:
4532 * @ctxt: an XML parser context
4533 * @chunk: an char array
4534 * @size: the size in byte of the chunk
4535 * @terminate: last chunk indicator
4536 *
4537 * Parse a Chunk of memory
4538 *
4539 * Returns zero if no error, the xmlParserErrors otherwise.
4540 */
4541int
4542htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4543 int terminate) {
4544 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4545 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4546 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4547 int cur = ctxt->input->cur - ctxt->input->base;
4548
4549 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4550 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4551 ctxt->input->cur = ctxt->input->base + cur;
4552#ifdef DEBUG_PUSH
4553 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4554#endif
4555
4556 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4557 htmlParseTryOrFinish(ctxt, terminate);
4558 } else if (ctxt->instate != XML_PARSER_EOF) {
4559 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4560 htmlParseTryOrFinish(ctxt, terminate);
4561 }
4562 if (terminate) {
4563 if ((ctxt->instate != XML_PARSER_EOF) &&
4564 (ctxt->instate != XML_PARSER_EPILOG) &&
4565 (ctxt->instate != XML_PARSER_MISC)) {
4566 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004567 ctxt->wellFormed = 0;
4568 }
4569 if (ctxt->instate != XML_PARSER_EOF) {
4570 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4571 ctxt->sax->endDocument(ctxt->userData);
4572 }
4573 ctxt->instate = XML_PARSER_EOF;
4574 }
4575 return((xmlParserErrors) ctxt->errNo);
4576}
4577
4578/************************************************************************
4579 * *
4580 * User entry points *
4581 * *
4582 ************************************************************************/
4583
4584/**
4585 * htmlCreatePushParserCtxt :
4586 * @sax: a SAX handler
4587 * @user_data: The user data returned on SAX callbacks
4588 * @chunk: a pointer to an array of chars
4589 * @size: number of chars in the array
4590 * @filename: an optional file name or URI
4591 * @enc: an optional encoding
4592 *
4593 * Create a parser context for using the HTML parser in push mode
4594 * To allow content encoding detection, @size should be >= 4
4595 * The value of @filename is used for fetching external entities
4596 * and error/warning reports.
4597 *
4598 * Returns the new parser context or NULL
4599 */
4600htmlParserCtxtPtr
4601htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4602 const char *chunk, int size, const char *filename,
4603 xmlCharEncoding enc) {
4604 htmlParserCtxtPtr ctxt;
4605 htmlParserInputPtr inputStream;
4606 xmlParserInputBufferPtr buf;
4607
4608 buf = xmlAllocParserInputBuffer(enc);
4609 if (buf == NULL) return(NULL);
4610
4611 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4612 if (ctxt == NULL) {
4613 xmlFree(buf);
4614 return(NULL);
4615 }
4616 memset(ctxt, 0, sizeof(htmlParserCtxt));
4617 htmlInitParserCtxt(ctxt);
4618 if (sax != NULL) {
4619 if (ctxt->sax != &htmlDefaultSAXHandler)
4620 xmlFree(ctxt->sax);
4621 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4622 if (ctxt->sax == NULL) {
4623 xmlFree(buf);
4624 xmlFree(ctxt);
4625 return(NULL);
4626 }
4627 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4628 if (user_data != NULL)
4629 ctxt->userData = user_data;
4630 }
4631 if (filename == NULL) {
4632 ctxt->directory = NULL;
4633 } else {
4634 ctxt->directory = xmlParserGetDirectory(filename);
4635 }
4636
4637 inputStream = htmlNewInputStream(ctxt);
4638 if (inputStream == NULL) {
4639 xmlFreeParserCtxt(ctxt);
4640 return(NULL);
4641 }
4642
4643 if (filename == NULL)
4644 inputStream->filename = NULL;
4645 else
4646 inputStream->filename = xmlMemStrdup(filename);
4647 inputStream->buf = buf;
4648 inputStream->base = inputStream->buf->buffer->content;
4649 inputStream->cur = inputStream->buf->buffer->content;
4650
4651 inputPush(ctxt, inputStream);
4652
4653 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4654 (ctxt->input->buf != NULL)) {
4655 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4656#ifdef DEBUG_PUSH
4657 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4658#endif
4659 }
4660
4661 return(ctxt);
4662}
4663
4664/**
4665 * htmlSAXParseDoc :
4666 * @cur: a pointer to an array of xmlChar
4667 * @encoding: a free form C string describing the HTML document encoding, or NULL
4668 * @sax: the SAX handler block
4669 * @userData: if using SAX, this pointer will be provided on callbacks.
4670 *
4671 * parse an HTML in-memory document and build a tree.
4672 * It use the given SAX function block to handle the parsing callback.
4673 * If sax is NULL, fallback to the default DOM tree building routines.
4674 *
4675 * Returns the resulting document tree
4676 */
4677
4678htmlDocPtr
4679htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4680 htmlDocPtr ret;
4681 htmlParserCtxtPtr ctxt;
4682
4683 if (cur == NULL) return(NULL);
4684
4685
4686 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4687 if (ctxt == NULL) return(NULL);
4688 if (sax != NULL) {
4689 ctxt->sax = sax;
4690 ctxt->userData = userData;
4691 }
4692
4693 htmlParseDocument(ctxt);
4694 ret = ctxt->myDoc;
4695 if (sax != NULL) {
4696 ctxt->sax = NULL;
4697 ctxt->userData = NULL;
4698 }
4699 htmlFreeParserCtxt(ctxt);
4700
4701 return(ret);
4702}
4703
4704/**
4705 * htmlParseDoc :
4706 * @cur: a pointer to an array of xmlChar
4707 * @encoding: a free form C string describing the HTML document encoding, or NULL
4708 *
4709 * parse an HTML in-memory document and build a tree.
4710 *
4711 * Returns the resulting document tree
4712 */
4713
4714htmlDocPtr
4715htmlParseDoc(xmlChar *cur, const char *encoding) {
4716 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4717}
4718
4719
4720/**
4721 * htmlCreateFileParserCtxt :
4722 * @filename: the filename
4723 * @encoding: a free form C string describing the HTML document encoding, or NULL
4724 *
4725 * Create a parser context for a file content.
4726 * Automatic support for ZLIB/Compress compressed document is provided
4727 * by default if found at compile-time.
4728 *
4729 * Returns the new parser context or NULL
4730 */
4731htmlParserCtxtPtr
4732htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4733{
4734 htmlParserCtxtPtr ctxt;
4735 htmlParserInputPtr inputStream;
4736 xmlParserInputBufferPtr buf;
4737 /* htmlCharEncoding enc; */
4738 xmlChar *content, *content_line = (xmlChar *) "charset=";
4739
4740 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4741 if (buf == NULL) return(NULL);
4742
4743 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4744 if (ctxt == NULL) {
4745 perror("malloc");
4746 return(NULL);
4747 }
4748 memset(ctxt, 0, sizeof(htmlParserCtxt));
4749 htmlInitParserCtxt(ctxt);
4750 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4751 if (inputStream == NULL) {
4752 perror("malloc");
4753 xmlFree(ctxt);
4754 return(NULL);
4755 }
4756 memset(inputStream, 0, sizeof(htmlParserInput));
4757
4758 inputStream->filename = xmlMemStrdup(filename);
4759 inputStream->line = 1;
4760 inputStream->col = 1;
4761 inputStream->buf = buf;
4762 inputStream->directory = NULL;
4763
4764 inputStream->base = inputStream->buf->buffer->content;
4765 inputStream->cur = inputStream->buf->buffer->content;
4766 inputStream->free = NULL;
4767
4768 inputPush(ctxt, inputStream);
4769
4770 /* set encoding */
4771 if (encoding) {
4772 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4773 if (content) {
4774 strcpy ((char *)content, (char *)content_line);
4775 strcat ((char *)content, (char *)encoding);
4776 htmlCheckEncoding (ctxt, content);
4777 xmlFree (content);
4778 }
4779 }
4780
4781 return(ctxt);
4782}
4783
4784/**
4785 * htmlSAXParseFile :
4786 * @filename: the filename
4787 * @encoding: a free form C string describing the HTML document encoding, or NULL
4788 * @sax: the SAX handler block
4789 * @userData: if using SAX, this pointer will be provided on callbacks.
4790 *
4791 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4792 * compressed document is provided by default if found at compile-time.
4793 * It use the given SAX function block to handle the parsing callback.
4794 * If sax is NULL, fallback to the default DOM tree building routines.
4795 *
4796 * Returns the resulting document tree
4797 */
4798
4799htmlDocPtr
4800htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4801 void *userData) {
4802 htmlDocPtr ret;
4803 htmlParserCtxtPtr ctxt;
4804 htmlSAXHandlerPtr oldsax = NULL;
4805
4806 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4807 if (ctxt == NULL) return(NULL);
4808 if (sax != NULL) {
4809 oldsax = ctxt->sax;
4810 ctxt->sax = sax;
4811 ctxt->userData = userData;
4812 }
4813
4814 htmlParseDocument(ctxt);
4815
4816 ret = ctxt->myDoc;
4817 if (sax != NULL) {
4818 ctxt->sax = oldsax;
4819 ctxt->userData = NULL;
4820 }
4821 htmlFreeParserCtxt(ctxt);
4822
4823 return(ret);
4824}
4825
4826/**
4827 * htmlParseFile :
4828 * @filename: the filename
4829 * @encoding: a free form C string describing the HTML document encoding, or NULL
4830 *
4831 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4832 * compressed document is provided by default if found at compile-time.
4833 *
4834 * Returns the resulting document tree
4835 */
4836
4837htmlDocPtr
4838htmlParseFile(const char *filename, const char *encoding) {
4839 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4840}
4841
4842/**
4843 * htmlHandleOmittedElem:
4844 * @val: int 0 or 1
4845 *
4846 * Set and return the previous value for handling HTML omitted tags.
4847 *
4848 * Returns the last value for 0 for no handling, 1 for auto insertion.
4849 */
4850
4851int
4852htmlHandleOmittedElem(int val) {
4853 int old = htmlOmittedDefaultValue;
4854
4855 htmlOmittedDefaultValue = val;
4856 return(old);
4857}
4858
4859#endif /* LIBXML_HTML_ENABLED */