blob: b096d82e8482ccc8a19da2f6eab7a83d3704ff0d [file] [log] [blame]
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
8
Bjorn Reese70a9da52001-04-21 16:57:29 +00009#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000010#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000011
Owen Taylor3473f882001-02-23 17:55:21 +000012#include <string.h>
13#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19#ifdef HAVE_SYS_STAT_H
20#include <sys/stat.h>
21#endif
22#ifdef HAVE_FCNTL_H
23#include <fcntl.h>
24#endif
25#ifdef HAVE_UNISTD_H
26#include <unistd.h>
27#endif
28#ifdef HAVE_ZLIB_H
29#include <zlib.h>
30#endif
31
32#include <libxml/xmlmemory.h>
33#include <libxml/tree.h>
34#include <libxml/parser.h>
35#include <libxml/parserInternals.h>
36#include <libxml/xmlerror.h>
37#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000038#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000039#include <libxml/entities.h>
40#include <libxml/encoding.h>
41#include <libxml/valid.h>
42#include <libxml/xmlIO.h>
43
44#define HTML_MAX_NAMELEN 1000
45#define HTML_PARSER_BIG_BUFFER_SIZE 1000
46#define HTML_PARSER_BUFFER_SIZE 100
47
48/* #define DEBUG */
49/* #define DEBUG_PUSH */
50
51int htmlOmittedDefaultValue = 1;
52
Daniel Veillard56a4cb82001-03-24 17:00:36 +000053xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
54 xmlChar end, xmlChar end2, xmlChar end3);
55
56/************************************************************************
57 * *
Owen Taylor3473f882001-02-23 17:55:21 +000058 * Parser stacks related functions and macros *
59 * *
60 ************************************************************************/
61
62/*
63 * Generic function for accessing stacks in the Parser Context
64 */
65
66#define PUSH_AND_POP(scope, type, name) \
67scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
68 if (ctxt->name##Nr >= ctxt->name##Max) { \
69 ctxt->name##Max *= 2; \
70 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
71 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72 if (ctxt->name##Tab == NULL) { \
73 xmlGenericError(xmlGenericErrorContext, \
74 "realloc failed !\n"); \
75 return(0); \
76 } \
77 } \
78 ctxt->name##Tab[ctxt->name##Nr] = value; \
79 ctxt->name = value; \
80 return(ctxt->name##Nr++); \
81} \
82scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
83 type ret; \
84 if (ctxt->name##Nr < 0) return(0); \
85 ctxt->name##Nr--; \
86 if (ctxt->name##Nr < 0) return(0); \
87 if (ctxt->name##Nr > 0) \
88 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89 else \
90 ctxt->name = NULL; \
91 ret = ctxt->name##Tab[ctxt->name##Nr]; \
92 ctxt->name##Tab[ctxt->name##Nr] = 0; \
93 return(ret); \
94} \
95
Daniel Veillard56a4cb82001-03-24 17:00:36 +000096/* PUSH_AND_POP(static, xmlNodePtr, node) */
97PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor3473f882001-02-23 17:55:21 +000098
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. Every one of them expands to an expression or statement
 * on a variable named `ctxt`, which must be in scope where the macro is used.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them: they work on single xmlChar units and should only be used to compare
 * against, or skip over, ASCII based constructs.
 *
 *   CUR_PTR    the current pointer in the input (ctxt->input->cur)
 *   CUR        the current xmlChar converted to int
 *   CURRENT    same expansion as CUR
 *   RAW        -1 while ctxt->token is non-zero, the current xmlChar otherwise
 *   NXT(n)     the xmlChar n positions after the current one
 *   UPPER      the current xmlChar converted to uppercase with toupper()
 *   UPP(n)     NXT(n) converted to uppercase with toupper()
 *   SKIP(n)    advance ctxt->nbChars and the input pointer by n
 *
 * Macros delegating to functions:
 *
 *   NEXT           calls xmlNextChar(ctxt) and increments ctxt->nbChars
 *   NEXTL(l)       updates the line/column counters from the current xmlChar,
 *                  clears ctxt->token, advances the input pointer by l and
 *                  increments ctxt->nbChars
 *   SKIP_BLANKS    calls htmlSkipBlankChars(ctxt)
 *   SHRINK         calls xmlParserInputShrink() on the current input
 *   GROW           calls xmlParserInputGrow() with INPUT_CHUNK on the
 *                  current input
 *   CUR_CHAR(l)    calls htmlCurrentChar(ctxt, &l)
 *   CUR_SCHAR(s,l) calls xmlStringCurrentChar(ctxt, s, &l)
 *   COPY_BUF(l,b,i,v)
 *                  appends the char v of length l to the buffer b at index i
 *                  and advances i (through xmlCopyChar() when l is not 1).
 *                  It expands to a bare if/else statement: put braces around
 *                  it when it is used as the body of another if.
 */

/* Raw access to the current position */
#define CUR_PTR ctxt->input->cur
#define CUR ((int) (*ctxt->input->cur))
#define CURRENT ((int) (*ctxt->input->cur))
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
#define NXT(val) ctxt->input->cur[(val)]

/* Case-insensitive peeking */
#define UPPER (toupper(*ctxt->input->cur))
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Moving forward in the input */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
  } while (0)
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Input buffer management */
#define SHRINK xmlParserInputShrink(ctxt->input)
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Decoding and copying of full characters */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v)
175
176/**
177 * htmlCurrentChar:
178 * @ctxt: the HTML parser context
179 * @len: pointer to the length of the char read
180 *
181 * The current char value, if using UTF-8 this may actaully span multiple
182 * bytes in the input buffer. Implement the end of line normalization:
183 * 2.11 End-of-Line Handling
184 * If the encoding is unspecified, in the case we find an ISO-Latin-1
185 * char, then the encoding converter is plugged in automatically.
186 *
187 * Returns the current char value and its lenght
188 */
189
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000190static int
Owen Taylor3473f882001-02-23 17:55:21 +0000191htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
192 if (ctxt->instate == XML_PARSER_EOF)
193 return(0);
194
195 if (ctxt->token != 0) {
196 *len = 0;
197 return(ctxt->token);
198 }
199 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
200 /*
201 * We are supposed to handle UTF8, check it's valid
202 * From rfc2044: encoding of the Unicode values on UTF-8:
203 *
204 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
205 * 0000 0000-0000 007F 0xxxxxxx
206 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
207 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
208 *
209 * Check for the 0x110000 limit too
210 */
211 const unsigned char *cur = ctxt->input->cur;
212 unsigned char c;
213 unsigned int val;
214
215 c = *cur;
216 if (c & 0x80) {
217 if (cur[1] == 0)
218 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
219 if ((cur[1] & 0xc0) != 0x80)
220 goto encoding_error;
221 if ((c & 0xe0) == 0xe0) {
222
223 if (cur[2] == 0)
224 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225 if ((cur[2] & 0xc0) != 0x80)
226 goto encoding_error;
227 if ((c & 0xf0) == 0xf0) {
228 if (cur[3] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if (((c & 0xf8) != 0xf0) ||
231 ((cur[3] & 0xc0) != 0x80))
232 goto encoding_error;
233 /* 4-byte code */
234 *len = 4;
235 val = (cur[0] & 0x7) << 18;
236 val |= (cur[1] & 0x3f) << 12;
237 val |= (cur[2] & 0x3f) << 6;
238 val |= cur[3] & 0x3f;
239 } else {
240 /* 3-byte code */
241 *len = 3;
242 val = (cur[0] & 0xf) << 12;
243 val |= (cur[1] & 0x3f) << 6;
244 val |= cur[2] & 0x3f;
245 }
246 } else {
247 /* 2-byte code */
248 *len = 2;
249 val = (cur[0] & 0x1f) << 6;
250 val |= cur[1] & 0x3f;
251 }
252 if (!IS_CHAR(val)) {
253 ctxt->errNo = XML_ERR_INVALID_ENCODING;
254 if ((ctxt->sax != NULL) &&
255 (ctxt->sax->error != NULL))
256 ctxt->sax->error(ctxt->userData,
257 "Char 0x%X out of allowed range\n", val);
258 ctxt->wellFormed = 0;
259 ctxt->disableSAX = 1;
260 }
261 return(val);
262 } else {
263 /* 1-byte code */
264 *len = 1;
265 return((int) *ctxt->input->cur);
266 }
267 }
268 /*
269 * Assume it's a fixed lenght encoding (1) with
270 * a compatibke encoding for the ASCII set, since
271 * XML constructs only use < 128 chars
272 */
273 *len = 1;
274 if ((int) *ctxt->input->cur < 0x80)
275 return((int) *ctxt->input->cur);
276
277 /*
278 * Humm this is bad, do an automatic flow conversion
279 */
280 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
281 ctxt->charset = XML_CHAR_ENCODING_UTF8;
282 return(xmlCurrentChar(ctxt, len));
283
284encoding_error:
285 /*
286 * If we detect an UTF8 error that probably mean that the
287 * input encoding didn't get properly advertized in the
288 * declaration header. Report the error and switch the encoding
289 * to ISO-Latin-1 (if you don't like this policy, just declare the
290 * encoding !)
291 */
292 ctxt->errNo = XML_ERR_INVALID_ENCODING;
293 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
294 ctxt->sax->error(ctxt->userData,
295 "Input is not proper UTF-8, indicate encoding !\n");
296 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
297 ctxt->input->cur[0], ctxt->input->cur[1],
298 ctxt->input->cur[2], ctxt->input->cur[3]);
299 }
300
301 ctxt->charset = XML_CHAR_ENCODING_8859_1;
302 *len = 1;
303 return((int) *ctxt->input->cur);
304}
305
306/**
Owen Taylor3473f882001-02-23 17:55:21 +0000307 * htmlSkipBlankChars:
308 * @ctxt: the HTML parser context
309 *
310 * skip all blanks character found at that point in the input streams.
311 *
312 * Returns the number of space chars skipped
313 */
314
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000315static int
Owen Taylor3473f882001-02-23 17:55:21 +0000316htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
317 int res = 0;
318
319 while (IS_BLANK(*(ctxt->input->cur))) {
320 if ((*ctxt->input->cur == 0) &&
321 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
322 xmlPopInput(ctxt);
323 } else {
324 if (*(ctxt->input->cur) == '\n') {
325 ctxt->input->line++; ctxt->input->col = 1;
326 } else ctxt->input->col++;
327 ctxt->input->cur++;
328 ctxt->nbChars++;
329 if (*ctxt->input->cur == 0)
330 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
331 }
332 res++;
333 }
334 return(res);
335}
336
337
338
339/************************************************************************
340 * *
341 * The list of HTML elements and their properties *
342 * *
343 ************************************************************************/
344
345/*
346 * Start Tag: 1 means the start tag can be ommited
347 * End Tag: 1 means the end tag can be ommited
348 * 2 means it's forbidden (empty elements)
Daniel Veillard56098d42001-04-24 12:51:09 +0000349 * 3 means the tag is stylistic and should be closed easilly
Owen Taylor3473f882001-02-23 17:55:21 +0000350 * Depr: this element is deprecated
351 * DTD: 1 means that this element is valid only in the Loose DTD
352 * 2 means that this element is valid only in the Frameset DTD
353 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000354 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor3473f882001-02-23 17:55:21 +0000355 */
356htmlElemDesc html40ElementTable[] = {
Daniel Veillard02bb1702001-06-13 21:11:59 +0000357{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
358{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
359{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
360{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
361{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
362{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
363{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
364{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
365{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
366{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
367{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
368{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
369{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
370{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
371{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
372{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
373{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
374{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
375{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
376{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
377{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
378{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
379{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
380{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
381{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
382{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
383{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
384{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
385{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
386{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
387{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
388{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
389{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
390{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
391{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
392{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
393{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
394{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
395{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
396{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
397{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
398{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
399{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
400{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
401{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
402{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
403{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
404{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
405{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
406{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
407{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
408{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
409{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
410{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
411{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
412{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
413{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
414{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
415{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
416{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
417{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
418{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
419{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
420{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
421{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
422{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
423{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
424{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
425{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
426{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
427{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
428{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
429{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
430{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
431{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
432{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
433{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
434{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
435{ "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
436{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
437{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
438{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
439{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
440{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
441{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
442{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
443{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
444{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
445{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
446{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
447{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor3473f882001-02-23 17:55:21 +0000448};
449
/*
 * Start tags that imply the end of a current element.
 * The table is a sequence of NULL-terminated groups, itself closed by an
 * extra NULL: any tag of a group implies the end of the current element
 * if the type of that element is in the same group.
 *
 * According to the HTML DTD, HR should be added to the heading group, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
const char *htmlEquEnd[] = {
    /* list and option items */
    "dt", "dd", "li", "option", NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    /* lists and block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
    /* end of the table */
    NULL
};
466
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups, itself closed by an
 * extra NULL. In each group the first name (left column below) is the
 * start tag being opened; the names following it are the elements which
 * get closed automatically when they are the current element at that
 * point (see htmlCheckAutoClose()).
 */
const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
526
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL    /* end of list */
};
540
/*
 * The list of HTML attributes which are of content %Script;
 * These are the intrinsic event attributes of HTML 4.
 * The array has no NULL terminator: iterate over it using its size.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    /* mouse events */
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    /* keyboard events */
    "onkeypress",
    "onkeydown",
    "onkeyup",
    /* document and focus events */
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    /* form events */
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
566
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 *
 * The table ends with an entry whose name is NULL; its priority is the
 * one given to every element not listed explicitly.
 */

typedef struct {
    const char *name;   /* element name, NULL for the closing entry */
    int priority;       /* "endtag" priority of that element */
} elementPriority;

const elementPriority htmlEndPriority[] = {
    { "div",   150 },

    /* table cells, rows, row groups and the table itself */
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },

    /* document structure */
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },

    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +0000594
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000595static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +0000596static int htmlStartCloseIndexinitialized = 0;
597
598/************************************************************************
599 * *
600 * functions to handle HTML specific data *
601 * *
602 ************************************************************************/
603
604/**
605 * htmlInitAutoClose:
606 *
607 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
608 * This is not reentrant. Call xmlInitParser() once before processing in
609 * case of use in multithreaded programs.
610 */
611void
612htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000613 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000614
615 if (htmlStartCloseIndexinitialized) return;
616
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000617 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
618 indx = 0;
619 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
620 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +0000621 while (htmlStartClose[i] != NULL) i++;
622 i++;
623 }
624 htmlStartCloseIndexinitialized = 1;
625}
626
627/**
628 * htmlTagLookup:
629 * @tag: The tag name in lowercase
630 *
631 * Lookup the HTML tag in the ElementTable
632 *
633 * Returns the related htmlElemDescPtr or NULL if not found.
634 */
635htmlElemDescPtr
636htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000637 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +0000638
639 for (i = 0; i < (sizeof(html40ElementTable) /
640 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000641 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Owen Taylor3473f882001-02-23 17:55:21 +0000642 return(&html40ElementTable[i]);
643 }
644 return(NULL);
645}
646
647/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000648 * htmlGetEndPriority:
649 * @name: The name of the element to look up the priority for.
650 *
651 * Return value: The "endtag" priority.
652 **/
653static int
654htmlGetEndPriority (const xmlChar *name) {
655 int i = 0;
656
657 while ((htmlEndPriority[i].name != NULL) &&
658 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
659 i++;
660
661 return(htmlEndPriority[i].priority);
662}
663
664/**
Owen Taylor3473f882001-02-23 17:55:21 +0000665 * htmlCheckAutoClose:
666 * @newtag: The new tag name
667 * @oldtag: The old tag name
668 *
669 * Checks wether the new tag is one of the registered valid tags for closing old.
670 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
671 *
672 * Returns 0 if no, 1 if yes.
673 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000674static int
Owen Taylor3473f882001-02-23 17:55:21 +0000675htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000676 int i, indx;
677 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +0000678
679 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
680
681 /* inefficient, but not a big deal */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000682 for (indx = 0; indx < 100;indx++) {
683 closed = htmlStartCloseIndex[indx];
684 if (closed == NULL) return(0);
685 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor3473f882001-02-23 17:55:21 +0000686 }
687
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000688 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +0000689 i++;
690 while (htmlStartClose[i] != NULL) {
691 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
692 return(1);
693 }
694 i++;
695 }
696 return(0);
697}
698
699/**
700 * htmlAutoCloseOnClose:
701 * @ctxt: an HTML parser context
702 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000703 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +0000704 *
705 * The HTmL DtD allows an ending tag to implicitely close other tags.
706 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000707static void
Owen Taylor3473f882001-02-23 17:55:21 +0000708htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
709 htmlElemDescPtr info;
710 xmlChar *oldname;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000711 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +0000712
713#ifdef DEBUG
714 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
715 for (i = 0;i < ctxt->nameNr;i++)
716 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
717#endif
718
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000719 priority = htmlGetEndPriority (newtag);
720
Owen Taylor3473f882001-02-23 17:55:21 +0000721 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000722
Owen Taylor3473f882001-02-23 17:55:21 +0000723 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard0a2a1632001-05-11 14:18:03 +0000724 /*
725 * A missplaced endtagad can only close elements with lower
726 * or equal priority, so if we find an element with higher
727 * priority before we find an element with
728 * matching name, we just ignore this endtag
729 */
730 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor3473f882001-02-23 17:55:21 +0000731 }
732 if (i < 0) return;
733
734 while (!xmlStrEqual(newtag, ctxt->name)) {
735 info = htmlTagLookup(ctxt->name);
736 if ((info == NULL) || (info->endTag == 1)) {
737#ifdef DEBUG
738 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
739#endif
Daniel Veillard56098d42001-04-24 12:51:09 +0000740 } else if (info->endTag == 3) {
741#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +0000742 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
Daniel Veillard56098d42001-04-24 12:51:09 +0000743#endif
744 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
745 ctxt->sax->error(ctxt->userData,
746 "Opening and ending tag mismatch: %s and %s\n",
747 newtag, ctxt->name);
748 ctxt->wellFormed = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000749 }
750 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
751 ctxt->sax->endElement(ctxt->userData, ctxt->name);
752 oldname = htmlnamePop(ctxt);
753 if (oldname != NULL) {
754#ifdef DEBUG
755 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
756#endif
757 xmlFree(oldname);
758 }
759 }
760}
761
762/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000763 * htmlAutoCloseOnEnd:
764 * @ctxt: an HTML parser context
765 *
766 * Close all remaining tags at the end of the stream
767 */
768static void
769htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
770 xmlChar *oldname;
771 int i;
772
773 if (ctxt->nameNr == 0)
774 return;
775#ifdef DEBUG
776 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
777#endif
778
779 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
780#ifdef DEBUG
781 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
782#endif
783 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
784 ctxt->sax->endElement(ctxt->userData, ctxt->name);
785 oldname = htmlnamePop(ctxt);
786 if (oldname != NULL) {
787#ifdef DEBUG
788 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
789#endif
790 xmlFree(oldname);
791 }
792 }
793}
794
795/**
Owen Taylor3473f882001-02-23 17:55:21 +0000796 * htmlAutoClose:
797 * @ctxt: an HTML parser context
798 * @newtag: The new tag name or NULL
799 *
800 * The HTmL DtD allows a tag to implicitely close other tags.
801 * The list is kept in htmlStartClose array. This function is
802 * called when a new tag has been detected and generates the
803 * appropriates closes if possible/needed.
804 * If newtag is NULL this mean we are at the end of the resource
805 * and we should check
806 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000807static void
Owen Taylor3473f882001-02-23 17:55:21 +0000808htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
809 xmlChar *oldname;
810 while ((newtag != NULL) && (ctxt->name != NULL) &&
811 (htmlCheckAutoClose(newtag, ctxt->name))) {
812#ifdef DEBUG
813 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
814#endif
815 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
816 ctxt->sax->endElement(ctxt->userData, ctxt->name);
817 oldname = htmlnamePop(ctxt);
818 if (oldname != NULL) {
819#ifdef DEBUG
820 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
821#endif
822 xmlFree(oldname);
823 }
824 }
825 if (newtag == NULL) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +0000826 htmlAutoCloseOnEnd(ctxt);
827 return;
Owen Taylor3473f882001-02-23 17:55:21 +0000828 }
829 while ((newtag == NULL) && (ctxt->name != NULL) &&
830 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
831 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
832 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
833#ifdef DEBUG
834 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
835#endif
836 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
837 ctxt->sax->endElement(ctxt->userData, ctxt->name);
838 oldname = htmlnamePop(ctxt);
839 if (oldname != NULL) {
840#ifdef DEBUG
841 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
842#endif
843 xmlFree(oldname);
844 }
845 }
846
847}
848
849/**
850 * htmlAutoCloseTag:
851 * @doc: the HTML document
852 * @name: The tag name
853 * @elem: the HTML element
854 *
855 * The HTmL DtD allows a tag to implicitely close other tags.
856 * The list is kept in htmlStartClose array. This function checks
857 * if the element or one of it's children would autoclose the
858 * given tag.
859 *
860 * Returns 1 if autoclose, 0 otherwise
861 */
862int
863htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
864 htmlNodePtr child;
865
866 if (elem == NULL) return(1);
867 if (xmlStrEqual(name, elem->name)) return(0);
868 if (htmlCheckAutoClose(elem->name, name)) return(1);
869 child = elem->children;
870 while (child != NULL) {
871 if (htmlAutoCloseTag(doc, name, child)) return(1);
872 child = child->next;
873 }
874 return(0);
875}
876
877/**
878 * htmlIsAutoClosed:
879 * @doc: the HTML document
880 * @elem: the HTML element
881 *
882 * The HTmL DtD allows a tag to implicitely close other tags.
883 * The list is kept in htmlStartClose array. This function checks
884 * if a tag is autoclosed by one of it's child
885 *
886 * Returns 1 if autoclosed, 0 otherwise
887 */
888int
889htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
890 htmlNodePtr child;
891
892 if (elem == NULL) return(1);
893 child = elem->children;
894 while (child != NULL) {
895 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
896 child = child->next;
897 }
898 return(0);
899}
900
901/**
902 * htmlCheckImplied:
903 * @ctxt: an HTML parser context
904 * @newtag: The new tag name
905 *
906 * The HTML DtD allows a tag to exists only implicitely
907 * called when a new tag has been detected and generates the
908 * appropriates implicit tags if missing
909 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000910static void
Owen Taylor3473f882001-02-23 17:55:21 +0000911htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
912 if (!htmlOmittedDefaultValue)
913 return;
914 if (xmlStrEqual(newtag, BAD_CAST"html"))
915 return;
916 if (ctxt->nameNr <= 0) {
917#ifdef DEBUG
918 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
919#endif
920 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
921 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
922 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
923 }
924 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
925 return;
926 if ((ctxt->nameNr <= 1) &&
927 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
928 (xmlStrEqual(newtag, BAD_CAST"style")) ||
929 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
930 (xmlStrEqual(newtag, BAD_CAST"link")) ||
931 (xmlStrEqual(newtag, BAD_CAST"title")) ||
932 (xmlStrEqual(newtag, BAD_CAST"base")))) {
933 /*
934 * dropped OBJECT ... i you put it first BODY will be
935 * assumed !
936 */
937#ifdef DEBUG
938 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
939#endif
940 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
941 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
942 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
943 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
944 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
945 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
946 int i;
947 for (i = 0;i < ctxt->nameNr;i++) {
948 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
949 return;
950 }
951 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
952 return;
953 }
954 }
955
956#ifdef DEBUG
957 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
958#endif
959 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
960 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
961 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
962 }
963}
964
965/**
966 * htmlCheckParagraph
967 * @ctxt: an HTML parser context
968 *
969 * Check whether a p element need to be implied before inserting
970 * characters in the current element.
971 *
972 * Returns 1 if a paragraph has been inserted, 0 if not and -1
973 * in case of error.
974 */
975
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000976static int
Owen Taylor3473f882001-02-23 17:55:21 +0000977htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
978 const xmlChar *tag;
979 int i;
980
981 if (ctxt == NULL)
982 return(-1);
983 tag = ctxt->name;
984 if (tag == NULL) {
985 htmlAutoClose(ctxt, BAD_CAST"p");
986 htmlCheckImplied(ctxt, BAD_CAST"p");
987 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
988 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
989 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
990 return(1);
991 }
992 if (!htmlOmittedDefaultValue)
993 return(0);
994 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
995 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
996#ifdef DEBUG
997 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
998#endif
999 htmlAutoClose(ctxt, BAD_CAST"p");
1000 htmlCheckImplied(ctxt, BAD_CAST"p");
1001 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1002 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1003 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1004 return(1);
1005 }
1006 }
1007 return(0);
1008}
1009
1010/**
1011 * htmlIsScriptAttribute:
1012 * @name: an attribute name
1013 *
1014 * Check if an attribute is of content type Script
1015 *
1016 * Returns 1 is the attribute is a script 0 otherwise
1017 */
1018int
1019htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001020 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001021
1022 if (name == NULL)
1023 return(0);
1024 /*
1025 * all script attributes start with 'on'
1026 */
1027 if ((name[0] != 'o') || (name[1] != 'n'))
1028 return(0);
1029 for (i = 0;
1030 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1031 i++) {
1032 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1033 return(1);
1034 }
1035 return(0);
1036}
1037
1038/************************************************************************
1039 * *
1040 * The list of HTML predefined entities *
1041 * *
1042 ************************************************************************/
1043
1044
/*
 * html40EntitiesTable:
 *
 * The predefined HTML entities. Each initializer gives, in order, the
 * Unicode value of the character, the entity name and a textual
 * description.
 *
 * The entries are listed by ascending value: htmlEntityValueLookup()
 * stops scanning at the first entry whose value is greater than the one
 * searched, so any new entry has to be inserted at its sorted position.
 * htmlEntityLookup() scans the same table by name.
 */
1045htmlEntityDesc html40EntitiesTable[] = {
1046/*
1047 * the 4 absolute ones, plus apostrophe.
1048 */
1049{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1050{ 38, "amp", "ampersand, U+0026 ISOnum" },
1051{ 39, "apos", "single quote" },
1052{ 60, "lt", "less-than sign, U+003C ISOnum" },
1053{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1054
1055/*
1056 * A bunch still in the 128-255 range
1057 * Replacing them depend really on the charset used.
1058 */
1059{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1060{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1061{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1062{ 163, "pound","pound sign, U+00A3 ISOnum" },
1063{ 164, "curren","currency sign, U+00A4 ISOnum" },
1064{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1065{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1066{ 167, "sect", "section sign, U+00A7 ISOnum" },
1067{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1068{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1069{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1070{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1071{ 172, "not", "not sign, U+00AC ISOnum" },
1072{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1073{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1074{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1075{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1076{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1077{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1078{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1079{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1080{ 181, "micro","micro sign, U+00B5 ISOnum" },
1081{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1082{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1083{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1084{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1085{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1086{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1087{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1088{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1089{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1090{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1091{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1092{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1093{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1094{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1095{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1096{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1097{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1098{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1099{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1100{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1101{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1102{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1103{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1104{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1105{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1106{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1107{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1108{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1109{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1110{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1111{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1112{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1113{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1114{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1115{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1116{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1117{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1118{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1119{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1120{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1121{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1122{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1123{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1124{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1125{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1126{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1127{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1128{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1129{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1130{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1131{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1132{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1133{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1134{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1135{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1136{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1137{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1138{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1139{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1140{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1141{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1142{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1143{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1144{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1145{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1146{ 247, "divide","division sign, U+00F7 ISOnum" },
1147{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1148{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1149{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1150{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1151{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1152{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1153{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1154{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1155
1156{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1157{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1158{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1159{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1160{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1161
1162/*
1163 * Anything below should really be kept as entities references
1164 */
1165{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1166
1167{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1168{ 732, "tilde","small tilde, U+02DC ISOdia" },
1169
1170{ 913, "Alpha","greek capital letter alpha, U+0391" },
1171{ 914, "Beta", "greek capital letter beta, U+0392" },
1172{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1173{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1174{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1175{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1176{ 919, "Eta", "greek capital letter eta, U+0397" },
1177{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1178{ 921, "Iota", "greek capital letter iota, U+0399" },
1179{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001180{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001181{ 924, "Mu", "greek capital letter mu, U+039C" },
1182{ 925, "Nu", "greek capital letter nu, U+039D" },
1183{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1184{ 927, "Omicron","greek capital letter omicron, U+039F" },
1185{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1186{ 929, "Rho", "greek capital letter rho, U+03A1" },
1187{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1188{ 932, "Tau", "greek capital letter tau, U+03A4" },
1189{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1190{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1191{ 935, "Chi", "greek capital letter chi, U+03A7" },
1192{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1193{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1194
1195{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1196{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1197{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1198{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1199{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1200{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1201{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1202{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1203{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1204{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1205{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1206{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1207{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1208{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1209{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1210{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1211{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1212{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1213{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1214{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1215{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1216{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1217{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1218{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1219{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1220{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1221{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1222{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1223
1224{ 8194, "ensp", "en space, U+2002 ISOpub" },
1225{ 8195, "emsp", "em space, U+2003 ISOpub" },
1226{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1227{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1228{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1229{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1230{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1231{ 8211, "ndash","en dash, U+2013 ISOpub" },
1232{ 8212, "mdash","em dash, U+2014 ISOpub" },
1233{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1234{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1235{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1236{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1237{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1238{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1239{ 8224, "dagger","dagger, U+2020 ISOpub" },
1240{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1241
1242{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1243{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1244
1245{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1246
1247{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1248{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1249
1250{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1251{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1252
1253{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1254{ 8260, "frasl","fraction slash, U+2044 NEW" },
1255
1256{ 8364, "euro", "euro sign, U+20AC NEW" },
1257
1258{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1259{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1260{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1261{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1262{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1263{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1264{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1265{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1266{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1267{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1268{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1269{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1270{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1271{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1272{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1273{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1274
1275{ 8704, "forall","for all, U+2200 ISOtech" },
1276{ 8706, "part", "partial differential, U+2202 ISOtech" },
1277{ 8707, "exist","there exists, U+2203 ISOtech" },
1278{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1279{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1280{ 8712, "isin", "element of, U+2208 ISOtech" },
1281{ 8713, "notin","not an element of, U+2209 ISOtech" },
1282{ 8715, "ni", "contains as member, U+220B ISOtech" },
1283{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1284{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1285{ 8722, "minus","minus sign, U+2212 ISOtech" },
1286{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1287{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1288{ 8733, "prop", "proportional to, U+221D ISOtech" },
1289{ 8734, "infin","infinity, U+221E ISOtech" },
1290{ 8736, "ang", "angle, U+2220 ISOamso" },
1291{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1292{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1293{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1294{ 8746, "cup", "union = cup, U+222A ISOtech" },
1295{ 8747, "int", "integral, U+222B ISOtech" },
1296{ 8756, "there4","therefore, U+2234 ISOtech" },
1297{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1298{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1299{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1300{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1301{ 8801, "equiv","identical to, U+2261 ISOtech" },
1302{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1303{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1304{ 8834, "sub", "subset of, U+2282 ISOtech" },
1305{ 8835, "sup", "superset of, U+2283 ISOtech" },
1306{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1307{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1308{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1309{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1310{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1311{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1312{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1313{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1314{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1315{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1316{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1317{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1318{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1319{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1320
1321{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1322{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1323{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1324{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1325
1326};
1327
1328/************************************************************************
1329 * *
1330 * Commodity functions to handle entities *
1331 * *
1332 ************************************************************************/
1333
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size, so the
 * argument must be a plain identifier with a companion variable named
 * <identifier>_size.
 *
 * On allocation failure the *enclosing function* returns NULL; the macro
 * can therefore only be used in functions returning a pointer. The
 * result of xmlRealloc() is first stored in a temporary so that the old
 * block can still be released on that path instead of being leaked
 * (assumes xmlRealloc() follows realloc() semantics and leaves the old
 * block untouched when it fails).
 */
#define growBuffer(buffer) {                                            \
    xmlChar *grown_##buffer;                                            \
    buffer##_size *= 2;                                                 \
    grown_##buffer = (xmlChar *)                                        \
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));            \
    if (grown_##buffer == NULL) {                                       \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = grown_##buffer;                                            \
}
1345
1346/**
1347 * htmlEntityLookup:
1348 * @name: the entity name
1349 *
1350 * Lookup the given entity in EntitiesTable
1351 *
1352 * TODO: the linear scan is really ugly, an hash table is really needed.
1353 *
1354 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1355 */
1356htmlEntityDescPtr
1357htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001358 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001359
1360 for (i = 0;i < (sizeof(html40EntitiesTable)/
1361 sizeof(html40EntitiesTable[0]));i++) {
1362 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1363#ifdef DEBUG
1364 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1365#endif
1366 return(&html40EntitiesTable[i]);
1367 }
1368 }
1369 return(NULL);
1370}
1371
1372/**
1373 * htmlEntityValueLookup:
1374 * @value: the entity's unicode value
1375 *
1376 * Lookup the given entity in EntitiesTable
1377 *
1378 * TODO: the linear scan is really ugly, an hash table is really needed.
1379 *
1380 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1381 */
1382htmlEntityDescPtr
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001383htmlEntityValueLookup(unsigned int value) {
1384 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001385#ifdef DEBUG
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00001386 unsigned int lv = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001387#endif
1388
1389 for (i = 0;i < (sizeof(html40EntitiesTable)/
1390 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001391 if (html40EntitiesTable[i].value >= value) {
1392 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001393 break;
1394#ifdef DEBUG
1395 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1396#endif
1397 return(&html40EntitiesTable[i]);
1398 }
1399#ifdef DEBUG
1400 if (lv > html40EntitiesTable[i].value) {
1401 xmlGenericError(xmlGenericErrorContext,
1402 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1403 lv, html40EntitiesTable[i].value);
1404 }
1405 lv = html40EntitiesTable[i].value;
1406#endif
1407 }
1408 return(NULL);
1409}
1410
1411/**
1412 * UTF8ToHtml:
1413 * @out: a pointer to an array of bytes to store the result
1414 * @outlen: the length of @out
1415 * @in: a pointer to an array of UTF-8 chars
1416 * @inlen: the length of @in
1417 *
1418 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1419 * plus HTML entities block of chars out.
1420 *
1421 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1422 * The value of @inlen after return is the number of octets consumed
1423 * as the return value is positive, else unpredictiable.
1424 * The value of @outlen after return is the number of octets consumed.
1425 */
1426int
1427UTF8ToHtml(unsigned char* out, int *outlen,
1428 const unsigned char* in, int *inlen) {
1429 const unsigned char* processed = in;
1430 const unsigned char* outend;
1431 const unsigned char* outstart = out;
1432 const unsigned char* instart = in;
1433 const unsigned char* inend;
1434 unsigned int c, d;
1435 int trailing;
1436
1437 if (in == NULL) {
1438 /*
1439 * initialization nothing to do
1440 */
1441 *outlen = 0;
1442 *inlen = 0;
1443 return(0);
1444 }
1445 inend = in + (*inlen);
1446 outend = out + (*outlen);
1447 while (in < inend) {
1448 d = *in++;
1449 if (d < 0x80) { c= d; trailing= 0; }
1450 else if (d < 0xC0) {
1451 /* trailing byte in leading position */
1452 *outlen = out - outstart;
1453 *inlen = processed - instart;
1454 return(-2);
1455 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1456 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1457 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1458 else {
1459 /* no chance for this in Ascii */
1460 *outlen = out - outstart;
1461 *inlen = processed - instart;
1462 return(-2);
1463 }
1464
1465 if (inend - in < trailing) {
1466 break;
1467 }
1468
1469 for ( ; trailing; trailing--) {
1470 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1471 break;
1472 c <<= 6;
1473 c |= d & 0x3F;
1474 }
1475
1476 /* assertion: c is a single UTF-4 value */
1477 if (c < 0x80) {
1478 if (out + 1 >= outend)
1479 break;
1480 *out++ = c;
1481 } else {
1482 int len;
1483 htmlEntityDescPtr ent;
1484
1485 /*
1486 * Try to lookup a predefined HTML entity for it
1487 */
1488
1489 ent = htmlEntityValueLookup(c);
1490 if (ent == NULL) {
1491 /* no chance for this in Ascii */
1492 *outlen = out - outstart;
1493 *inlen = processed - instart;
1494 return(-2);
1495 }
1496 len = strlen(ent->name);
1497 if (out + 2 + len >= outend)
1498 break;
1499 *out++ = '&';
1500 memcpy(out, ent->name, len);
1501 out += len;
1502 *out++ = ';';
1503 }
1504 processed = in;
1505 }
1506 *outlen = out - outstart;
1507 *inlen = processed - instart;
1508 return(0);
1509}
1510
1511/**
1512 * htmlEncodeEntities:
1513 * @out: a pointer to an array of bytes to store the result
1514 * @outlen: the length of @out
1515 * @in: a pointer to an array of UTF-8 chars
1516 * @inlen: the length of @in
1517 * @quoteChar: the quote character to escape (' or ") or zero.
1518 *
1519 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1520 * plus HTML entities block of chars out.
1521 *
1522 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1523 * The value of @inlen after return is the number of octets consumed
1524 * as the return value is positive, else unpredictiable.
1525 * The value of @outlen after return is the number of octets consumed.
1526 */
1527int
1528htmlEncodeEntities(unsigned char* out, int *outlen,
1529 const unsigned char* in, int *inlen, int quoteChar) {
1530 const unsigned char* processed = in;
1531 const unsigned char* outend = out + (*outlen);
1532 const unsigned char* outstart = out;
1533 const unsigned char* instart = in;
1534 const unsigned char* inend = in + (*inlen);
1535 unsigned int c, d;
1536 int trailing;
1537
1538 while (in < inend) {
1539 d = *in++;
1540 if (d < 0x80) { c= d; trailing= 0; }
1541 else if (d < 0xC0) {
1542 /* trailing byte in leading position */
1543 *outlen = out - outstart;
1544 *inlen = processed - instart;
1545 return(-2);
1546 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1547 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1548 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1549 else {
1550 /* no chance for this in Ascii */
1551 *outlen = out - outstart;
1552 *inlen = processed - instart;
1553 return(-2);
1554 }
1555
1556 if (inend - in < trailing)
1557 break;
1558
1559 while (trailing--) {
1560 if (((d= *in++) & 0xC0) != 0x80) {
1561 *outlen = out - outstart;
1562 *inlen = processed - instart;
1563 return(-2);
1564 }
1565 c <<= 6;
1566 c |= d & 0x3F;
1567 }
1568
1569 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001570 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1571 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001572 if (out >= outend)
1573 break;
1574 *out++ = c;
1575 } else {
1576 htmlEntityDescPtr ent;
1577 const char *cp;
1578 char nbuf[16];
1579 int len;
1580
1581 /*
1582 * Try to lookup a predefined HTML entity for it
1583 */
1584 ent = htmlEntityValueLookup(c);
1585 if (ent == NULL) {
1586 sprintf(nbuf, "#%u", c);
1587 cp = nbuf;
1588 }
1589 else
1590 cp = ent->name;
1591 len = strlen(cp);
1592 if (out + 2 + len > outend)
1593 break;
1594 *out++ = '&';
1595 memcpy(out, cp, len);
1596 out += len;
1597 *out++ = ';';
1598 }
1599 processed = in;
1600 }
1601 *outlen = out - outstart;
1602 *inlen = processed - instart;
1603 return(0);
1604}
1605
1606/**
1607 * htmlDecodeEntities:
1608 * @ctxt: the parser context
1609 * @len: the len to decode (in bytes !), -1 for no size limit
1610 * @end: an end marker xmlChar, 0 if none
1611 * @end2: an end marker xmlChar, 0 if none
1612 * @end3: an end marker xmlChar, 0 if none
1613 *
1614 * Subtitute the HTML entities by their value
1615 *
1616 * DEPRECATED !!!!
1617 *
1618 * Returns A newly allocated string with the substitution done. The caller
1619 * must deallocate it !
1620 */
1621xmlChar *
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00001622htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1623 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001624 static int deprecated = 0;
1625 if (!deprecated) {
1626 xmlGenericError(xmlGenericErrorContext,
1627 "htmlDecodeEntities() deprecated function reached\n");
1628 deprecated = 1;
1629 }
1630 return(NULL);
1631#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00001632 xmlChar *name = NULL;
1633 xmlChar *buffer = NULL;
1634 unsigned int buffer_size = 0;
1635 unsigned int nbchars = 0;
1636 htmlEntityDescPtr ent;
1637 unsigned int max = (unsigned int) len;
1638 int c,l;
1639
1640 if (ctxt->depth > 40) {
1641 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1642 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1643 ctxt->sax->error(ctxt->userData,
1644 "Detected entity reference loop\n");
1645 ctxt->wellFormed = 0;
1646 ctxt->disableSAX = 1;
1647 return(NULL);
1648 }
1649
1650 /*
1651 * allocate a translation buffer.
1652 */
1653 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1654 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1655 if (buffer == NULL) {
1656 perror("xmlDecodeEntities: malloc failed");
1657 return(NULL);
1658 }
1659
1660 /*
1661 * Ok loop until we reach one of the ending char or a size limit.
1662 */
1663 c = CUR_CHAR(l);
1664 while ((nbchars < max) && (c != end) &&
1665 (c != end2) && (c != end3)) {
1666
1667 if (c == 0) break;
1668 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1669 int val = htmlParseCharRef(ctxt);
1670 COPY_BUF(0,buffer,nbchars,val);
1671 NEXTL(l);
1672 } else if ((c == '&') && (ctxt->token != '&')) {
1673 ent = htmlParseEntityRef(ctxt, &name);
1674 if (name != NULL) {
1675 if (ent != NULL) {
1676 int val = ent->value;
1677 COPY_BUF(0,buffer,nbchars,val);
1678 NEXTL(l);
1679 } else {
1680 const xmlChar *cur = name;
1681
1682 buffer[nbchars++] = '&';
1683 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1684 growBuffer(buffer);
1685 }
1686 while (*cur != 0) {
1687 buffer[nbchars++] = *cur++;
1688 }
1689 buffer[nbchars++] = ';';
1690 }
1691 }
1692 } else {
1693 COPY_BUF(l,buffer,nbchars,c);
1694 NEXTL(l);
1695 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1696 growBuffer(buffer);
1697 }
1698 }
1699 c = CUR_CHAR(l);
1700 }
1701 buffer[nbchars++] = 0;
1702 return(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001703#endif
Owen Taylor3473f882001-02-23 17:55:21 +00001704}
1705
1706/************************************************************************
1707 * *
1708 * Commodity functions to handle streams *
1709 * *
1710 ************************************************************************/
1711
1712/**
Owen Taylor3473f882001-02-23 17:55:21 +00001713 * htmlNewInputStream:
1714 * @ctxt: an HTML parser context
1715 *
1716 * Create a new input stream structure
1717 * Returns the new input stream or NULL
1718 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001719static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001720htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1721 htmlParserInputPtr input;
1722
1723 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1724 if (input == NULL) {
1725 ctxt->errNo = XML_ERR_NO_MEMORY;
1726 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1727 ctxt->sax->error(ctxt->userData,
1728 "malloc: couldn't allocate a new input stream\n");
1729 return(NULL);
1730 }
1731 memset(input, 0, sizeof(htmlParserInput));
1732 input->filename = NULL;
1733 input->directory = NULL;
1734 input->base = NULL;
1735 input->cur = NULL;
1736 input->buf = NULL;
1737 input->line = 1;
1738 input->col = 1;
1739 input->buf = NULL;
1740 input->free = NULL;
1741 input->version = NULL;
1742 input->consumed = 0;
1743 input->length = 0;
1744 return(input);
1745}
1746
1747
1748/************************************************************************
1749 * *
1750 * Commodity functions, cleanup needed ? *
1751 * *
1752 ************************************************************************/
1753
1754/**
1755 * areBlanks:
1756 * @ctxt: an HTML parser context
1757 * @str: a xmlChar *
1758 * @len: the size of @str
1759 *
1760 * Is this a sequence of blank chars that one can ignore ?
1761 *
1762 * Returns 1 if ignorable 0 otherwise.
1763 */
1764
1765static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1766 int i;
1767 xmlNodePtr lastChild;
1768
1769 for (i = 0;i < len;i++)
1770 if (!(IS_BLANK(str[i]))) return(0);
1771
1772 if (CUR == 0) return(1);
1773 if (CUR != '<') return(0);
1774 if (ctxt->name == NULL)
1775 return(1);
1776 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1777 return(1);
1778 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1779 return(1);
1780 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1781 return(1);
1782 if (ctxt->node == NULL) return(0);
1783 lastChild = xmlGetLastChild(ctxt->node);
1784 if (lastChild == NULL) {
1785 if (ctxt->node->content != NULL) return(0);
1786 } else if (xmlNodeIsText(lastChild)) {
1787 return(0);
1788 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1789 return(0);
1790 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1791 return(0);
1792 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1793 return(0);
1794 }
1795 return(1);
1796}
1797
1798/**
Owen Taylor3473f882001-02-23 17:55:21 +00001799 * htmlNewDocNoDtD:
1800 * @URI: URI for the dtd, or NULL
1801 * @ExternalID: the external ID of the DTD, or NULL
1802 *
1803 * Returns a new document, do not intialize the DTD if not provided
1804 */
1805htmlDocPtr
1806htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1807 xmlDocPtr cur;
1808
1809 /*
1810 * Allocate a new document and fill the fields.
1811 */
1812 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1813 if (cur == NULL) {
1814 xmlGenericError(xmlGenericErrorContext,
1815 "xmlNewDoc : malloc failed\n");
1816 return(NULL);
1817 }
1818 memset(cur, 0, sizeof(xmlDoc));
1819
1820 cur->type = XML_HTML_DOCUMENT_NODE;
1821 cur->version = NULL;
1822 cur->intSubset = NULL;
1823 if ((ExternalID != NULL) ||
1824 (URI != NULL))
1825 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1826 cur->doc = cur;
1827 cur->name = NULL;
1828 cur->children = NULL;
1829 cur->extSubset = NULL;
1830 cur->oldNs = NULL;
1831 cur->encoding = NULL;
1832 cur->standalone = 1;
1833 cur->compression = 0;
1834 cur->ids = NULL;
1835 cur->refs = NULL;
1836#ifndef XML_WITHOUT_CORBA
1837 cur->_private = NULL;
1838#endif
1839 return(cur);
1840}
1841
1842/**
1843 * htmlNewDoc:
1844 * @URI: URI for the dtd, or NULL
1845 * @ExternalID: the external ID of the DTD, or NULL
1846 *
1847 * Returns a new document
1848 */
1849htmlDocPtr
1850htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1851 if ((URI == NULL) && (ExternalID == NULL))
1852 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00001853 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1854 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00001855
1856 return(htmlNewDocNoDtD(URI, ExternalID));
1857}
1858
1859
1860/************************************************************************
1861 * *
1862 * The parser itself *
1863 * Relates to http://www.w3.org/TR/html40 *
1864 * *
1865 ************************************************************************/
1866
1867/************************************************************************
1868 * *
1869 * The parser itself *
1870 * *
1871 ************************************************************************/
1872
1873/**
1874 * htmlParseHTMLName:
1875 * @ctxt: an HTML parser context
1876 *
1877 * parse an HTML tag or attribute name, note that we convert it to lowercase
1878 * since HTML names are not case-sensitive.
1879 *
1880 * Returns the Tag Name parsed or NULL
1881 */
1882
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001883static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001884htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1885 xmlChar *ret = NULL;
1886 int i = 0;
1887 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1888
1889 if (!IS_LETTER(CUR) && (CUR != '_') &&
1890 (CUR != ':')) return(NULL);
1891
1892 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1893 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1894 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1895 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1896 else loc[i] = CUR;
1897 i++;
1898
1899 NEXT;
1900 }
1901
1902 ret = xmlStrndup(loc, i);
1903
1904 return(ret);
1905}
1906
1907/**
1908 * htmlParseName:
1909 * @ctxt: an HTML parser context
1910 *
1911 * parse an HTML name, this routine is case sensistive.
1912 *
1913 * Returns the Name parsed or NULL
1914 */
1915
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001916static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001917htmlParseName(htmlParserCtxtPtr ctxt) {
1918 xmlChar buf[HTML_MAX_NAMELEN];
1919 int len = 0;
1920
1921 GROW;
1922 if (!IS_LETTER(CUR) && (CUR != '_')) {
1923 return(NULL);
1924 }
1925
1926 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1927 (CUR == '.') || (CUR == '-') ||
1928 (CUR == '_') || (CUR == ':') ||
1929 (IS_COMBINING(CUR)) ||
1930 (IS_EXTENDER(CUR))) {
1931 buf[len++] = CUR;
1932 NEXT;
1933 if (len >= HTML_MAX_NAMELEN) {
1934 xmlGenericError(xmlGenericErrorContext,
1935 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1936 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1937 (CUR == '.') || (CUR == '-') ||
1938 (CUR == '_') || (CUR == ':') ||
1939 (IS_COMBINING(CUR)) ||
1940 (IS_EXTENDER(CUR)))
1941 NEXT;
1942 break;
1943 }
1944 }
1945 return(xmlStrndup(buf, len));
1946}
1947
1948/**
1949 * htmlParseHTMLAttribute:
1950 * @ctxt: an HTML parser context
1951 * @stop: a char stop value
1952 *
1953 * parse an HTML attribute value till the stop (quote), if
1954 * stop is 0 then it stops at the first space
1955 *
1956 * Returns the attribute parsed or NULL
1957 */
1958
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001959static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00001960htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1961 xmlChar *buffer = NULL;
1962 int buffer_size = 0;
1963 xmlChar *out = NULL;
1964 xmlChar *name = NULL;
1965
1966 xmlChar *cur = NULL;
1967 htmlEntityDescPtr ent;
1968
1969 /*
1970 * allocate a translation buffer.
1971 */
1972 buffer_size = HTML_PARSER_BUFFER_SIZE;
1973 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1974 if (buffer == NULL) {
1975 perror("htmlParseHTMLAttribute: malloc failed");
1976 return(NULL);
1977 }
1978 out = buffer;
1979
1980 /*
1981 * Ok loop until we reach one of the ending chars
1982 */
1983 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1984 if ((stop == 0) && (IS_BLANK(CUR))) break;
1985 if (CUR == '&') {
1986 if (NXT(1) == '#') {
1987 unsigned int c;
1988 int bits;
1989
1990 c = htmlParseCharRef(ctxt);
1991 if (c < 0x80)
1992 { *out++ = c; bits= -6; }
1993 else if (c < 0x800)
1994 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1995 else if (c < 0x10000)
1996 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1997 else
1998 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1999
2000 for ( ; bits >= 0; bits-= 6) {
2001 *out++ = ((c >> bits) & 0x3F) | 0x80;
2002 }
2003 } else {
2004 ent = htmlParseEntityRef(ctxt, &name);
2005 if (name == NULL) {
2006 *out++ = '&';
2007 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002008 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002009
2010 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002011 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002012 }
2013 } else if (ent == NULL) {
2014 *out++ = '&';
2015 cur = name;
2016 while (*cur != 0) {
2017 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002018 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002019
2020 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002021 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002022 }
2023 *out++ = *cur++;
2024 }
2025 xmlFree(name);
2026 } else {
2027 unsigned int c;
2028 int bits;
2029
2030 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002031 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002032
2033 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002034 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002035 }
2036 c = (xmlChar)ent->value;
2037 if (c < 0x80)
2038 { *out++ = c; bits= -6; }
2039 else if (c < 0x800)
2040 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2041 else if (c < 0x10000)
2042 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2043 else
2044 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2045
2046 for ( ; bits >= 0; bits-= 6) {
2047 *out++ = ((c >> bits) & 0x3F) | 0x80;
2048 }
2049 xmlFree(name);
2050 }
2051 }
2052 } else {
2053 unsigned int c;
2054 int bits, l;
2055
2056 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002057 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002058
2059 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002060 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002061 }
2062 c = CUR_CHAR(l);
2063 if (c < 0x80)
2064 { *out++ = c; bits= -6; }
2065 else if (c < 0x800)
2066 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2067 else if (c < 0x10000)
2068 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2069 else
2070 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2071
2072 for ( ; bits >= 0; bits-= 6) {
2073 *out++ = ((c >> bits) & 0x3F) | 0x80;
2074 }
2075 NEXT;
2076 }
2077 }
2078 *out++ = 0;
2079 return(buffer);
2080}
2081
2082/**
Owen Taylor3473f882001-02-23 17:55:21 +00002083 * htmlParseEntityRef:
2084 * @ctxt: an HTML parser context
2085 * @str: location to store the entity name
2086 *
2087 * parse an HTML ENTITY references
2088 *
2089 * [68] EntityRef ::= '&' Name ';'
2090 *
2091 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2092 * if non-NULL *str will have to be freed by the caller.
2093 */
2094htmlEntityDescPtr
2095htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2096 xmlChar *name;
2097 htmlEntityDescPtr ent = NULL;
2098 *str = NULL;
2099
2100 if (CUR == '&') {
2101 NEXT;
2102 name = htmlParseName(ctxt);
2103 if (name == NULL) {
2104 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2105 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2106 ctxt->wellFormed = 0;
2107 } else {
2108 GROW;
2109 if (CUR == ';') {
2110 *str = name;
2111
2112 /*
2113 * Lookup the entity in the table.
2114 */
2115 ent = htmlEntityLookup(name);
2116 if (ent != NULL) /* OK that's ugly !!! */
2117 NEXT;
2118 } else {
2119 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2120 ctxt->sax->error(ctxt->userData,
2121 "htmlParseEntityRef: expecting ';'\n");
2122 *str = name;
2123 }
2124 }
2125 }
2126 return(ent);
2127}
2128
2129/**
2130 * htmlParseAttValue:
2131 * @ctxt: an HTML parser context
2132 *
2133 * parse a value for an attribute
2134 * Note: the parser won't do substitution of entities here, this
2135 * will be handled later in xmlStringGetNodeList, unless it was
2136 * asked for ctxt->replaceEntities != 0
2137 *
2138 * Returns the AttValue parsed or NULL.
2139 */
2140
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002141static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002142htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2143 xmlChar *ret = NULL;
2144
2145 if (CUR == '"') {
2146 NEXT;
2147 ret = htmlParseHTMLAttribute(ctxt, '"');
2148 if (CUR != '"') {
2149 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2150 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2151 ctxt->wellFormed = 0;
2152 } else
2153 NEXT;
2154 } else if (CUR == '\'') {
2155 NEXT;
2156 ret = htmlParseHTMLAttribute(ctxt, '\'');
2157 if (CUR != '\'') {
2158 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2159 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2160 ctxt->wellFormed = 0;
2161 } else
2162 NEXT;
2163 } else {
2164 /*
2165 * That's an HTMLism, the attribute value may not be quoted
2166 */
2167 ret = htmlParseHTMLAttribute(ctxt, 0);
2168 if (ret == NULL) {
2169 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2170 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2171 ctxt->wellFormed = 0;
2172 }
2173 }
2174 return(ret);
2175}
2176
2177/**
2178 * htmlParseSystemLiteral:
2179 * @ctxt: an HTML parser context
2180 *
2181 * parse an HTML Literal
2182 *
2183 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2184 *
2185 * Returns the SystemLiteral parsed or NULL
2186 */
2187
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002188static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002189htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2190 const xmlChar *q;
2191 xmlChar *ret = NULL;
2192
2193 if (CUR == '"') {
2194 NEXT;
2195 q = CUR_PTR;
2196 while ((IS_CHAR(CUR)) && (CUR != '"'))
2197 NEXT;
2198 if (!IS_CHAR(CUR)) {
2199 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2200 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2201 ctxt->wellFormed = 0;
2202 } else {
2203 ret = xmlStrndup(q, CUR_PTR - q);
2204 NEXT;
2205 }
2206 } else if (CUR == '\'') {
2207 NEXT;
2208 q = CUR_PTR;
2209 while ((IS_CHAR(CUR)) && (CUR != '\''))
2210 NEXT;
2211 if (!IS_CHAR(CUR)) {
2212 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2213 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2214 ctxt->wellFormed = 0;
2215 } else {
2216 ret = xmlStrndup(q, CUR_PTR - q);
2217 NEXT;
2218 }
2219 } else {
2220 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2221 ctxt->sax->error(ctxt->userData,
2222 "SystemLiteral \" or ' expected\n");
2223 ctxt->wellFormed = 0;
2224 }
2225
2226 return(ret);
2227}
2228
2229/**
2230 * htmlParsePubidLiteral:
2231 * @ctxt: an HTML parser context
2232 *
2233 * parse an HTML public literal
2234 *
2235 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2236 *
2237 * Returns the PubidLiteral parsed or NULL.
2238 */
2239
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002240static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002241htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2242 const xmlChar *q;
2243 xmlChar *ret = NULL;
2244 /*
2245 * Name ::= (Letter | '_') (NameChar)*
2246 */
2247 if (CUR == '"') {
2248 NEXT;
2249 q = CUR_PTR;
2250 while (IS_PUBIDCHAR(CUR)) NEXT;
2251 if (CUR != '"') {
2252 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2253 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2254 ctxt->wellFormed = 0;
2255 } else {
2256 ret = xmlStrndup(q, CUR_PTR - q);
2257 NEXT;
2258 }
2259 } else if (CUR == '\'') {
2260 NEXT;
2261 q = CUR_PTR;
2262 while ((IS_LETTER(CUR)) && (CUR != '\''))
2263 NEXT;
2264 if (!IS_LETTER(CUR)) {
2265 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2266 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2267 ctxt->wellFormed = 0;
2268 } else {
2269 ret = xmlStrndup(q, CUR_PTR - q);
2270 NEXT;
2271 }
2272 } else {
2273 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2274 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2275 ctxt->wellFormed = 0;
2276 }
2277
2278 return(ret);
2279}
2280
2281/**
2282 * htmlParseScript:
2283 * @ctxt: an HTML parser context
2284 *
2285 * parse the content of an HTML SCRIPT or STYLE element
2286 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2287 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2288 * http://www.w3.org/TR/html4/types.html#type-script
2289 * http://www.w3.org/TR/html4/types.html#h-6.15
2290 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2291 *
2292 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2293 * element and the value of intrinsic event attributes. User agents must
2294 * not evaluate script data as HTML markup but instead must pass it on as
2295 * data to a script engine.
2296 * NOTES:
2297 * - The content is passed like CDATA
2298 * - the attributes for style and scripting "onXXX" are also described
2299 * as CDATA but SGML allows entities references in attributes so their
2300 * processing is identical as other attributes
2301 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002302static void
Owen Taylor3473f882001-02-23 17:55:21 +00002303htmlParseScript(htmlParserCtxtPtr ctxt) {
2304 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2305 int nbchar = 0;
2306 xmlChar cur;
2307
2308 SHRINK;
2309 cur = CUR;
2310 while (IS_CHAR(cur)) {
2311 if ((cur == '<') && (NXT(1) == '/')) {
2312 /*
2313 * One should break here, the specification is clear:
2314 * Authors should therefore escape "</" within the content.
2315 * Escape mechanisms are specific to each scripting or
2316 * style sheet language.
2317 */
2318 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2319 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2320 break; /* while */
2321 }
2322 buf[nbchar++] = cur;
2323 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2324 if (ctxt->sax->cdataBlock!= NULL) {
2325 /*
2326 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2327 */
2328 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2329 }
2330 nbchar = 0;
2331 }
2332 NEXT;
2333 cur = CUR;
2334 }
2335 if (!(IS_CHAR(cur))) {
2336 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2337 ctxt->sax->error(ctxt->userData,
2338 "Invalid char in CDATA 0x%X\n", cur);
2339 ctxt->wellFormed = 0;
2340 NEXT;
2341 }
2342
2343 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2344 if (ctxt->sax->cdataBlock!= NULL) {
2345 /*
2346 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2347 */
2348 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2349 }
2350 }
2351}
2352
2353
2354/**
2355 * htmlParseCharData:
2356 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002357 *
2358 * parse a CharData section.
2359 * if we are within a CDATA section ']]>' marks an end of section.
2360 *
2361 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2362 */
2363
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002364static void
2365htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002366 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2367 int nbchar = 0;
2368 int cur, l;
2369
2370 SHRINK;
2371 cur = CUR_CHAR(l);
2372 while (((cur != '<') || (ctxt->token == '<')) &&
2373 ((cur != '&') || (ctxt->token == '&')) &&
2374 (IS_CHAR(cur))) {
2375 COPY_BUF(l,buf,nbchar,cur);
2376 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2377 /*
2378 * Ok the segment is to be consumed as chars.
2379 */
2380 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2381 if (areBlanks(ctxt, buf, nbchar)) {
2382 if (ctxt->sax->ignorableWhitespace != NULL)
2383 ctxt->sax->ignorableWhitespace(ctxt->userData,
2384 buf, nbchar);
2385 } else {
2386 htmlCheckParagraph(ctxt);
2387 if (ctxt->sax->characters != NULL)
2388 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2389 }
2390 }
2391 nbchar = 0;
2392 }
2393 NEXTL(l);
2394 cur = CUR_CHAR(l);
2395 }
2396 if (nbchar != 0) {
2397 /*
2398 * Ok the segment is to be consumed as chars.
2399 */
2400 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2401 if (areBlanks(ctxt, buf, nbchar)) {
2402 if (ctxt->sax->ignorableWhitespace != NULL)
2403 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2404 } else {
2405 htmlCheckParagraph(ctxt);
2406 if (ctxt->sax->characters != NULL)
2407 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2408 }
2409 }
2410 }
2411}
2412
2413/**
2414 * htmlParseExternalID:
2415 * @ctxt: an HTML parser context
2416 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002417 *
2418 * Parse an External ID or a Public ID
2419 *
Owen Taylor3473f882001-02-23 17:55:21 +00002420 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2421 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2422 *
2423 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2424 *
2425 * Returns the function returns SystemLiteral and in the second
2426 * case publicID receives PubidLiteral, is strict is off
2427 * it is possible to return NULL and have publicID set.
2428 */
2429
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002430static xmlChar *
2431htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002432 xmlChar *URI = NULL;
2433
2434 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2435 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2436 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2437 SKIP(6);
2438 if (!IS_BLANK(CUR)) {
2439 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2440 ctxt->sax->error(ctxt->userData,
2441 "Space required after 'SYSTEM'\n");
2442 ctxt->wellFormed = 0;
2443 }
2444 SKIP_BLANKS;
2445 URI = htmlParseSystemLiteral(ctxt);
2446 if (URI == NULL) {
2447 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2448 ctxt->sax->error(ctxt->userData,
2449 "htmlParseExternalID: SYSTEM, no URI\n");
2450 ctxt->wellFormed = 0;
2451 }
2452 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2453 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2454 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2455 SKIP(6);
2456 if (!IS_BLANK(CUR)) {
2457 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2458 ctxt->sax->error(ctxt->userData,
2459 "Space required after 'PUBLIC'\n");
2460 ctxt->wellFormed = 0;
2461 }
2462 SKIP_BLANKS;
2463 *publicID = htmlParsePubidLiteral(ctxt);
2464 if (*publicID == NULL) {
2465 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2466 ctxt->sax->error(ctxt->userData,
2467 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2468 ctxt->wellFormed = 0;
2469 }
2470 SKIP_BLANKS;
2471 if ((CUR == '"') || (CUR == '\'')) {
2472 URI = htmlParseSystemLiteral(ctxt);
2473 }
2474 }
2475 return(URI);
2476}
2477
2478/**
2479 * htmlParseComment:
2480 * @ctxt: an HTML parser context
2481 *
2482 * Parse an XML (SGML) comment <!-- .... -->
2483 *
2484 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2485 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002486static void
Owen Taylor3473f882001-02-23 17:55:21 +00002487htmlParseComment(htmlParserCtxtPtr ctxt) {
2488 xmlChar *buf = NULL;
2489 int len;
2490 int size = HTML_PARSER_BUFFER_SIZE;
2491 int q, ql;
2492 int r, rl;
2493 int cur, l;
2494 xmlParserInputState state;
2495
2496 /*
2497 * Check that there is a comment right here.
2498 */
2499 if ((RAW != '<') || (NXT(1) != '!') ||
2500 (NXT(2) != '-') || (NXT(3) != '-')) return;
2501
2502 state = ctxt->instate;
2503 ctxt->instate = XML_PARSER_COMMENT;
2504 SHRINK;
2505 SKIP(4);
2506 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2507 if (buf == NULL) {
2508 xmlGenericError(xmlGenericErrorContext,
2509 "malloc of %d byte failed\n", size);
2510 ctxt->instate = state;
2511 return;
2512 }
2513 q = CUR_CHAR(ql);
2514 NEXTL(ql);
2515 r = CUR_CHAR(rl);
2516 NEXTL(rl);
2517 cur = CUR_CHAR(l);
2518 len = 0;
2519 while (IS_CHAR(cur) &&
2520 ((cur != '>') ||
2521 (r != '-') || (q != '-'))) {
2522 if (len + 5 >= size) {
2523 size *= 2;
2524 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2525 if (buf == NULL) {
2526 xmlGenericError(xmlGenericErrorContext,
2527 "realloc of %d byte failed\n", size);
2528 ctxt->instate = state;
2529 return;
2530 }
2531 }
2532 COPY_BUF(ql,buf,len,q);
2533 q = r;
2534 ql = rl;
2535 r = cur;
2536 rl = l;
2537 NEXTL(l);
2538 cur = CUR_CHAR(l);
2539 if (cur == 0) {
2540 SHRINK;
2541 GROW;
2542 cur = CUR_CHAR(l);
2543 }
2544 }
2545 buf[len] = 0;
2546 if (!IS_CHAR(cur)) {
2547 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2548 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2549 ctxt->sax->error(ctxt->userData,
2550 "Comment not terminated \n<!--%.50s\n", buf);
2551 ctxt->wellFormed = 0;
2552 xmlFree(buf);
2553 } else {
2554 NEXT;
2555 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2556 (!ctxt->disableSAX))
2557 ctxt->sax->comment(ctxt->userData, buf);
2558 xmlFree(buf);
2559 }
2560 ctxt->instate = state;
2561}
2562
2563/**
2564 * htmlParseCharRef:
2565 * @ctxt: an HTML parser context
2566 *
2567 * parse Reference declarations
2568 *
2569 * [66] CharRef ::= '&#' [0-9]+ ';' |
2570 * '&#x' [0-9a-fA-F]+ ';'
2571 *
2572 * Returns the value parsed (as an int)
2573 */
2574int
2575htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2576 int val = 0;
2577
2578 if ((CUR == '&') && (NXT(1) == '#') &&
2579 (NXT(2) == 'x')) {
2580 SKIP(3);
2581 while (CUR != ';') {
2582 if ((CUR >= '0') && (CUR <= '9'))
2583 val = val * 16 + (CUR - '0');
2584 else if ((CUR >= 'a') && (CUR <= 'f'))
2585 val = val * 16 + (CUR - 'a') + 10;
2586 else if ((CUR >= 'A') && (CUR <= 'F'))
2587 val = val * 16 + (CUR - 'A') + 10;
2588 else {
2589 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2590 ctxt->sax->error(ctxt->userData,
2591 "htmlParseCharRef: invalid hexadecimal value\n");
2592 ctxt->wellFormed = 0;
2593 return(0);
2594 }
2595 NEXT;
2596 }
2597 if (CUR == ';')
2598 NEXT;
2599 } else if ((CUR == '&') && (NXT(1) == '#')) {
2600 SKIP(2);
2601 while (CUR != ';') {
2602 if ((CUR >= '0') && (CUR <= '9'))
2603 val = val * 10 + (CUR - '0');
2604 else {
2605 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2606 ctxt->sax->error(ctxt->userData,
2607 "htmlParseCharRef: invalid decimal value\n");
2608 ctxt->wellFormed = 0;
2609 return(0);
2610 }
2611 NEXT;
2612 }
2613 if (CUR == ';')
2614 NEXT;
2615 } else {
2616 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2617 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2618 ctxt->wellFormed = 0;
2619 }
2620 /*
2621 * Check the value IS_CHAR ...
2622 */
2623 if (IS_CHAR(val)) {
2624 return(val);
2625 } else {
2626 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2627 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2628 val);
2629 ctxt->wellFormed = 0;
2630 }
2631 return(0);
2632}
2633
2634
2635/**
2636 * htmlParseDocTypeDecl :
2637 * @ctxt: an HTML parser context
2638 *
2639 * parse a DOCTYPE declaration
2640 *
2641 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2642 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2643 */
2644
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002645static void
Owen Taylor3473f882001-02-23 17:55:21 +00002646htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2647 xmlChar *name;
2648 xmlChar *ExternalID = NULL;
2649 xmlChar *URI = NULL;
2650
2651 /*
2652 * We know that '<!DOCTYPE' has been detected.
2653 */
2654 SKIP(9);
2655
2656 SKIP_BLANKS;
2657
2658 /*
2659 * Parse the DOCTYPE name.
2660 */
2661 name = htmlParseName(ctxt);
2662 if (name == NULL) {
2663 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2664 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2665 ctxt->wellFormed = 0;
2666 }
2667 /*
2668 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2669 */
2670
2671 SKIP_BLANKS;
2672
2673 /*
2674 * Check for SystemID and ExternalID
2675 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002676 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002677 SKIP_BLANKS;
2678
2679 /*
2680 * We should be at the end of the DOCTYPE declaration.
2681 */
2682 if (CUR != '>') {
2683 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2684 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2685 ctxt->wellFormed = 0;
2686 /* We shouldn't try to resynchronize ... */
2687 }
2688 NEXT;
2689
2690 /*
2691 * Create or update the document accordingly to the DOCTYPE
2692 */
2693 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2694 (!ctxt->disableSAX))
2695 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2696
2697 /*
2698 * Cleanup, since we don't use all those identifiers
2699 */
2700 if (URI != NULL) xmlFree(URI);
2701 if (ExternalID != NULL) xmlFree(ExternalID);
2702 if (name != NULL) xmlFree(name);
2703}
2704
2705/**
2706 * htmlParseAttribute:
2707 * @ctxt: an HTML parser context
2708 * @value: a xmlChar ** used to store the value of the attribute
2709 *
2710 * parse an attribute
2711 *
2712 * [41] Attribute ::= Name Eq AttValue
2713 *
2714 * [25] Eq ::= S? '=' S?
2715 *
2716 * With namespace:
2717 *
2718 * [NS 11] Attribute ::= QName Eq AttValue
2719 *
2720 * Also the case QName == xmlns:??? is handled independently as a namespace
2721 * definition.
2722 *
2723 * Returns the attribute name, and the value in *value.
2724 */
2725
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002726static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002727htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2728 xmlChar *name, *val = NULL;
2729
2730 *value = NULL;
2731 name = htmlParseHTMLName(ctxt);
2732 if (name == NULL) {
2733 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2734 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2735 ctxt->wellFormed = 0;
2736 return(NULL);
2737 }
2738
2739 /*
2740 * read the value
2741 */
2742 SKIP_BLANKS;
2743 if (CUR == '=') {
2744 NEXT;
2745 SKIP_BLANKS;
2746 val = htmlParseAttValue(ctxt);
2747 /******
2748 } else {
2749 * TODO : some attribute must have values, some may not
2750 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2751 ctxt->sax->warning(ctxt->userData,
2752 "No value for attribute %s\n", name); */
2753 }
2754
2755 *value = val;
2756 return(name);
2757}
2758
2759/**
2760 * htmlCheckEncoding:
2761 * @ctxt: an HTML parser context
2762 * @attvalue: the attribute value
2763 *
2764 * Checks an http-equiv attribute from a Meta tag to detect
2765 * the encoding
2766 * If a new encoding is detected the parser is switched to decode
2767 * it and pass UTF8
2768 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002769static void
Owen Taylor3473f882001-02-23 17:55:21 +00002770htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2771 const xmlChar *encoding;
2772
2773 if ((ctxt == NULL) || (attvalue == NULL))
2774 return;
2775
2776 /* do not change encoding */
2777 if (ctxt->input->encoding != NULL)
2778 return;
2779
2780 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2781 if (encoding != NULL) {
2782 encoding += 8;
2783 } else {
2784 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2785 if (encoding != NULL)
2786 encoding += 9;
2787 }
2788 if (encoding != NULL) {
2789 xmlCharEncoding enc;
2790 xmlCharEncodingHandlerPtr handler;
2791
2792 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2793
2794 if (ctxt->input->encoding != NULL)
2795 xmlFree((xmlChar *) ctxt->input->encoding);
2796 ctxt->input->encoding = xmlStrdup(encoding);
2797
2798 enc = xmlParseCharEncoding((const char *) encoding);
2799 /*
2800 * registered set of known encodings
2801 */
2802 if (enc != XML_CHAR_ENCODING_ERROR) {
2803 xmlSwitchEncoding(ctxt, enc);
2804 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2805 } else {
2806 /*
2807 * fallback for unknown encodings
2808 */
2809 handler = xmlFindCharEncodingHandler((const char *) encoding);
2810 if (handler != NULL) {
2811 xmlSwitchToEncoding(ctxt, handler);
2812 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2813 } else {
2814 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2815 }
2816 }
2817
2818 if ((ctxt->input->buf != NULL) &&
2819 (ctxt->input->buf->encoder != NULL) &&
2820 (ctxt->input->buf->raw != NULL) &&
2821 (ctxt->input->buf->buffer != NULL)) {
2822 int nbchars;
2823 int processed;
2824
2825 /*
2826 * convert as much as possible to the parser reading buffer.
2827 */
2828 processed = ctxt->input->cur - ctxt->input->base;
2829 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2830 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2831 ctxt->input->buf->buffer,
2832 ctxt->input->buf->raw);
2833 if (nbchars < 0) {
2834 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2835 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2836 ctxt->sax->error(ctxt->userData,
2837 "htmlCheckEncoding: encoder error\n");
2838 }
2839 ctxt->input->base =
2840 ctxt->input->cur = ctxt->input->buf->buffer->content;
2841 }
2842 }
2843}
2844
2845/**
2846 * htmlCheckMeta:
2847 * @ctxt: an HTML parser context
2848 * @atts: the attributes values
2849 *
2850 * Checks an attributes from a Meta tag
2851 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002852static void
Owen Taylor3473f882001-02-23 17:55:21 +00002853htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2854 int i;
2855 const xmlChar *att, *value;
2856 int http = 0;
2857 const xmlChar *content = NULL;
2858
2859 if ((ctxt == NULL) || (atts == NULL))
2860 return;
2861
2862 i = 0;
2863 att = atts[i++];
2864 while (att != NULL) {
2865 value = atts[i++];
2866 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2867 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2868 http = 1;
2869 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2870 content = value;
2871 att = atts[i++];
2872 }
2873 if ((http) && (content != NULL))
2874 htmlCheckEncoding(ctxt, content);
2875
2876}
2877
2878/**
2879 * htmlParseStartTag:
2880 * @ctxt: an HTML parser context
2881 *
2882 * parse a start of tag either for rule element or
2883 * EmptyElement. In both case we don't parse the tag closing chars.
2884 *
2885 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2886 *
2887 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2888 *
2889 * With namespace:
2890 *
2891 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2892 *
2893 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2894 *
2895 */
2896
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002897static void
Owen Taylor3473f882001-02-23 17:55:21 +00002898htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2899 xmlChar *name;
2900 xmlChar *attname;
2901 xmlChar *attvalue;
2902 const xmlChar **atts = NULL;
2903 int nbatts = 0;
2904 int maxatts = 0;
2905 int meta = 0;
2906 int i;
2907
2908 if (CUR != '<') return;
2909 NEXT;
2910
2911 GROW;
2912 name = htmlParseHTMLName(ctxt);
2913 if (name == NULL) {
2914 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2915 ctxt->sax->error(ctxt->userData,
2916 "htmlParseStartTag: invalid element name\n");
2917 ctxt->wellFormed = 0;
2918 /* Dump the bogus tag like browsers do */
2919 while ((IS_CHAR(CUR)) && (CUR != '>'))
2920 NEXT;
2921 return;
2922 }
2923 if (xmlStrEqual(name, BAD_CAST"meta"))
2924 meta = 1;
2925
2926 /*
2927 * Check for auto-closure of HTML elements.
2928 */
2929 htmlAutoClose(ctxt, name);
2930
2931 /*
2932 * Check for implied HTML elements.
2933 */
2934 htmlCheckImplied(ctxt, name);
2935
2936 /*
2937 * Avoid html at any level > 0, head at any level != 1
2938 * or any attempt to recurse body
2939 */
2940 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2941 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2942 ctxt->sax->error(ctxt->userData,
2943 "htmlParseStartTag: misplaced <html> tag\n");
2944 ctxt->wellFormed = 0;
2945 xmlFree(name);
2946 return;
2947 }
2948 if ((ctxt->nameNr != 1) &&
2949 (xmlStrEqual(name, BAD_CAST"head"))) {
2950 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2951 ctxt->sax->error(ctxt->userData,
2952 "htmlParseStartTag: misplaced <head> tag\n");
2953 ctxt->wellFormed = 0;
2954 xmlFree(name);
2955 return;
2956 }
2957 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002958 int indx;
2959 for (indx = 0;indx < ctxt->nameNr;indx++) {
2960 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor3473f882001-02-23 17:55:21 +00002961 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2962 ctxt->sax->error(ctxt->userData,
2963 "htmlParseStartTag: misplaced <body> tag\n");
2964 ctxt->wellFormed = 0;
2965 xmlFree(name);
2966 return;
2967 }
2968 }
2969 }
2970
2971 /*
2972 * Now parse the attributes, it ends up with the ending
2973 *
2974 * (S Attribute)* S?
2975 */
2976 SKIP_BLANKS;
2977 while ((IS_CHAR(CUR)) &&
2978 (CUR != '>') &&
2979 ((CUR != '/') || (NXT(1) != '>'))) {
2980 long cons = ctxt->nbChars;
2981
2982 GROW;
2983 attname = htmlParseAttribute(ctxt, &attvalue);
2984 if (attname != NULL) {
2985
2986 /*
2987 * Well formedness requires at most one declaration of an attribute
2988 */
2989 for (i = 0; i < nbatts;i += 2) {
2990 if (xmlStrEqual(atts[i], attname)) {
2991 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2992 ctxt->sax->error(ctxt->userData,
2993 "Attribute %s redefined\n",
2994 attname);
2995 ctxt->wellFormed = 0;
2996 xmlFree(attname);
2997 if (attvalue != NULL)
2998 xmlFree(attvalue);
2999 goto failed;
3000 }
3001 }
3002
3003 /*
3004 * Add the pair to atts
3005 */
3006 if (atts == NULL) {
3007 maxatts = 10;
3008 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3009 if (atts == NULL) {
3010 xmlGenericError(xmlGenericErrorContext,
3011 "malloc of %ld byte failed\n",
3012 maxatts * (long)sizeof(xmlChar *));
3013 if (name != NULL) xmlFree(name);
3014 return;
3015 }
3016 } else if (nbatts + 4 > maxatts) {
3017 maxatts *= 2;
3018 atts = (const xmlChar **) xmlRealloc((void *) atts,
3019 maxatts * sizeof(xmlChar *));
3020 if (atts == NULL) {
3021 xmlGenericError(xmlGenericErrorContext,
3022 "realloc of %ld byte failed\n",
3023 maxatts * (long)sizeof(xmlChar *));
3024 if (name != NULL) xmlFree(name);
3025 return;
3026 }
3027 }
3028 atts[nbatts++] = attname;
3029 atts[nbatts++] = attvalue;
3030 atts[nbatts] = NULL;
3031 atts[nbatts + 1] = NULL;
3032 }
3033 else {
3034 /* Dump the bogus attribute string up to the next blank or
3035 * the end of the tag. */
3036 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3037 && ((CUR != '/') || (NXT(1) != '>')))
3038 NEXT;
3039 }
3040
3041failed:
3042 SKIP_BLANKS;
3043 if (cons == ctxt->nbChars) {
3044 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3045 ctxt->sax->error(ctxt->userData,
3046 "htmlParseStartTag: problem parsing attributes\n");
3047 ctxt->wellFormed = 0;
3048 break;
3049 }
3050 }
3051
3052 /*
3053 * Handle specific association to the META tag
3054 */
3055 if (meta)
3056 htmlCheckMeta(ctxt, atts);
3057
3058 /*
3059 * SAX: Start of Element !
3060 */
3061 htmlnamePush(ctxt, xmlStrdup(name));
3062#ifdef DEBUG
3063 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3064#endif
3065 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3066 ctxt->sax->startElement(ctxt->userData, name, atts);
3067
3068 if (atts != NULL) {
3069 for (i = 0;i < nbatts;i++) {
3070 if (atts[i] != NULL)
3071 xmlFree((xmlChar *) atts[i]);
3072 }
3073 xmlFree((void *) atts);
3074 }
3075 if (name != NULL) xmlFree(name);
3076}
3077
3078/**
3079 * htmlParseEndTag:
3080 * @ctxt: an HTML parser context
3081 *
3082 * parse an end of tag
3083 *
3084 * [42] ETag ::= '</' Name S? '>'
3085 *
3086 * With namespace
3087 *
3088 * [NS 9] ETag ::= '</' QName S? '>'
3089 */
3090
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003091static void
Owen Taylor3473f882001-02-23 17:55:21 +00003092htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3093 xmlChar *name;
3094 xmlChar *oldname;
3095 int i;
3096
3097 if ((CUR != '<') || (NXT(1) != '/')) {
3098 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3099 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3100 ctxt->wellFormed = 0;
3101 return;
3102 }
3103 SKIP(2);
3104
3105 name = htmlParseHTMLName(ctxt);
3106 if (name == NULL) return;
3107
3108 /*
3109 * We should definitely be at the ending "S? '>'" part
3110 */
3111 SKIP_BLANKS;
3112 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3113 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3114 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3115 ctxt->wellFormed = 0;
3116 } else
3117 NEXT;
3118
3119 /*
3120 * If the name read is not one of the element in the parsing stack
3121 * then return, it's just an error.
3122 */
3123 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3124 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3125 }
3126 if (i < 0) {
3127 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3128 ctxt->sax->error(ctxt->userData,
3129 "Unexpected end tag : %s\n", name);
3130 xmlFree(name);
3131 ctxt->wellFormed = 0;
3132 return;
3133 }
3134
3135
3136 /*
3137 * Check for auto-closure of HTML elements.
3138 */
3139
3140 htmlAutoCloseOnClose(ctxt, name);
3141
3142 /*
3143 * Well formedness constraints, opening and closing must match.
3144 * With the exception that the autoclose may have popped stuff out
3145 * of the stack.
3146 */
3147 if (!xmlStrEqual(name, ctxt->name)) {
3148#ifdef DEBUG
3149 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3150#endif
3151 if ((ctxt->name != NULL) &&
3152 (!xmlStrEqual(ctxt->name, name))) {
3153 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3154 ctxt->sax->error(ctxt->userData,
3155 "Opening and ending tag mismatch: %s and %s\n",
3156 name, ctxt->name);
3157 ctxt->wellFormed = 0;
3158 }
3159 }
3160
3161 /*
3162 * SAX: End of Tag
3163 */
3164 oldname = ctxt->name;
3165 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3166 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3167 ctxt->sax->endElement(ctxt->userData, name);
3168 oldname = htmlnamePop(ctxt);
3169 if (oldname != NULL) {
3170#ifdef DEBUG
3171 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3172#endif
3173 xmlFree(oldname);
3174#ifdef DEBUG
3175 } else {
3176 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3177#endif
3178 }
3179 }
3180
3181 if (name != NULL)
3182 xmlFree(name);
3183
3184 return;
3185}
3186
3187
3188/**
3189 * htmlParseReference:
3190 * @ctxt: an HTML parser context
3191 *
3192 * parse and handle entity references in content,
3193 * this will end-up in a call to character() since this is either a
3194 * CharRef, or a predefined entity.
3195 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003196static void
Owen Taylor3473f882001-02-23 17:55:21 +00003197htmlParseReference(htmlParserCtxtPtr ctxt) {
3198 htmlEntityDescPtr ent;
3199 xmlChar out[6];
3200 xmlChar *name;
3201 if (CUR != '&') return;
3202
3203 if (NXT(1) == '#') {
3204 unsigned int c;
3205 int bits, i = 0;
3206
3207 c = htmlParseCharRef(ctxt);
3208 if (c == 0)
3209 return;
3210
3211 if (c < 0x80) { out[i++]= c; bits= -6; }
3212 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3213 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3214 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3215
3216 for ( ; bits >= 0; bits-= 6) {
3217 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3218 }
3219 out[i] = 0;
3220
3221 htmlCheckParagraph(ctxt);
3222 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3223 ctxt->sax->characters(ctxt->userData, out, i);
3224 } else {
3225 ent = htmlParseEntityRef(ctxt, &name);
3226 if (name == NULL) {
3227 htmlCheckParagraph(ctxt);
3228 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3229 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3230 return;
3231 }
3232 if ((ent == NULL) || (ent->value <= 0)) {
3233 htmlCheckParagraph(ctxt);
3234 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3235 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3236 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3237 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3238 }
3239 } else {
3240 unsigned int c;
3241 int bits, i = 0;
3242
3243 c = ent->value;
3244 if (c < 0x80)
3245 { out[i++]= c; bits= -6; }
3246 else if (c < 0x800)
3247 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3248 else if (c < 0x10000)
3249 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3250 else
3251 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3252
3253 for ( ; bits >= 0; bits-= 6) {
3254 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3255 }
3256 out[i] = 0;
3257
3258 htmlCheckParagraph(ctxt);
3259 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3260 ctxt->sax->characters(ctxt->userData, out, i);
3261 }
3262 xmlFree(name);
3263 }
3264}
3265
3266/**
3267 * htmlParseContent:
3268 * @ctxt: an HTML parser context
3269 * @name: the node name
3270 *
3271 * Parse a content: comment, sub-element, reference or text.
3272 *
3273 */
3274
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003275static void
Owen Taylor3473f882001-02-23 17:55:21 +00003276htmlParseContent(htmlParserCtxtPtr ctxt) {
3277 xmlChar *currentNode;
3278 int depth;
3279
3280 currentNode = xmlStrdup(ctxt->name);
3281 depth = ctxt->nameNr;
3282 while (1) {
3283 long cons = ctxt->nbChars;
3284
3285 GROW;
3286 /*
3287 * Our tag or one of it's parent or children is ending.
3288 */
3289 if ((CUR == '<') && (NXT(1) == '/')) {
3290 htmlParseEndTag(ctxt);
3291 if (currentNode != NULL) xmlFree(currentNode);
3292 return;
3293 }
3294
3295 /*
3296 * Has this node been popped out during parsing of
3297 * the next element
3298 */
3299 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
3300 (depth >= ctxt->nameNr)) {
3301 if (currentNode != NULL) xmlFree(currentNode);
3302 return;
3303 }
3304
Daniel Veillardf9533d12001-03-03 10:04:57 +00003305 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3306 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003307 /*
3308 * Handle SCRIPT/STYLE separately
3309 */
3310 htmlParseScript(ctxt);
3311 } else {
3312 /*
3313 * Sometimes DOCTYPE arrives in the middle of the document
3314 */
3315 if ((CUR == '<') && (NXT(1) == '!') &&
3316 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3317 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3318 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3319 (UPP(8) == 'E')) {
3320 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3321 ctxt->sax->error(ctxt->userData,
3322 "Misplaced DOCTYPE declaration\n");
3323 ctxt->wellFormed = 0;
3324 htmlParseDocTypeDecl(ctxt);
3325 }
3326
3327 /*
3328 * First case : a comment
3329 */
3330 if ((CUR == '<') && (NXT(1) == '!') &&
3331 (NXT(2) == '-') && (NXT(3) == '-')) {
3332 htmlParseComment(ctxt);
3333 }
3334
3335 /*
3336 * Second case : a sub-element.
3337 */
3338 else if (CUR == '<') {
3339 htmlParseElement(ctxt);
3340 }
3341
3342 /*
3343 * Third case : a reference. If if has not been resolved,
3344 * parsing returns it's Name, create the node
3345 */
3346 else if (CUR == '&') {
3347 htmlParseReference(ctxt);
3348 }
3349
3350 /*
3351 * Fourth : end of the resource
3352 */
3353 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003354 htmlAutoCloseOnEnd(ctxt);
3355 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003356 }
3357
3358 /*
3359 * Last case, text. Note that References are handled directly.
3360 */
3361 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003362 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003363 }
3364
3365 if (cons == ctxt->nbChars) {
3366 if (ctxt->node != NULL) {
3367 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3368 ctxt->sax->error(ctxt->userData,
3369 "detected an error in element content\n");
3370 ctxt->wellFormed = 0;
3371 }
3372 break;
3373 }
3374 }
3375 GROW;
3376 }
3377 if (currentNode != NULL) xmlFree(currentNode);
3378}
3379
3380/**
3381 * htmlParseElement:
3382 * @ctxt: an HTML parser context
3383 *
3384 * parse an HTML element, this is highly recursive
3385 *
3386 * [39] element ::= EmptyElemTag | STag content ETag
3387 *
3388 * [41] Attribute ::= Name Eq AttValue
3389 */
3390
3391void
3392htmlParseElement(htmlParserCtxtPtr ctxt) {
3393 xmlChar *name;
3394 xmlChar *currentNode = NULL;
3395 htmlElemDescPtr info;
3396 htmlParserNodeInfo node_info;
3397 xmlChar *oldname;
3398 int depth = ctxt->nameNr;
3399
3400 /* Capture start position */
3401 if (ctxt->record_info) {
3402 node_info.begin_pos = ctxt->input->consumed +
3403 (CUR_PTR - ctxt->input->base);
3404 node_info.begin_line = ctxt->input->line;
3405 }
3406
3407 oldname = xmlStrdup(ctxt->name);
3408 htmlParseStartTag(ctxt);
3409 name = ctxt->name;
3410#ifdef DEBUG
3411 if (oldname == NULL)
3412 xmlGenericError(xmlGenericErrorContext,
3413 "Start of element %s\n", name);
3414 else if (name == NULL)
3415 xmlGenericError(xmlGenericErrorContext,
3416 "Start of element failed, was %s\n", oldname);
3417 else
3418 xmlGenericError(xmlGenericErrorContext,
3419 "Start of element %s, was %s\n", name, oldname);
3420#endif
3421 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3422 (name == NULL)) {
3423 if (CUR == '>')
3424 NEXT;
3425 if (oldname != NULL)
3426 xmlFree(oldname);
3427 return;
3428 }
3429 if (oldname != NULL)
3430 xmlFree(oldname);
3431
3432 /*
3433 * Lookup the info for that element.
3434 */
3435 info = htmlTagLookup(name);
3436 if (info == NULL) {
3437 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3438 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3439 name);
3440 ctxt->wellFormed = 0;
3441 } else if (info->depr) {
3442/***************************
3443 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3444 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3445 name);
3446 ***************************/
3447 }
3448
3449 /*
3450 * Check for an Empty Element labelled the XML/SGML way
3451 */
3452 if ((CUR == '/') && (NXT(1) == '>')) {
3453 SKIP(2);
3454 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3455 ctxt->sax->endElement(ctxt->userData, name);
3456 oldname = htmlnamePop(ctxt);
3457#ifdef DEBUG
3458 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3459#endif
3460 if (oldname != NULL)
3461 xmlFree(oldname);
3462 return;
3463 }
3464
3465 if (CUR == '>') {
3466 NEXT;
3467 } else {
3468 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3469 ctxt->sax->error(ctxt->userData,
3470 "Couldn't find end of Start Tag %s\n",
3471 name);
3472 ctxt->wellFormed = 0;
3473
3474 /*
3475 * end of parsing of this node.
3476 */
3477 if (xmlStrEqual(name, ctxt->name)) {
3478 nodePop(ctxt);
3479 oldname = htmlnamePop(ctxt);
3480#ifdef DEBUG
3481 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3482#endif
3483 if (oldname != NULL)
3484 xmlFree(oldname);
3485 }
3486
3487 /*
3488 * Capture end position and add node
3489 */
3490 if ( currentNode != NULL && ctxt->record_info ) {
3491 node_info.end_pos = ctxt->input->consumed +
3492 (CUR_PTR - ctxt->input->base);
3493 node_info.end_line = ctxt->input->line;
3494 node_info.node = ctxt->node;
3495 xmlParserAddNodeInfo(ctxt, &node_info);
3496 }
3497 return;
3498 }
3499
3500 /*
3501 * Check for an Empty Element from DTD definition
3502 */
3503 if ((info != NULL) && (info->empty)) {
3504 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3505 ctxt->sax->endElement(ctxt->userData, name);
3506 oldname = htmlnamePop(ctxt);
3507#ifdef DEBUG
3508 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3509#endif
3510 if (oldname != NULL)
3511 xmlFree(oldname);
3512 return;
3513 }
3514
3515 /*
3516 * Parse the content of the element:
3517 */
3518 currentNode = xmlStrdup(ctxt->name);
3519 depth = ctxt->nameNr;
3520 while (IS_CHAR(CUR)) {
3521 htmlParseContent(ctxt);
3522 if (ctxt->nameNr < depth) break;
3523 }
3524
Owen Taylor3473f882001-02-23 17:55:21 +00003525 /*
3526 * Capture end position and add node
3527 */
3528 if ( currentNode != NULL && ctxt->record_info ) {
3529 node_info.end_pos = ctxt->input->consumed +
3530 (CUR_PTR - ctxt->input->base);
3531 node_info.end_line = ctxt->input->line;
3532 node_info.node = ctxt->node;
3533 xmlParserAddNodeInfo(ctxt, &node_info);
3534 }
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003535 if (!IS_CHAR(CUR)) {
3536 htmlAutoCloseOnEnd(ctxt);
3537 }
3538
Owen Taylor3473f882001-02-23 17:55:21 +00003539 if (currentNode != NULL)
3540 xmlFree(currentNode);
3541}
3542
3543/**
3544 * htmlParseDocument :
3545 * @ctxt: an HTML parser context
3546 *
3547 * parse an HTML document (and build a tree if using the standard SAX
3548 * interface).
3549 *
3550 * Returns 0, -1 in case of error. the parser context is augmented
3551 * as a result of the parsing.
3552 */
3553
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003554static int
Owen Taylor3473f882001-02-23 17:55:21 +00003555htmlParseDocument(htmlParserCtxtPtr ctxt) {
3556 xmlDtdPtr dtd;
3557
3558 htmlDefaultSAXHandlerInit();
3559 ctxt->html = 1;
3560
3561 GROW;
3562 /*
3563 * SAX: beginning of the document processing.
3564 */
3565 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3566 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3567
3568 /*
3569 * Wipe out everything which is before the first '<'
3570 */
3571 SKIP_BLANKS;
3572 if (CUR == 0) {
3573 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3574 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3575 ctxt->wellFormed = 0;
3576 }
3577
3578 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3579 ctxt->sax->startDocument(ctxt->userData);
3580
3581
3582 /*
3583 * Parse possible comments before any content
3584 */
3585 while ((CUR == '<') && (NXT(1) == '!') &&
3586 (NXT(2) == '-') && (NXT(3) == '-')) {
3587 htmlParseComment(ctxt);
3588 SKIP_BLANKS;
3589 }
3590
3591
3592 /*
3593 * Then possibly doc type declaration(s) and more Misc
3594 * (doctypedecl Misc*)?
3595 */
3596 if ((CUR == '<') && (NXT(1) == '!') &&
3597 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3598 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3599 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3600 (UPP(8) == 'E')) {
3601 htmlParseDocTypeDecl(ctxt);
3602 }
3603 SKIP_BLANKS;
3604
3605 /*
3606 * Parse possible comments before any content
3607 */
3608 while ((CUR == '<') && (NXT(1) == '!') &&
3609 (NXT(2) == '-') && (NXT(3) == '-')) {
3610 htmlParseComment(ctxt);
3611 SKIP_BLANKS;
3612 }
3613
3614 /*
3615 * Time to start parsing the tree itself
3616 */
3617 htmlParseContent(ctxt);
3618
3619 /*
3620 * autoclose
3621 */
3622 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003623 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003624
3625
3626 /*
3627 * SAX: end of the document processing.
3628 */
3629 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3630 ctxt->sax->endDocument(ctxt->userData);
3631
3632 if (ctxt->myDoc != NULL) {
3633 dtd = xmlGetIntSubset(ctxt->myDoc);
3634 if (dtd == NULL)
3635 ctxt->myDoc->intSubset =
3636 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3637 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3638 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3639 }
3640 if (! ctxt->wellFormed) return(-1);
3641 return(0);
3642}
3643
3644
3645/************************************************************************
3646 * *
3647 * Parser contexts handling *
3648 * *
3649 ************************************************************************/
3650
3651/**
3652 * xmlInitParserCtxt:
3653 * @ctxt: an HTML parser context
3654 *
3655 * Initialize a parser context
3656 */
3657
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003658static void
Owen Taylor3473f882001-02-23 17:55:21 +00003659htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3660{
3661 htmlSAXHandler *sax;
3662
3663 if (ctxt == NULL) return;
3664 memset(ctxt, 0, sizeof(htmlParserCtxt));
3665
3666 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3667 if (sax == NULL) {
3668 xmlGenericError(xmlGenericErrorContext,
3669 "htmlInitParserCtxt: out of memory\n");
3670 }
3671 else
3672 memset(sax, 0, sizeof(htmlSAXHandler));
3673
3674 /* Allocate the Input stack */
3675 ctxt->inputTab = (htmlParserInputPtr *)
3676 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3677 if (ctxt->inputTab == NULL) {
3678 xmlGenericError(xmlGenericErrorContext,
3679 "htmlInitParserCtxt: out of memory\n");
3680 ctxt->inputNr = 0;
3681 ctxt->inputMax = 0;
3682 ctxt->input = NULL;
3683 return;
3684 }
3685 ctxt->inputNr = 0;
3686 ctxt->inputMax = 5;
3687 ctxt->input = NULL;
3688 ctxt->version = NULL;
3689 ctxt->encoding = NULL;
3690 ctxt->standalone = -1;
3691 ctxt->instate = XML_PARSER_START;
3692
3693 /* Allocate the Node stack */
3694 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3695 if (ctxt->nodeTab == NULL) {
3696 xmlGenericError(xmlGenericErrorContext,
3697 "htmlInitParserCtxt: out of memory\n");
3698 ctxt->nodeNr = 0;
3699 ctxt->nodeMax = 0;
3700 ctxt->node = NULL;
3701 ctxt->inputNr = 0;
3702 ctxt->inputMax = 0;
3703 ctxt->input = NULL;
3704 return;
3705 }
3706 ctxt->nodeNr = 0;
3707 ctxt->nodeMax = 10;
3708 ctxt->node = NULL;
3709
3710 /* Allocate the Name stack */
3711 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3712 if (ctxt->nameTab == NULL) {
3713 xmlGenericError(xmlGenericErrorContext,
3714 "htmlInitParserCtxt: out of memory\n");
3715 ctxt->nameNr = 0;
3716 ctxt->nameMax = 10;
3717 ctxt->name = NULL;
3718 ctxt->nodeNr = 0;
3719 ctxt->nodeMax = 0;
3720 ctxt->node = NULL;
3721 ctxt->inputNr = 0;
3722 ctxt->inputMax = 0;
3723 ctxt->input = NULL;
3724 return;
3725 }
3726 ctxt->nameNr = 0;
3727 ctxt->nameMax = 10;
3728 ctxt->name = NULL;
3729
3730 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3731 else {
3732 ctxt->sax = sax;
3733 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3734 }
3735 ctxt->userData = ctxt;
3736 ctxt->myDoc = NULL;
3737 ctxt->wellFormed = 1;
3738 ctxt->replaceEntities = 0;
3739 ctxt->html = 1;
3740 ctxt->record_info = 0;
3741 ctxt->validate = 0;
3742 ctxt->nbChars = 0;
3743 ctxt->checkIndex = 0;
3744 xmlInitNodeInfoSeq(&ctxt->node_seq);
3745}
3746
3747/**
3748 * htmlFreeParserCtxt:
3749 * @ctxt: an HTML parser context
3750 *
3751 * Free all the memory used by a parser context. However the parsed
3752 * document in ctxt->myDoc is not freed.
3753 */
3754
3755void
3756htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3757{
3758 xmlFreeParserCtxt(ctxt);
3759}
3760
3761/**
3762 * htmlCreateDocParserCtxt :
3763 * @cur: a pointer to an array of xmlChar
3764 * @encoding: a free form C string describing the HTML document encoding, or NULL
3765 *
3766 * Create a parser context for an HTML document.
3767 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003768 * TODO: check the need to add encoding handling there
3769 *
Owen Taylor3473f882001-02-23 17:55:21 +00003770 * Returns the new parser context or NULL
3771 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003772static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00003773htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +00003774 htmlParserCtxtPtr ctxt;
3775 htmlParserInputPtr input;
3776 /* htmlCharEncoding enc; */
3777
3778 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3779 if (ctxt == NULL) {
3780 perror("malloc");
3781 return(NULL);
3782 }
3783 htmlInitParserCtxt(ctxt);
3784 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3785 if (input == NULL) {
3786 perror("malloc");
3787 xmlFree(ctxt);
3788 return(NULL);
3789 }
3790 memset(input, 0, sizeof(htmlParserInput));
3791
3792 input->line = 1;
3793 input->col = 1;
3794 input->base = cur;
3795 input->cur = cur;
3796
3797 inputPush(ctxt, input);
3798 return(ctxt);
3799}
3800
3801/************************************************************************
3802 * *
3803 * Progressive parsing interfaces *
3804 * *
3805 ************************************************************************/
3806
3807/**
3808 * htmlParseLookupSequence:
3809 * @ctxt: an HTML parser context
3810 * @first: the first char to lookup
3811 * @next: the next char to lookup or zero
3812 * @third: the next char to lookup or zero
3813 *
3814 * Try to find if a sequence (first, next, third) or just (first next) or
3815 * (first) is available in the input stream.
3816 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3817 * to avoid rescanning sequences of bytes, it DOES change the state of the
3818 * parser, do not use liberally.
3819 * This is basically similar to xmlParseLookupSequence()
3820 *
3821 * Returns the index to the current parsing point if the full sequence
3822 * is available, -1 otherwise.
3823 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003824static int
Owen Taylor3473f882001-02-23 17:55:21 +00003825htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3826 xmlChar next, xmlChar third) {
3827 int base, len;
3828 htmlParserInputPtr in;
3829 const xmlChar *buf;
3830
3831 in = ctxt->input;
3832 if (in == NULL) return(-1);
3833 base = in->cur - in->base;
3834 if (base < 0) return(-1);
3835 if (ctxt->checkIndex > base)
3836 base = ctxt->checkIndex;
3837 if (in->buf == NULL) {
3838 buf = in->base;
3839 len = in->length;
3840 } else {
3841 buf = in->buf->buffer->content;
3842 len = in->buf->buffer->use;
3843 }
3844 /* take into account the sequence length */
3845 if (third) len -= 2;
3846 else if (next) len --;
3847 for (;base < len;base++) {
3848 if (buf[base] == first) {
3849 if (third != 0) {
3850 if ((buf[base + 1] != next) ||
3851 (buf[base + 2] != third)) continue;
3852 } else if (next != 0) {
3853 if (buf[base + 1] != next) continue;
3854 }
3855 ctxt->checkIndex = 0;
3856#ifdef DEBUG_PUSH
3857 if (next == 0)
3858 xmlGenericError(xmlGenericErrorContext,
3859 "HPP: lookup '%c' found at %d\n",
3860 first, base);
3861 else if (third == 0)
3862 xmlGenericError(xmlGenericErrorContext,
3863 "HPP: lookup '%c%c' found at %d\n",
3864 first, next, base);
3865 else
3866 xmlGenericError(xmlGenericErrorContext,
3867 "HPP: lookup '%c%c%c' found at %d\n",
3868 first, next, third, base);
3869#endif
3870 return(base - (in->cur - in->base));
3871 }
3872 }
3873 ctxt->checkIndex = base;
3874#ifdef DEBUG_PUSH
3875 if (next == 0)
3876 xmlGenericError(xmlGenericErrorContext,
3877 "HPP: lookup '%c' failed\n", first);
3878 else if (third == 0)
3879 xmlGenericError(xmlGenericErrorContext,
3880 "HPP: lookup '%c%c' failed\n", first, next);
3881 else
3882 xmlGenericError(xmlGenericErrorContext,
3883 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3884#endif
3885 return(-1);
3886}
3887
3888/**
3889 * htmlParseTryOrFinish:
3890 * @ctxt: an HTML parser context
3891 * @terminate: last chunk indicator
3892 *
3893 * Try to progress on parsing
3894 *
3895 * Returns zero if no parsing was possible
3896 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003897static int
Owen Taylor3473f882001-02-23 17:55:21 +00003898htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3899 int ret = 0;
3900 htmlParserInputPtr in;
3901 int avail = 0;
3902 xmlChar cur, next;
3903
3904#ifdef DEBUG_PUSH
3905 switch (ctxt->instate) {
3906 case XML_PARSER_EOF:
3907 xmlGenericError(xmlGenericErrorContext,
3908 "HPP: try EOF\n"); break;
3909 case XML_PARSER_START:
3910 xmlGenericError(xmlGenericErrorContext,
3911 "HPP: try START\n"); break;
3912 case XML_PARSER_MISC:
3913 xmlGenericError(xmlGenericErrorContext,
3914 "HPP: try MISC\n");break;
3915 case XML_PARSER_COMMENT:
3916 xmlGenericError(xmlGenericErrorContext,
3917 "HPP: try COMMENT\n");break;
3918 case XML_PARSER_PROLOG:
3919 xmlGenericError(xmlGenericErrorContext,
3920 "HPP: try PROLOG\n");break;
3921 case XML_PARSER_START_TAG:
3922 xmlGenericError(xmlGenericErrorContext,
3923 "HPP: try START_TAG\n");break;
3924 case XML_PARSER_CONTENT:
3925 xmlGenericError(xmlGenericErrorContext,
3926 "HPP: try CONTENT\n");break;
3927 case XML_PARSER_CDATA_SECTION:
3928 xmlGenericError(xmlGenericErrorContext,
3929 "HPP: try CDATA_SECTION\n");break;
3930 case XML_PARSER_END_TAG:
3931 xmlGenericError(xmlGenericErrorContext,
3932 "HPP: try END_TAG\n");break;
3933 case XML_PARSER_ENTITY_DECL:
3934 xmlGenericError(xmlGenericErrorContext,
3935 "HPP: try ENTITY_DECL\n");break;
3936 case XML_PARSER_ENTITY_VALUE:
3937 xmlGenericError(xmlGenericErrorContext,
3938 "HPP: try ENTITY_VALUE\n");break;
3939 case XML_PARSER_ATTRIBUTE_VALUE:
3940 xmlGenericError(xmlGenericErrorContext,
3941 "HPP: try ATTRIBUTE_VALUE\n");break;
3942 case XML_PARSER_DTD:
3943 xmlGenericError(xmlGenericErrorContext,
3944 "HPP: try DTD\n");break;
3945 case XML_PARSER_EPILOG:
3946 xmlGenericError(xmlGenericErrorContext,
3947 "HPP: try EPILOG\n");break;
3948 case XML_PARSER_PI:
3949 xmlGenericError(xmlGenericErrorContext,
3950 "HPP: try PI\n");break;
3951 case XML_PARSER_SYSTEM_LITERAL:
3952 xmlGenericError(xmlGenericErrorContext,
3953 "HPP: try SYSTEM_LITERAL\n");break;
3954 }
3955#endif
3956
3957 while (1) {
3958
3959 in = ctxt->input;
3960 if (in == NULL) break;
3961 if (in->buf == NULL)
3962 avail = in->length - (in->cur - in->base);
3963 else
3964 avail = in->buf->buffer->use - (in->cur - in->base);
3965 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003966 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003967 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3968 /*
3969 * SAX: end of the document processing.
3970 */
3971 ctxt->instate = XML_PARSER_EOF;
3972 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3973 ctxt->sax->endDocument(ctxt->userData);
3974 }
3975 }
3976 if (avail < 1)
3977 goto done;
3978 switch (ctxt->instate) {
3979 case XML_PARSER_EOF:
3980 /*
3981 * Document parsing is done !
3982 */
3983 goto done;
3984 case XML_PARSER_START:
3985 /*
3986 * Very first chars read from the document flow.
3987 */
3988 cur = in->cur[0];
3989 if (IS_BLANK(cur)) {
3990 SKIP_BLANKS;
3991 if (in->buf == NULL)
3992 avail = in->length - (in->cur - in->base);
3993 else
3994 avail = in->buf->buffer->use - (in->cur - in->base);
3995 }
3996 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3997 ctxt->sax->setDocumentLocator(ctxt->userData,
3998 &xmlDefaultSAXLocator);
3999 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4000 (!ctxt->disableSAX))
4001 ctxt->sax->startDocument(ctxt->userData);
4002
4003 cur = in->cur[0];
4004 next = in->cur[1];
4005 if ((cur == '<') && (next == '!') &&
4006 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4007 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4008 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4009 (UPP(8) == 'E')) {
4010 if ((!terminate) &&
4011 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4012 goto done;
4013#ifdef DEBUG_PUSH
4014 xmlGenericError(xmlGenericErrorContext,
4015 "HPP: Parsing internal subset\n");
4016#endif
4017 htmlParseDocTypeDecl(ctxt);
4018 ctxt->instate = XML_PARSER_PROLOG;
4019#ifdef DEBUG_PUSH
4020 xmlGenericError(xmlGenericErrorContext,
4021 "HPP: entering PROLOG\n");
4022#endif
4023 } else {
4024 ctxt->instate = XML_PARSER_MISC;
4025 }
4026#ifdef DEBUG_PUSH
4027 xmlGenericError(xmlGenericErrorContext,
4028 "HPP: entering MISC\n");
4029#endif
4030 break;
4031 case XML_PARSER_MISC:
4032 SKIP_BLANKS;
4033 if (in->buf == NULL)
4034 avail = in->length - (in->cur - in->base);
4035 else
4036 avail = in->buf->buffer->use - (in->cur - in->base);
4037 if (avail < 2)
4038 goto done;
4039 cur = in->cur[0];
4040 next = in->cur[1];
4041 if ((cur == '<') && (next == '!') &&
4042 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4043 if ((!terminate) &&
4044 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4045 goto done;
4046#ifdef DEBUG_PUSH
4047 xmlGenericError(xmlGenericErrorContext,
4048 "HPP: Parsing Comment\n");
4049#endif
4050 htmlParseComment(ctxt);
4051 ctxt->instate = XML_PARSER_MISC;
4052 } else if ((cur == '<') && (next == '!') &&
4053 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4054 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4055 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4056 (UPP(8) == 'E')) {
4057 if ((!terminate) &&
4058 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4059 goto done;
4060#ifdef DEBUG_PUSH
4061 xmlGenericError(xmlGenericErrorContext,
4062 "HPP: Parsing internal subset\n");
4063#endif
4064 htmlParseDocTypeDecl(ctxt);
4065 ctxt->instate = XML_PARSER_PROLOG;
4066#ifdef DEBUG_PUSH
4067 xmlGenericError(xmlGenericErrorContext,
4068 "HPP: entering PROLOG\n");
4069#endif
4070 } else if ((cur == '<') && (next == '!') &&
4071 (avail < 9)) {
4072 goto done;
4073 } else {
4074 ctxt->instate = XML_PARSER_START_TAG;
4075#ifdef DEBUG_PUSH
4076 xmlGenericError(xmlGenericErrorContext,
4077 "HPP: entering START_TAG\n");
4078#endif
4079 }
4080 break;
4081 case XML_PARSER_PROLOG:
4082 SKIP_BLANKS;
4083 if (in->buf == NULL)
4084 avail = in->length - (in->cur - in->base);
4085 else
4086 avail = in->buf->buffer->use - (in->cur - in->base);
4087 if (avail < 2)
4088 goto done;
4089 cur = in->cur[0];
4090 next = in->cur[1];
4091 if ((cur == '<') && (next == '!') &&
4092 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4093 if ((!terminate) &&
4094 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4095 goto done;
4096#ifdef DEBUG_PUSH
4097 xmlGenericError(xmlGenericErrorContext,
4098 "HPP: Parsing Comment\n");
4099#endif
4100 htmlParseComment(ctxt);
4101 ctxt->instate = XML_PARSER_PROLOG;
4102 } else if ((cur == '<') && (next == '!') &&
4103 (avail < 4)) {
4104 goto done;
4105 } else {
4106 ctxt->instate = XML_PARSER_START_TAG;
4107#ifdef DEBUG_PUSH
4108 xmlGenericError(xmlGenericErrorContext,
4109 "HPP: entering START_TAG\n");
4110#endif
4111 }
4112 break;
4113 case XML_PARSER_EPILOG:
4114 if (in->buf == NULL)
4115 avail = in->length - (in->cur - in->base);
4116 else
4117 avail = in->buf->buffer->use - (in->cur - in->base);
4118 if (avail < 1)
4119 goto done;
4120 cur = in->cur[0];
4121 if (IS_BLANK(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004122 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004123 goto done;
4124 }
4125 if (avail < 2)
4126 goto done;
4127 next = in->cur[1];
4128 if ((cur == '<') && (next == '!') &&
4129 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4130 if ((!terminate) &&
4131 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4132 goto done;
4133#ifdef DEBUG_PUSH
4134 xmlGenericError(xmlGenericErrorContext,
4135 "HPP: Parsing Comment\n");
4136#endif
4137 htmlParseComment(ctxt);
4138 ctxt->instate = XML_PARSER_EPILOG;
4139 } else if ((cur == '<') && (next == '!') &&
4140 (avail < 4)) {
4141 goto done;
4142 } else {
4143 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004144 ctxt->wellFormed = 0;
4145 ctxt->instate = XML_PARSER_EOF;
4146#ifdef DEBUG_PUSH
4147 xmlGenericError(xmlGenericErrorContext,
4148 "HPP: entering EOF\n");
4149#endif
4150 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4151 ctxt->sax->endDocument(ctxt->userData);
4152 goto done;
4153 }
4154 break;
4155 case XML_PARSER_START_TAG: {
4156 xmlChar *name, *oldname;
4157 int depth = ctxt->nameNr;
4158 htmlElemDescPtr info;
4159
4160 if (avail < 2)
4161 goto done;
4162 cur = in->cur[0];
4163 if (cur != '<') {
4164 ctxt->instate = XML_PARSER_CONTENT;
4165#ifdef DEBUG_PUSH
4166 xmlGenericError(xmlGenericErrorContext,
4167 "HPP: entering CONTENT\n");
4168#endif
4169 break;
4170 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004171 if (in->cur[1] == '/') {
4172 ctxt->instate = XML_PARSER_END_TAG;
4173 ctxt->checkIndex = 0;
4174#ifdef DEBUG_PUSH
4175 xmlGenericError(xmlGenericErrorContext,
4176 "HPP: entering END_TAG\n");
4177#endif
4178 break;
4179 }
Owen Taylor3473f882001-02-23 17:55:21 +00004180 if ((!terminate) &&
4181 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4182 goto done;
4183
4184 oldname = xmlStrdup(ctxt->name);
4185 htmlParseStartTag(ctxt);
4186 name = ctxt->name;
4187#ifdef DEBUG
4188 if (oldname == NULL)
4189 xmlGenericError(xmlGenericErrorContext,
4190 "Start of element %s\n", name);
4191 else if (name == NULL)
4192 xmlGenericError(xmlGenericErrorContext,
4193 "Start of element failed, was %s\n",
4194 oldname);
4195 else
4196 xmlGenericError(xmlGenericErrorContext,
4197 "Start of element %s, was %s\n",
4198 name, oldname);
4199#endif
4200 if (((depth == ctxt->nameNr) &&
4201 (xmlStrEqual(oldname, ctxt->name))) ||
4202 (name == NULL)) {
4203 if (CUR == '>')
4204 NEXT;
4205 if (oldname != NULL)
4206 xmlFree(oldname);
4207 break;
4208 }
4209 if (oldname != NULL)
4210 xmlFree(oldname);
4211
4212 /*
4213 * Lookup the info for that element.
4214 */
4215 info = htmlTagLookup(name);
4216 if (info == NULL) {
4217 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4218 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4219 name);
4220 ctxt->wellFormed = 0;
4221 } else if (info->depr) {
4222 /***************************
4223 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4224 ctxt->sax->warning(ctxt->userData,
4225 "Tag %s is deprecated\n",
4226 name);
4227 ***************************/
4228 }
4229
4230 /*
4231 * Check for an Empty Element labelled the XML/SGML way
4232 */
4233 if ((CUR == '/') && (NXT(1) == '>')) {
4234 SKIP(2);
4235 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4236 ctxt->sax->endElement(ctxt->userData, name);
4237 oldname = htmlnamePop(ctxt);
4238#ifdef DEBUG
4239 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4240 oldname);
4241#endif
4242 if (oldname != NULL)
4243 xmlFree(oldname);
4244 ctxt->instate = XML_PARSER_CONTENT;
4245#ifdef DEBUG_PUSH
4246 xmlGenericError(xmlGenericErrorContext,
4247 "HPP: entering CONTENT\n");
4248#endif
4249 break;
4250 }
4251
4252 if (CUR == '>') {
4253 NEXT;
4254 } else {
4255 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4256 ctxt->sax->error(ctxt->userData,
4257 "Couldn't find end of Start Tag %s\n",
4258 name);
4259 ctxt->wellFormed = 0;
4260
4261 /*
4262 * end of parsing of this node.
4263 */
4264 if (xmlStrEqual(name, ctxt->name)) {
4265 nodePop(ctxt);
4266 oldname = htmlnamePop(ctxt);
4267#ifdef DEBUG
4268 xmlGenericError(xmlGenericErrorContext,
4269 "End of start tag problem: popping out %s\n", oldname);
4270#endif
4271 if (oldname != NULL)
4272 xmlFree(oldname);
4273 }
4274
4275 ctxt->instate = XML_PARSER_CONTENT;
4276#ifdef DEBUG_PUSH
4277 xmlGenericError(xmlGenericErrorContext,
4278 "HPP: entering CONTENT\n");
4279#endif
4280 break;
4281 }
4282
4283 /*
4284 * Check for an Empty Element from DTD definition
4285 */
4286 if ((info != NULL) && (info->empty)) {
4287 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4288 ctxt->sax->endElement(ctxt->userData, name);
4289 oldname = htmlnamePop(ctxt);
4290#ifdef DEBUG
4291 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4292#endif
4293 if (oldname != NULL)
4294 xmlFree(oldname);
4295 }
4296 ctxt->instate = XML_PARSER_CONTENT;
4297#ifdef DEBUG_PUSH
4298 xmlGenericError(xmlGenericErrorContext,
4299 "HPP: entering CONTENT\n");
4300#endif
4301 break;
4302 }
4303 case XML_PARSER_CONTENT: {
4304 long cons;
4305 /*
4306 * Handle preparsed entities and charRef
4307 */
4308 if (ctxt->token != 0) {
4309 xmlChar chr[2] = { 0 , 0 } ;
4310
4311 chr[0] = (xmlChar) ctxt->token;
4312 htmlCheckParagraph(ctxt);
4313 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4314 ctxt->sax->characters(ctxt->userData, chr, 1);
4315 ctxt->token = 0;
4316 ctxt->checkIndex = 0;
4317 }
4318 if ((avail == 1) && (terminate)) {
4319 cur = in->cur[0];
4320 if ((cur != '<') && (cur != '&')) {
4321 if (ctxt->sax != NULL) {
4322 if (IS_BLANK(cur)) {
4323 if (ctxt->sax->ignorableWhitespace != NULL)
4324 ctxt->sax->ignorableWhitespace(
4325 ctxt->userData, &cur, 1);
4326 } else {
4327 htmlCheckParagraph(ctxt);
4328 if (ctxt->sax->characters != NULL)
4329 ctxt->sax->characters(
4330 ctxt->userData, &cur, 1);
4331 }
4332 }
4333 ctxt->token = 0;
4334 ctxt->checkIndex = 0;
4335 NEXT;
4336 }
4337 break;
4338 }
4339 if (avail < 2)
4340 goto done;
4341 cur = in->cur[0];
4342 next = in->cur[1];
4343 cons = ctxt->nbChars;
4344 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4345 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4346 /*
4347 * Handle SCRIPT/STYLE separately
4348 */
4349 if ((!terminate) &&
4350 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4351 goto done;
4352 htmlParseScript(ctxt);
4353 if ((cur == '<') && (next == '/')) {
4354 ctxt->instate = XML_PARSER_END_TAG;
4355 ctxt->checkIndex = 0;
4356#ifdef DEBUG_PUSH
4357 xmlGenericError(xmlGenericErrorContext,
4358 "HPP: entering END_TAG\n");
4359#endif
4360 break;
4361 }
4362 } else {
4363 /*
4364 * Sometimes DOCTYPE arrives in the middle of the document
4365 */
4366 if ((cur == '<') && (next == '!') &&
4367 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4368 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4369 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4370 (UPP(8) == 'E')) {
4371 if ((!terminate) &&
4372 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4373 goto done;
4374 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4375 ctxt->sax->error(ctxt->userData,
4376 "Misplaced DOCTYPE declaration\n");
4377 ctxt->wellFormed = 0;
4378 htmlParseDocTypeDecl(ctxt);
4379 } else if ((cur == '<') && (next == '!') &&
4380 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4381 if ((!terminate) &&
4382 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4383 goto done;
4384#ifdef DEBUG_PUSH
4385 xmlGenericError(xmlGenericErrorContext,
4386 "HPP: Parsing Comment\n");
4387#endif
4388 htmlParseComment(ctxt);
4389 ctxt->instate = XML_PARSER_CONTENT;
4390 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4391 goto done;
4392 } else if ((cur == '<') && (next == '/')) {
4393 ctxt->instate = XML_PARSER_END_TAG;
4394 ctxt->checkIndex = 0;
4395#ifdef DEBUG_PUSH
4396 xmlGenericError(xmlGenericErrorContext,
4397 "HPP: entering END_TAG\n");
4398#endif
4399 break;
4400 } else if (cur == '<') {
4401 ctxt->instate = XML_PARSER_START_TAG;
4402 ctxt->checkIndex = 0;
4403#ifdef DEBUG_PUSH
4404 xmlGenericError(xmlGenericErrorContext,
4405 "HPP: entering START_TAG\n");
4406#endif
4407 break;
4408 } else if (cur == '&') {
4409 if ((!terminate) &&
4410 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4411 goto done;
4412#ifdef DEBUG_PUSH
4413 xmlGenericError(xmlGenericErrorContext,
4414 "HPP: Parsing Reference\n");
4415#endif
4416 /* TODO: check generation of subtrees if noent !!! */
4417 htmlParseReference(ctxt);
4418 } else {
4419 /* TODO Avoid the extra copy, handle directly !!!!!! */
4420 /*
4421 * Goal of the following test is :
4422 * - minimize calls to the SAX 'character' callback
4423 * when they are mergeable
4424 */
4425 if ((ctxt->inputNr == 1) &&
4426 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4427 if ((!terminate) &&
4428 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4429 goto done;
4430 }
4431 ctxt->checkIndex = 0;
4432#ifdef DEBUG_PUSH
4433 xmlGenericError(xmlGenericErrorContext,
4434 "HPP: Parsing char data\n");
4435#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004436 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004437 }
4438 }
4439 if (cons == ctxt->nbChars) {
4440 if (ctxt->node != NULL) {
4441 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4442 ctxt->sax->error(ctxt->userData,
4443 "detected an error in element content\n");
4444 ctxt->wellFormed = 0;
4445 }
4446 NEXT;
4447 break;
4448 }
4449
4450 break;
4451 }
4452 case XML_PARSER_END_TAG:
4453 if (avail < 2)
4454 goto done;
4455 if ((!terminate) &&
4456 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4457 goto done;
4458 htmlParseEndTag(ctxt);
4459 if (ctxt->nameNr == 0) {
4460 ctxt->instate = XML_PARSER_EPILOG;
4461 } else {
4462 ctxt->instate = XML_PARSER_CONTENT;
4463 }
4464 ctxt->checkIndex = 0;
4465#ifdef DEBUG_PUSH
4466 xmlGenericError(xmlGenericErrorContext,
4467 "HPP: entering CONTENT\n");
4468#endif
4469 break;
4470 case XML_PARSER_CDATA_SECTION:
4471 xmlGenericError(xmlGenericErrorContext,
4472 "HPP: internal error, state == CDATA\n");
4473 ctxt->instate = XML_PARSER_CONTENT;
4474 ctxt->checkIndex = 0;
4475#ifdef DEBUG_PUSH
4476 xmlGenericError(xmlGenericErrorContext,
4477 "HPP: entering CONTENT\n");
4478#endif
4479 break;
4480 case XML_PARSER_DTD:
4481 xmlGenericError(xmlGenericErrorContext,
4482 "HPP: internal error, state == DTD\n");
4483 ctxt->instate = XML_PARSER_CONTENT;
4484 ctxt->checkIndex = 0;
4485#ifdef DEBUG_PUSH
4486 xmlGenericError(xmlGenericErrorContext,
4487 "HPP: entering CONTENT\n");
4488#endif
4489 break;
4490 case XML_PARSER_COMMENT:
4491 xmlGenericError(xmlGenericErrorContext,
4492 "HPP: internal error, state == COMMENT\n");
4493 ctxt->instate = XML_PARSER_CONTENT;
4494 ctxt->checkIndex = 0;
4495#ifdef DEBUG_PUSH
4496 xmlGenericError(xmlGenericErrorContext,
4497 "HPP: entering CONTENT\n");
4498#endif
4499 break;
4500 case XML_PARSER_PI:
4501 xmlGenericError(xmlGenericErrorContext,
4502 "HPP: internal error, state == PI\n");
4503 ctxt->instate = XML_PARSER_CONTENT;
4504 ctxt->checkIndex = 0;
4505#ifdef DEBUG_PUSH
4506 xmlGenericError(xmlGenericErrorContext,
4507 "HPP: entering CONTENT\n");
4508#endif
4509 break;
4510 case XML_PARSER_ENTITY_DECL:
4511 xmlGenericError(xmlGenericErrorContext,
4512 "HPP: internal error, state == ENTITY_DECL\n");
4513 ctxt->instate = XML_PARSER_CONTENT;
4514 ctxt->checkIndex = 0;
4515#ifdef DEBUG_PUSH
4516 xmlGenericError(xmlGenericErrorContext,
4517 "HPP: entering CONTENT\n");
4518#endif
4519 break;
4520 case XML_PARSER_ENTITY_VALUE:
4521 xmlGenericError(xmlGenericErrorContext,
4522 "HPP: internal error, state == ENTITY_VALUE\n");
4523 ctxt->instate = XML_PARSER_CONTENT;
4524 ctxt->checkIndex = 0;
4525#ifdef DEBUG_PUSH
4526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: entering DTD\n");
4528#endif
4529 break;
4530 case XML_PARSER_ATTRIBUTE_VALUE:
4531 xmlGenericError(xmlGenericErrorContext,
4532 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4533 ctxt->instate = XML_PARSER_START_TAG;
4534 ctxt->checkIndex = 0;
4535#ifdef DEBUG_PUSH
4536 xmlGenericError(xmlGenericErrorContext,
4537 "HPP: entering START_TAG\n");
4538#endif
4539 break;
4540 case XML_PARSER_SYSTEM_LITERAL:
4541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4543 ctxt->instate = XML_PARSER_CONTENT;
4544 ctxt->checkIndex = 0;
4545#ifdef DEBUG_PUSH
4546 xmlGenericError(xmlGenericErrorContext,
4547 "HPP: entering CONTENT\n");
4548#endif
4549 break;
4550 case XML_PARSER_IGNORE:
4551 xmlGenericError(xmlGenericErrorContext,
4552 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4553 ctxt->instate = XML_PARSER_CONTENT;
4554 ctxt->checkIndex = 0;
4555#ifdef DEBUG_PUSH
4556 xmlGenericError(xmlGenericErrorContext,
4557 "HPP: entering CONTENT\n");
4558#endif
4559 break;
4560 }
4561 }
4562done:
4563 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004564 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004565 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4566 /*
4567 * SAX: end of the document processing.
4568 */
4569 ctxt->instate = XML_PARSER_EOF;
4570 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4571 ctxt->sax->endDocument(ctxt->userData);
4572 }
4573 }
4574 if ((ctxt->myDoc != NULL) &&
4575 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4576 (ctxt->instate == XML_PARSER_EPILOG))) {
4577 xmlDtdPtr dtd;
4578 dtd = xmlGetIntSubset(ctxt->myDoc);
4579 if (dtd == NULL)
4580 ctxt->myDoc->intSubset =
4581 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4582 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4583 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4584 }
4585#ifdef DEBUG_PUSH
4586 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4587#endif
4588 return(ret);
4589}
4590
4591/**
Owen Taylor3473f882001-02-23 17:55:21 +00004592 * htmlParseChunk:
4593 * @ctxt: an XML parser context
4594 * @chunk: an char array
4595 * @size: the size in byte of the chunk
4596 * @terminate: last chunk indicator
4597 *
4598 * Parse a Chunk of memory
4599 *
4600 * Returns zero if no error, the xmlParserErrors otherwise.
4601 */
4602int
4603htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4604 int terminate) {
4605 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4606 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4607 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4608 int cur = ctxt->input->cur - ctxt->input->base;
4609
4610 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4611 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4612 ctxt->input->cur = ctxt->input->base + cur;
4613#ifdef DEBUG_PUSH
4614 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4615#endif
4616
4617 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4618 htmlParseTryOrFinish(ctxt, terminate);
4619 } else if (ctxt->instate != XML_PARSER_EOF) {
4620 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4621 htmlParseTryOrFinish(ctxt, terminate);
4622 }
4623 if (terminate) {
4624 if ((ctxt->instate != XML_PARSER_EOF) &&
4625 (ctxt->instate != XML_PARSER_EPILOG) &&
4626 (ctxt->instate != XML_PARSER_MISC)) {
4627 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004628 ctxt->wellFormed = 0;
4629 }
4630 if (ctxt->instate != XML_PARSER_EOF) {
4631 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4632 ctxt->sax->endDocument(ctxt->userData);
4633 }
4634 ctxt->instate = XML_PARSER_EOF;
4635 }
4636 return((xmlParserErrors) ctxt->errNo);
4637}
4638
4639/************************************************************************
4640 * *
4641 * User entry points *
4642 * *
4643 ************************************************************************/
4644
4645/**
4646 * htmlCreatePushParserCtxt :
4647 * @sax: a SAX handler
4648 * @user_data: The user data returned on SAX callbacks
4649 * @chunk: a pointer to an array of chars
4650 * @size: number of chars in the array
4651 * @filename: an optional file name or URI
4652 * @enc: an optional encoding
4653 *
4654 * Create a parser context for using the HTML parser in push mode
4655 * To allow content encoding detection, @size should be >= 4
4656 * The value of @filename is used for fetching external entities
4657 * and error/warning reports.
4658 *
4659 * Returns the new parser context or NULL
4660 */
4661htmlParserCtxtPtr
4662htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4663 const char *chunk, int size, const char *filename,
4664 xmlCharEncoding enc) {
4665 htmlParserCtxtPtr ctxt;
4666 htmlParserInputPtr inputStream;
4667 xmlParserInputBufferPtr buf;
4668
4669 buf = xmlAllocParserInputBuffer(enc);
4670 if (buf == NULL) return(NULL);
4671
4672 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4673 if (ctxt == NULL) {
4674 xmlFree(buf);
4675 return(NULL);
4676 }
4677 memset(ctxt, 0, sizeof(htmlParserCtxt));
4678 htmlInitParserCtxt(ctxt);
4679 if (sax != NULL) {
4680 if (ctxt->sax != &htmlDefaultSAXHandler)
4681 xmlFree(ctxt->sax);
4682 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4683 if (ctxt->sax == NULL) {
4684 xmlFree(buf);
4685 xmlFree(ctxt);
4686 return(NULL);
4687 }
4688 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4689 if (user_data != NULL)
4690 ctxt->userData = user_data;
4691 }
4692 if (filename == NULL) {
4693 ctxt->directory = NULL;
4694 } else {
4695 ctxt->directory = xmlParserGetDirectory(filename);
4696 }
4697
4698 inputStream = htmlNewInputStream(ctxt);
4699 if (inputStream == NULL) {
4700 xmlFreeParserCtxt(ctxt);
4701 return(NULL);
4702 }
4703
4704 if (filename == NULL)
4705 inputStream->filename = NULL;
4706 else
4707 inputStream->filename = xmlMemStrdup(filename);
4708 inputStream->buf = buf;
4709 inputStream->base = inputStream->buf->buffer->content;
4710 inputStream->cur = inputStream->buf->buffer->content;
4711
4712 inputPush(ctxt, inputStream);
4713
4714 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4715 (ctxt->input->buf != NULL)) {
4716 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4717#ifdef DEBUG_PUSH
4718 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4719#endif
4720 }
4721
4722 return(ctxt);
4723}
4724
4725/**
4726 * htmlSAXParseDoc :
4727 * @cur: a pointer to an array of xmlChar
4728 * @encoding: a free form C string describing the HTML document encoding, or NULL
4729 * @sax: the SAX handler block
4730 * @userData: if using SAX, this pointer will be provided on callbacks.
4731 *
4732 * parse an HTML in-memory document and build a tree.
4733 * It use the given SAX function block to handle the parsing callback.
4734 * If sax is NULL, fallback to the default DOM tree building routines.
4735 *
4736 * Returns the resulting document tree
4737 */
4738
4739htmlDocPtr
4740htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4741 htmlDocPtr ret;
4742 htmlParserCtxtPtr ctxt;
4743
4744 if (cur == NULL) return(NULL);
4745
4746
4747 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4748 if (ctxt == NULL) return(NULL);
4749 if (sax != NULL) {
4750 ctxt->sax = sax;
4751 ctxt->userData = userData;
4752 }
4753
4754 htmlParseDocument(ctxt);
4755 ret = ctxt->myDoc;
4756 if (sax != NULL) {
4757 ctxt->sax = NULL;
4758 ctxt->userData = NULL;
4759 }
4760 htmlFreeParserCtxt(ctxt);
4761
4762 return(ret);
4763}
4764
4765/**
4766 * htmlParseDoc :
4767 * @cur: a pointer to an array of xmlChar
4768 * @encoding: a free form C string describing the HTML document encoding, or NULL
4769 *
4770 * parse an HTML in-memory document and build a tree.
4771 *
4772 * Returns the resulting document tree
4773 */
4774
4775htmlDocPtr
4776htmlParseDoc(xmlChar *cur, const char *encoding) {
4777 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4778}
4779
4780
4781/**
4782 * htmlCreateFileParserCtxt :
4783 * @filename: the filename
4784 * @encoding: a free form C string describing the HTML document encoding, or NULL
4785 *
4786 * Create a parser context for a file content.
4787 * Automatic support for ZLIB/Compress compressed document is provided
4788 * by default if found at compile-time.
4789 *
4790 * Returns the new parser context or NULL
4791 */
4792htmlParserCtxtPtr
4793htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4794{
4795 htmlParserCtxtPtr ctxt;
4796 htmlParserInputPtr inputStream;
4797 xmlParserInputBufferPtr buf;
4798 /* htmlCharEncoding enc; */
4799 xmlChar *content, *content_line = (xmlChar *) "charset=";
4800
4801 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4802 if (buf == NULL) return(NULL);
4803
4804 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4805 if (ctxt == NULL) {
4806 perror("malloc");
4807 return(NULL);
4808 }
4809 memset(ctxt, 0, sizeof(htmlParserCtxt));
4810 htmlInitParserCtxt(ctxt);
4811 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4812 if (inputStream == NULL) {
4813 perror("malloc");
4814 xmlFree(ctxt);
4815 return(NULL);
4816 }
4817 memset(inputStream, 0, sizeof(htmlParserInput));
4818
4819 inputStream->filename = xmlMemStrdup(filename);
4820 inputStream->line = 1;
4821 inputStream->col = 1;
4822 inputStream->buf = buf;
4823 inputStream->directory = NULL;
4824
4825 inputStream->base = inputStream->buf->buffer->content;
4826 inputStream->cur = inputStream->buf->buffer->content;
4827 inputStream->free = NULL;
4828
4829 inputPush(ctxt, inputStream);
4830
4831 /* set encoding */
4832 if (encoding) {
4833 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4834 if (content) {
4835 strcpy ((char *)content, (char *)content_line);
4836 strcat ((char *)content, (char *)encoding);
4837 htmlCheckEncoding (ctxt, content);
4838 xmlFree (content);
4839 }
4840 }
4841
4842 return(ctxt);
4843}
4844
4845/**
4846 * htmlSAXParseFile :
4847 * @filename: the filename
4848 * @encoding: a free form C string describing the HTML document encoding, or NULL
4849 * @sax: the SAX handler block
4850 * @userData: if using SAX, this pointer will be provided on callbacks.
4851 *
4852 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4853 * compressed document is provided by default if found at compile-time.
4854 * It use the given SAX function block to handle the parsing callback.
4855 * If sax is NULL, fallback to the default DOM tree building routines.
4856 *
4857 * Returns the resulting document tree
4858 */
4859
4860htmlDocPtr
4861htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4862 void *userData) {
4863 htmlDocPtr ret;
4864 htmlParserCtxtPtr ctxt;
4865 htmlSAXHandlerPtr oldsax = NULL;
4866
4867 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4868 if (ctxt == NULL) return(NULL);
4869 if (sax != NULL) {
4870 oldsax = ctxt->sax;
4871 ctxt->sax = sax;
4872 ctxt->userData = userData;
4873 }
4874
4875 htmlParseDocument(ctxt);
4876
4877 ret = ctxt->myDoc;
4878 if (sax != NULL) {
4879 ctxt->sax = oldsax;
4880 ctxt->userData = NULL;
4881 }
4882 htmlFreeParserCtxt(ctxt);
4883
4884 return(ret);
4885}
4886
4887/**
4888 * htmlParseFile :
4889 * @filename: the filename
4890 * @encoding: a free form C string describing the HTML document encoding, or NULL
4891 *
4892 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4893 * compressed document is provided by default if found at compile-time.
4894 *
4895 * Returns the resulting document tree
4896 */
4897
4898htmlDocPtr
4899htmlParseFile(const char *filename, const char *encoding) {
4900 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4901}
4902
4903/**
4904 * htmlHandleOmittedElem:
4905 * @val: int 0 or 1
4906 *
4907 * Set and return the previous value for handling HTML omitted tags.
4908 *
4909 * Returns the last value for 0 for no handling, 1 for auto insertion.
4910 */
4911
4912int
4913htmlHandleOmittedElem(int val) {
4914 int old = htmlOmittedDefaultValue;
4915
4916 htmlOmittedDefaultValue = val;
4917 return(old);
4918}
4919
4920#endif /* LIBXML_HTML_ENABLED */