blob: 6bb6a9703ae9cb26c5370a7ae248a60d0e6a2353 [file] [log] [blame]
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
Daniel Veillard3c558c31999-12-22 11:30:41 +000010#include "win32config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000011#else
Daniel Veillard7f7d1111999-09-22 09:46:25 +000012#include "config.h"
Daniel Veillardbe70ff71999-07-05 16:50:46 +000013#endif
Daniel Veillard7f7d1111999-09-22 09:46:25 +000014
Daniel Veillard361d8452000-04-03 19:48:13 +000015#include "xmlversion.h"
16#ifdef LIBXML_HTML_ENABLED
17
Daniel Veillardbe70ff71999-07-05 16:50:46 +000018#include <stdio.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000019#include <string.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000020#ifdef HAVE_CTYPE_H
21#include <ctype.h>
22#endif
23#ifdef HAVE_STDLIB_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000024#include <stdlib.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000025#endif
26#ifdef HAVE_SYS_STAT_H
Daniel Veillardbe70ff71999-07-05 16:50:46 +000027#include <sys/stat.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000028#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +000029#ifdef HAVE_FCNTL_H
30#include <fcntl.h>
31#endif
32#ifdef HAVE_UNISTD_H
33#include <unistd.h>
34#endif
35#ifdef HAVE_ZLIB_H
36#include <zlib.h>
37#endif
38
Daniel Veillard361d8452000-04-03 19:48:13 +000039#include <libxml/xmlmemory.h>
40#include <libxml/tree.h>
41#include <libxml/HTMLparser.h>
42#include <libxml/entities.h>
43#include <libxml/encoding.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000044#include <libxml/parser.h>
Daniel Veillard361d8452000-04-03 19:48:13 +000045#include <libxml/valid.h>
46#include <libxml/parserInternals.h>
47#include <libxml/xmlIO.h>
Daniel Veillard5e5c6231999-12-29 12:49:06 +000048#include "xml-error.h"
Daniel Veillarde2d034d1999-07-27 19:52:06 +000049
/*
 * Size constants for the HTML parser.  INPUT_CHUNK is the amount passed
 * to xmlParserInputGrow() whenever this file asks for more input.
 */
#define HTML_MAX_NAMELEN                1000
#define INPUT_CHUNK                     50
#define HTML_PARSER_BIG_BUFFER_SIZE     1000
#define HTML_PARSER_BUFFER_SIZE         100
Daniel Veillardbe70ff71999-07-05 16:50:46 +000054
Daniel Veillard82150d81999-07-07 07:32:15 +000055/* #define DEBUG */
Daniel Veillard5e5c6231999-12-29 12:49:06 +000056/* #define DEBUG_PUSH */
Daniel Veillard5233ffc1999-07-06 22:25:25 +000057
58/************************************************************************
59 * *
60 * Parser stacks related functions and macros *
61 * *
62 ************************************************************************/
63
64/*
65 * Generic function for accessing stacks in the Parser Context
66 */
67
Daniel Veillarddbfd6411999-12-28 16:35:14 +000068#define PUSH_AND_POP(scope, type, name) \
69scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000070 if (ctxt->name##Nr >= ctxt->name##Max) { \
71 ctxt->name##Max *= 2; \
Daniel Veillard32bc74e2000-07-14 14:49:25 +000072 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000073 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74 if (ctxt->name##Tab == NULL) { \
75 fprintf(stderr, "realloc failed !\n"); \
Daniel Veillard0142b842000-01-14 14:45:24 +000076 return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000077 } \
78 } \
79 ctxt->name##Tab[ctxt->name##Nr] = value; \
80 ctxt->name = value; \
81 return(ctxt->name##Nr++); \
82} \
Daniel Veillarddbfd6411999-12-28 16:35:14 +000083scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000084 type ret; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000085 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000086 ctxt->name##Nr--; \
Daniel Veillard7c1206f1999-10-14 09:10:25 +000087 if (ctxt->name##Nr < 0) return(0); \
Daniel Veillard5233ffc1999-07-06 22:25:25 +000088 if (ctxt->name##Nr > 0) \
89 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
90 else \
91 ctxt->name = NULL; \
92 ret = ctxt->name##Tab[ctxt->name##Nr]; \
93 ctxt->name##Tab[ctxt->name##Nr] = 0; \
94 return(ret); \
95} \
96
Daniel Veillarddbfd6411999-12-28 16:35:14 +000097PUSH_AND_POP(extern, xmlNodePtr, node)
98PUSH_AND_POP(extern, xmlChar*, name)
Daniel Veillard5233ffc1999-07-06 22:25:25 +000099
100/*
101 * Macros for accessing the content. Those should be used only by the parser,
102 * and not exported.
103 *
104 * Dirty macros, i.e. one need to make assumption on the context to use them
105 *
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000106 * CUR_PTR return the current pointer to the xmlChar to be parsed.
107 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000108 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
109 * in UNICODE mode. This should be used internally by the parser
110 * only to compare to ASCII values otherwise it would break when
111 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000114 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000115 * it should be used only to compare on ASCII based substring.
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000116 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000117 * strings within the parser.
118 *
119 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
120 *
121 * CURRENT Returns the current char value, with the full decoding of
122 * UTF-8 if we are using this mode. It returns an int.
123 * NEXT Skip to the next character, this does the proper decoding
124 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000125 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
126 */
127
/*
 * Input access macros.  All of them expect a variable named ctxt (the
 * parser context) to be in scope at the point of use.
 *
 * Beware when using them as the body of an unbraced if/else/loop:
 *   - SKIP expands to an unparenthesized comma expression,
 *   - SKIP_BLANKS, CUR_CHAR and CUR_SCHAR already end with a ';',
 *   - NEXT, NEXTL and COPY_BUF expand to more than one statement.
 */

/* Byte-level access to the current input position. */
#define CUR_PTR         ctxt->input->cur
#define NXT(val)        ctxt->input->cur[(val)]
#define CUR             ((int) (*ctxt->input->cur))
#define CURRENT         ((int) (*ctxt->input->cur))
#define RAW             (ctxt->token ? -1 : (*ctxt->input->cur))

/* Same bytes, passed through toupper(). */
#define UPPER           (toupper(*ctxt->input->cur))
#define UPP(val)        (toupper(ctxt->input->cur[(val)]))

/* Input buffer management. */
#define SHRINK          xmlParserInputShrink(ctxt->input)
#define GROW            xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Moving forward in the input; nbChars is kept up to date. */
#define SKIP(val)       ctxt->nbChars += (val),ctxt->input->cur += (val)
#define SKIP_BLANKS     htmlSkipBlankChars(ctxt);
#define NEXT            xmlNextChar(ctxt);ctxt->nbChars++;

/*
 * Imported from the XML parser.
 *
 * NEXTL(l) steps over the current char, which occupies l bytes: line and
 * column are updated, any pending token is cleared and nbChars grows by 1.
 */
#define NEXTL(l)                                                        \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;

/* Decode the current char; its byte length is stored into l. */
#define CUR_CHAR(l)     htmlCurrentChar(ctxt, &l);
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l);

/* Append char v, whose encoded length is l, to buffer b at index i. */
#define COPY_BUF(l,b,i,v)                                               \
    if (l == 1) b[i++] = (xmlChar) v;                                   \
    else i += xmlCopyChar(l,&b[i],v);
180
181/**
182 * htmlCurrentChar:
183 * @ctxt: the HTML parser context
184 * @len: pointer to the length of the char read
185 *
186 * The current char value, if using UTF-8 this may actaully span multiple
187 * bytes in the input buffer. Implement the end of line normalization:
188 * 2.11 End-of-Line Handling
189 * If the encoding is unspecified, in the case we find an ISO-Latin-1
190 * char, then the encoding converter is plugged in automatically.
191 *
192 * Returns the current char value and its lenght
193 */
194
195int
196htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
197 if (ctxt->instate == XML_PARSER_EOF)
198 return(0);
199
200 if (ctxt->token != 0) {
201 *len = 0;
202 return(ctxt->token);
203 }
204 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
205 /*
206 * We are supposed to handle UTF8, check it's valid
207 * From rfc2044: encoding of the Unicode values on UTF-8:
208 *
209 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
210 * 0000 0000-0000 007F 0xxxxxxx
211 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
212 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
213 *
214 * Check for the 0x110000 limit too
215 */
216 const unsigned char *cur = ctxt->input->cur;
217 unsigned char c;
218 unsigned int val;
219
220 c = *cur;
221 if (c & 0x80) {
222 if (cur[1] == 0)
223 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
224 if ((cur[1] & 0xc0) != 0x80)
225 goto encoding_error;
226 if ((c & 0xe0) == 0xe0) {
227
228 if (cur[2] == 0)
229 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230 if ((cur[2] & 0xc0) != 0x80)
231 goto encoding_error;
232 if ((c & 0xf0) == 0xf0) {
233 if (cur[3] == 0)
234 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
235 if (((c & 0xf8) != 0xf0) ||
236 ((cur[3] & 0xc0) != 0x80))
237 goto encoding_error;
238 /* 4-byte code */
239 *len = 4;
240 val = (cur[0] & 0x7) << 18;
241 val |= (cur[1] & 0x3f) << 12;
242 val |= (cur[2] & 0x3f) << 6;
243 val |= cur[3] & 0x3f;
244 } else {
245 /* 3-byte code */
246 *len = 3;
247 val = (cur[0] & 0xf) << 12;
248 val |= (cur[1] & 0x3f) << 6;
249 val |= cur[2] & 0x3f;
250 }
251 } else {
252 /* 2-byte code */
253 *len = 2;
254 val = (cur[0] & 0x1f) << 6;
255 val |= cur[1] & 0x3f;
256 }
257 if (!IS_CHAR(val)) {
258 if ((ctxt->sax != NULL) &&
259 (ctxt->sax->error != NULL))
260 ctxt->sax->error(ctxt->userData,
261 "Char 0x%X out of allowed range\n", val);
262 ctxt->errNo = XML_ERR_INVALID_ENCODING;
263 ctxt->wellFormed = 0;
264 ctxt->disableSAX = 1;
265 }
266 return(val);
267 } else {
268 /* 1-byte code */
269 *len = 1;
270 return((int) *ctxt->input->cur);
271 }
272 }
273 /*
274 * Assume it's a fixed lenght encoding (1) with
275 * a compatibke encoding for the ASCII set, since
276 * XML constructs only use < 128 chars
277 */
278 *len = 1;
279 if ((int) *ctxt->input->cur < 0x80)
280 return((int) *ctxt->input->cur);
281
282 /*
283 * Humm this is bad, do an automatic flow conversion
284 */
285 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
286 ctxt->charset = XML_CHAR_ENCODING_UTF8;
287 return(xmlCurrentChar(ctxt, len));
288
289encoding_error:
290 /*
291 * If we detect an UTF8 error that probably mean that the
292 * input encoding didn't get properly advertized in the
293 * declaration header. Report the error and switch the encoding
294 * to ISO-Latin-1 (if you don't like this policy, just declare the
295 * encoding !)
296 */
297 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
298 ctxt->sax->error(ctxt->userData,
299 "Input is not proper UTF-8, indicate encoding !\n");
300 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
301 ctxt->input->cur[0], ctxt->input->cur[1],
302 ctxt->input->cur[2], ctxt->input->cur[3]);
303 }
304 ctxt->errNo = XML_ERR_INVALID_ENCODING;
305
306 ctxt->charset = XML_CHAR_ENCODING_8859_1;
307 *len = 1;
308 return((int) *ctxt->input->cur);
309}
310
Daniel Veillardcf461992000-03-14 18:30:20 +0000311/**
312 * htmlNextChar:
313 * @ctxt: the HTML parser context
314 *
315 * Skip to the next char input char.
316 */
317
318void
319htmlNextChar(htmlParserCtxtPtr ctxt) {
Daniel Veillard3f6f7f62000-06-30 17:58:25 +0000320 if (ctxt->instate == XML_PARSER_EOF)
321 return;
Daniel Veillardcf461992000-03-14 18:30:20 +0000322 if ((*ctxt->input->cur == 0) &&
323 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
324 xmlPopInput(ctxt);
325 } else {
326 if (*(ctxt->input->cur) == '\n') {
327 ctxt->input->line++; ctxt->input->col = 1;
328 } else ctxt->input->col++;
329 ctxt->input->cur++;
330 ctxt->nbChars++;
331 if (*ctxt->input->cur == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 }
334}
335
336/**
337 * htmlSkipBlankChars:
338 * @ctxt: the HTML parser context
339 *
340 * skip all blanks character found at that point in the input streams.
341 *
342 * Returns the number of space chars skipped
343 */
344
345int
346htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
347 int res = 0;
348
349 while (IS_BLANK(*(ctxt->input->cur))) {
350 if ((*ctxt->input->cur == 0) &&
351 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
352 xmlPopInput(ctxt);
353 } else {
354 if (*(ctxt->input->cur) == '\n') {
355 ctxt->input->line++; ctxt->input->col = 1;
356 } else ctxt->input->col++;
357 ctxt->input->cur++;
358 ctxt->nbChars++;
359 if (*ctxt->input->cur == 0)
360 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
361 }
362 res++;
363 }
364 return(res);
365}
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000366
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000367
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000368
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000369/************************************************************************
370 * *
371 * The list of HTML elements and their properties *
372 * *
373 ************************************************************************/
374
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000375/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000376 * Start Tag: 1 means the start tag can be ommited
377 * End Tag: 1 means the end tag can be ommited
378 * 2 means it's forbidden (empty elements)
379 * Depr: this element is deprecated
380 * DTD: 1 means that this element is valid only in the Loose DTD
381 * 2 means that this element is valid only in the Frameset DTD
382 *
383 * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000384 */
385htmlElemDesc html40ElementTable[] = {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +0000386{ "a", 0, 0, 0, 0, 0, "anchor " },
387{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
388{ "acronym", 0, 0, 0, 0, 0, "" },
389{ "address", 0, 0, 0, 0, 0, "information on author " },
390{ "applet", 0, 0, 0, 1, 1, "java applet " },
391{ "area", 0, 2, 1, 0, 0, "client-side image map area " },
392{ "b", 0, 0, 0, 0, 0, "bold text style" },
393{ "base", 0, 2, 1, 0, 0, "document base uri " },
394{ "basefont", 0, 2, 1, 1, 1, "base font size " },
395{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
396{ "big", 0, 0, 0, 0, 0, "large text style" },
397{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },
398{ "body", 1, 1, 0, 0, 0, "document body " },
399{ "br", 0, 2, 1, 0, 0, "forced line break " },
400{ "button", 0, 0, 0, 0, 0, "push button " },
401{ "caption", 0, 0, 0, 0, 0, "table caption " },
402{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
403{ "cite", 0, 0, 0, 0, 0, "citation" },
404{ "code", 0, 0, 0, 0, 0, "computer code fragment" },
405{ "col", 0, 2, 1, 0, 0, "table column " },
406{ "colgroup", 0, 1, 0, 0, 0, "table column group " },
407{ "dd", 0, 1, 0, 0, 0, "definition description " },
408{ "del", 0, 0, 0, 0, 0, "deleted text " },
409{ "dfn", 0, 0, 0, 0, 0, "instance definition" },
410{ "dir", 0, 0, 0, 1, 1, "directory list" },
411{ "div", 0, 0, 0, 0, 0, "generic language/style container"},
412{ "dl", 0, 0, 0, 0, 0, "definition list " },
413{ "dt", 0, 1, 0, 0, 0, "definition term " },
414{ "em", 0, 0, 0, 0, 0, "emphasis" },
415{ "fieldset", 0, 0, 0, 0, 0, "form control group " },
416{ "font", 0, 0, 0, 1, 1, "local change to font " },
417{ "form", 0, 0, 0, 0, 0, "interactive form " },
418{ "frame", 0, 2, 1, 0, 2, "subwindow " },
419{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },
420{ "h1", 0, 0, 0, 0, 0, "heading " },
421{ "h2", 0, 0, 0, 0, 0, "heading " },
422{ "h3", 0, 0, 0, 0, 0, "heading " },
423{ "h4", 0, 0, 0, 0, 0, "heading " },
424{ "h5", 0, 0, 0, 0, 0, "heading " },
425{ "h6", 0, 0, 0, 0, 0, "heading " },
426{ "head", 1, 1, 0, 0, 0, "document head " },
427{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },
428{ "html", 1, 1, 0, 0, 0, "document root element " },
429{ "i", 0, 0, 0, 0, 0, "italic text style" },
430{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
431{ "img", 0, 2, 1, 0, 0, "embedded image " },
432{ "input", 0, 2, 1, 0, 0, "form control " },
433{ "ins", 0, 0, 0, 0, 0, "inserted text" },
434{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },
435{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
436{ "label", 0, 0, 0, 0, 0, "form field label text " },
437{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },
438{ "li", 0, 1, 0, 0, 0, "list item " },
439{ "link", 0, 2, 1, 0, 0, "a media-independent link " },
440{ "map", 0, 0, 0, 0, 0, "client-side image map " },
441{ "menu", 0, 0, 0, 1, 1, "menu list " },
442{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },
443{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
444{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
445{ "object", 0, 0, 0, 0, 0, "generic embedded object " },
446{ "ol", 0, 0, 0, 0, 0, "ordered list " },
447{ "optgroup", 0, 0, 0, 0, 0, "option group " },
448{ "option", 0, 1, 0, 0, 0, "selectable choice " },
449{ "p", 0, 1, 0, 0, 0, "paragraph " },
450{ "param", 0, 2, 1, 0, 0, "named property value " },
451{ "pre", 0, 0, 0, 0, 0, "preformatted text " },
452{ "q", 0, 0, 0, 0, 0, "short inline quotation " },
453{ "s", 0, 0, 0, 1, 1, "strike-through text style" },
454{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
455{ "script", 0, 0, 0, 0, 0, "script statements " },
456{ "select", 0, 0, 0, 0, 0, "option selector " },
457{ "small", 0, 0, 0, 0, 0, "small text style" },
458{ "span", 0, 0, 0, 0, 0, "generic language/style container " },
459{ "strike", 0, 0, 0, 1, 1, "strike-through text" },
460{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },
461{ "style", 0, 0, 0, 0, 0, "style info " },
462{ "sub", 0, 0, 0, 0, 0, "subscript" },
463{ "sup", 0, 0, 0, 0, 0, "superscript " },
464{ "table", 0, 0, 0, 0, 0, "&#160;" },
465{ "tbody", 1, 1, 0, 0, 0, "table body " },
466{ "td", 0, 1, 0, 0, 0, "table data cell" },
467{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
468{ "tfoot", 0, 1, 0, 0, 0, "table footer " },
469{ "th", 0, 1, 0, 0, 0, "table header cell" },
470{ "thead", 0, 1, 0, 0, 0, "table header " },
471{ "title", 0, 0, 0, 0, 0, "document title " },
472{ "tr", 0, 1, 0, 0, 0, "table row " },
473{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
474{ "u", 0, 0, 0, 1, 1, "underlined text style" },
475{ "ul", 0, 0, 0, 0, 0, "unordered list " },
476{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000477};
478
/*
 * Start tags that imply the end of a current element.
 *
 * The array is a sequence of NULL-terminated groups, itself closed by an
 * extra NULL: any tag of a group implies the end of the current element
 * if the type of that element belongs to the same group.
 *
 * According to the HTML DTD, HR should be added to the heading group, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
char *htmlEquEnd[] = {
    /* list-like items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of table */
    NULL
};
495
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups, itself closed by an
 * extra NULL.  The first name of a group is the start tag being opened,
 * the names following it are the currently open elements which that
 * start tag closes (see htmlCheckAutoClose()).
 */
char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", NULL,
    "td",
        "th", "td", "p", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    /* end of table */
    NULL
};
554
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DtD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = { "html", "head", "body", NULL };
568
Daniel Veillardbe803962000-06-28 23:40:59 +0000569
/*
 * Index over htmlStartClose: each used slot points at the first name of
 * one group of that array.  The flag is tested before the index is used.
 */
static int htmlStartCloseIndexinitialized = 0;
static char **htmlStartCloseIndex[100];
572
573/************************************************************************
574 * *
575 * functions to handle HTML specific data *
576 * *
577 ************************************************************************/
578
579/**
580 * htmlInitAutoClose:
581 *
582 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
583 *
584 */
585void
586htmlInitAutoClose(void) {
587 int index, i = 0;
588
589 if (htmlStartCloseIndexinitialized) return;
590
591 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
592 index = 0;
593 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
594 htmlStartCloseIndex[index++] = &htmlStartClose[i];
595 while (htmlStartClose[i] != NULL) i++;
596 i++;
597 }
598}
599
600/**
601 * htmlTagLookup:
602 * @tag: The tag name
603 *
604 * Lookup the HTML tag in the ElementTable
605 *
606 * Returns the related htmlElemDescPtr or NULL if not found.
607 */
608htmlElemDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +0000609htmlTagLookup(const xmlChar *tag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000610 int i = 0;
611
612 for (i = 0; i < (sizeof(html40ElementTable) /
613 sizeof(html40ElementTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +0000614 if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000615 return(&html40ElementTable[i]);
616 }
617 return(NULL);
618}
619
620/**
621 * htmlCheckAutoClose:
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000622 * @newtag: The new tag name
623 * @oldtag: The old tag name
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000624 *
625 * Checks wether the new tag is one of the registered valid tags for closing old.
626 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
627 *
628 * Returns 0 if no, 1 if yes.
629 */
630int
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000631htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000632 int i, index;
Daniel Veillardb96e6431999-08-29 21:02:19 +0000633 char **close;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000634
635 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
636
637 /* inefficient, but not a big deal */
638 for (index = 0; index < 100;index++) {
639 close = htmlStartCloseIndex[index];
640 if (close == NULL) return(0);
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000641 if (!xmlStrcmp(BAD_CAST *close, newtag)) break;
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000642 }
643
644 i = close - htmlStartClose;
645 i++;
646 while (htmlStartClose[i] != NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000647 if (!xmlStrcmp(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000648 return(1);
649 }
650 i++;
651 }
652 return(0);
653}
654
655/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000656 * htmlAutoCloseOnClose:
657 * @ctxt: an HTML parser context
658 * @newtag: The new tag name
659 *
660 * The HTmL DtD allows an ending tag to implicitely close other tags.
661 */
662void
663htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
664 htmlElemDescPtr info;
665 xmlChar *oldname;
666 int i;
667
668#ifdef DEBUG
669 fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
670 for (i = 0;i < ctxt->nameNr;i++)
671 fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
672#endif
673
674 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
675 if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break;
676 }
677 if (i < 0) return;
678
679 while (xmlStrcmp(newtag, ctxt->name)) {
680 info = htmlTagLookup(ctxt->name);
681 if ((info == NULL) || (info->endTag == 1)) {
682#ifdef DEBUG
683 fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
684#endif
685 } else {
686 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
687 ctxt->sax->error(ctxt->userData,
688 "Opening and ending tag mismatch: %s and %s\n",
689 newtag, ctxt->name);
690 ctxt->wellFormed = 0;
691 }
692 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
693 ctxt->sax->endElement(ctxt->userData, ctxt->name);
694 oldname = htmlnamePop(ctxt);
695 if (oldname != NULL) {
696#ifdef DEBUG
697 fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
698#endif
699 xmlFree(oldname);
700 }
701 }
702}
703
704/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000705 * htmlAutoClose:
706 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000707 * @newtag: The new tag name or NULL
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000708 *
709 * The HTmL DtD allows a tag to implicitely close other tags.
710 * The list is kept in htmlStartClose array. This function is
711 * called when a new tag has been detected and generates the
712 * appropriates closes if possible/needed.
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000713 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillard365e13b2000-07-02 07:56:37 +0000714 * and we should check
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000715 */
716void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000717htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000718 xmlChar *oldname;
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000719 while ((newtag != NULL) && (ctxt->name != NULL) &&
720 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000721#ifdef DEBUG
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000722 fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000723#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000724 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
Daniel Veillard2673d3c1999-10-08 14:37:09 +0000725 ctxt->sax->endElement(ctxt->userData, ctxt->name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +0000726 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +0000727 if (oldname != NULL) {
728#ifdef DEBUG
729 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
730#endif
731 xmlFree(oldname);
732 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000733 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000734 if (newtag == NULL) {
735 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
736 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
737 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
738 }
739 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard365e13b2000-07-02 07:56:37 +0000740 ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
741 (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
742 (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) {
743#ifdef DEBUG
744 fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
745#endif
746 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
747 ctxt->sax->endElement(ctxt->userData, ctxt->name);
748 oldname = htmlnamePop(ctxt);
749 if (oldname != NULL) {
750#ifdef DEBUG
751 fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
752#endif
753 xmlFree(oldname);
754 }
755 }
756
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000757}
758
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000759/**
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000760 * htmlAutoCloseTag:
761 * @doc: the HTML document
762 * @name: The tag name
763 * @elem: the HTML element
764 *
765 * The HTmL DtD allows a tag to implicitely close other tags.
766 * The list is kept in htmlStartClose array. This function checks
767 * if the element or one of it's children would autoclose the
768 * given tag.
769 *
770 * Returns 1 if autoclose, 0 otherwise
771 */
772int
773htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
774 htmlNodePtr child;
775
776 if (elem == NULL) return(1);
777 if (!xmlStrcmp(name, elem->name)) return(0);
778 if (htmlCheckAutoClose(elem->name, name)) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000779 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000780 while (child != NULL) {
781 if (htmlAutoCloseTag(doc, name, child)) return(1);
782 child = child->next;
783 }
784 return(0);
785}
786
787/**
788 * htmlIsAutoClosed:
789 * @doc: the HTML document
790 * @elem: the HTML element
791 *
792 * The HTmL DtD allows a tag to implicitely close other tags.
793 * The list is kept in htmlStartClose array. This function checks
794 * if a tag is autoclosed by one of it's child
795 *
796 * Returns 1 if autoclosed, 0 otherwise
797 */
798int
799htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
800 htmlNodePtr child;
801
802 if (elem == NULL) return(1);
Daniel Veillardcf461992000-03-14 18:30:20 +0000803 child = elem->children;
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000804 while (child != NULL) {
805 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
806 child = child->next;
807 }
808 return(0);
809}
810
811/**
Daniel Veillardbe803962000-06-28 23:40:59 +0000812 * htmlCheckImplied:
813 * @ctxt: an HTML parser context
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000814 * @newtag: The new tag name
Daniel Veillardbe803962000-06-28 23:40:59 +0000815 *
816 * The HTmL DtD allows a tag to exists only implicitely
817 * called when a new tag has been detected and generates the
818 * appropriates implicit tags if missing
819 */
820void
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000821htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
822 if (!xmlStrcmp(newtag, BAD_CAST"html"))
Daniel Veillardbe803962000-06-28 23:40:59 +0000823 return;
824 if (ctxt->nameNr <= 0) {
825#ifdef DEBUG
826 fprintf(stderr,"Implied element html: pushed html\n");
827#endif
828 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
829 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
830 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
831 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000832 if ((!xmlStrcmp(newtag, BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head")))
Daniel Veillardbe803962000-06-28 23:40:59 +0000833 return;
834 if (ctxt->nameNr <= 1) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000835 if ((!xmlStrcmp(newtag, BAD_CAST"script")) ||
836 (!xmlStrcmp(newtag, BAD_CAST"style")) ||
837 (!xmlStrcmp(newtag, BAD_CAST"meta")) ||
838 (!xmlStrcmp(newtag, BAD_CAST"link")) ||
839 (!xmlStrcmp(newtag, BAD_CAST"title")) ||
840 (!xmlStrcmp(newtag, BAD_CAST"base"))) {
Daniel Veillardbe803962000-06-28 23:40:59 +0000841 /*
842 * dropped OBJECT ... i you put it first BODY will be
843 * assumed !
844 */
845#ifdef DEBUG
846 fprintf(stderr,"Implied element head: pushed head\n");
847#endif
848 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
849 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
850 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
851 } else {
852#ifdef DEBUG
853 fprintf(stderr,"Implied element body: pushed body\n");
854#endif
855 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
856 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
857 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
858 }
859 }
860}
861
Daniel Veillardb8f25c92000-08-19 19:52:36 +0000862/**
863 * htmlCheckParagraph
864 * @ctxt: an HTML parser context
865 *
866 * Check whether a p element need to be implied before inserting
867 * characters in the current element.
868 *
869 * Returns 1 if a paragraph has been inserted, 0 if not and -1
870 * in case of error.
871 */
872
873int
874htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
875 const xmlChar *tag;
876 int i;
877
878 if (ctxt == NULL)
879 return(-1);
880 tag = ctxt->name;
881 if (tag == NULL) {
882 htmlAutoClose(ctxt, BAD_CAST"p");
883 htmlCheckImplied(ctxt, BAD_CAST"p");
884 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
885 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
886 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
887 return(1);
888 }
889 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
890 if (!xmlStrcmp(tag, BAD_CAST htmlNoContentElements[i])) {
891#ifdef DEBUG
892 fprintf(stderr,"Implied element paragraph\n");
893#endif
894 htmlAutoClose(ctxt, BAD_CAST"p");
895 htmlCheckImplied(ctxt, BAD_CAST"p");
896 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
897 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
898 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
899 return(1);
900 }
901 }
902 return(0);
903}
904
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000905/************************************************************************
906 * *
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000907 * The list of HTML predefined entities *
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000908 * *
909 ************************************************************************/
910
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000911
912htmlEntityDesc html40EntitiesTable[] = {
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000913/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000914 * the 4 absolute ones,
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000915 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000916{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
917{ 38, "amp", "ampersand, U+0026 ISOnum" },
918{ 60, "lt", "less-than sign, U+003C ISOnum" },
919{ 62, "gt", "greater-than sign, U+003E ISOnum" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000920
921/*
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000922 * A bunch still in the 128-255 range
923 * Replacing them depend really on the charset used.
Daniel Veillardbe70ff71999-07-05 16:50:46 +0000924 */
Daniel Veillard5cb5ab81999-12-21 15:35:29 +0000925{ 39, "apos", "single quote" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000926{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
927{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
928{ 162, "cent", "cent sign, U+00A2 ISOnum" },
929{ 163, "pound","pound sign, U+00A3 ISOnum" },
930{ 164, "curren","currency sign, U+00A4 ISOnum" },
931{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
932{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
933{ 167, "sect", "section sign, U+00A7 ISOnum" },
934{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
935{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
936{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
937{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
938{ 172, "not", "not sign, U+00AC ISOnum" },
939{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
940{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
941{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
942{ 176, "deg", "degree sign, U+00B0 ISOnum" },
943{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
944{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
945{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
946{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
947{ 181, "micro","micro sign, U+00B5 ISOnum" },
948{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000949{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000950{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
951{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
952{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000953{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000954{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
955{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
956{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
957{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
958{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
959{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
960{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
961{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
962{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
963{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
964{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
965{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
966{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
967{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
968{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
969{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
970{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
971{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
972{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
973{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
974{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
975{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
976{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
977{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
978{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
979{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
980{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
981{ 215, "times","multiplication sign, U+00D7 ISOnum" },
Daniel Veillardb05deb71999-08-10 19:04:08 +0000982{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +0000983{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
984{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
985{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
986{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
987{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
988{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
989{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
990{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
991{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
992{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
993{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
994{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
995{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
996{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
997{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
998{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
999{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1000{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1001{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1002{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1003{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1004{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1005{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1006{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1007{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1008{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1009{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1010{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1011{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1012{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1013{ 247, "divide","division sign, U+00F7 ISOnum" },
1014{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1015{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1016{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1017{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1018{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1019{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1020{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1021{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001022
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001023/*
1024 * Anything below should really be kept as entities references
1025 */
1026{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001027
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001028{ 913, "Alpha","greek capital letter alpha, U+0391" },
1029{ 914, "Beta", "greek capital letter beta, U+0392" },
1030{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1031{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1032{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1033{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1034{ 919, "Eta", "greek capital letter eta, U+0397" },
1035{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1036{ 921, "Iota", "greek capital letter iota, U+0399" },
1037{ 922, "Kappa","greek capital letter kappa, U+039A" },
1038{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1039{ 924, "Mu", "greek capital letter mu, U+039C" },
1040{ 925, "Nu", "greek capital letter nu, U+039D" },
1041{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1042{ 927, "Omicron","greek capital letter omicron, U+039F" },
1043{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1044{ 929, "Rho", "greek capital letter rho, U+03A1" },
1045{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1046{ 932, "Tau", "greek capital letter tau, U+03A4" },
1047{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1048{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1049{ 935, "Chi", "greek capital letter chi, U+03A7" },
1050{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1051{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001052
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001053{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1054{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1055{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1056{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1057{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1058{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1059{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1060{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1061{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1062{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1063{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1064{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1065{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1066{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1067{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1068{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1069{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1070{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1071{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1072{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1073{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1074{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1075{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1076{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1077{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1078{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1079{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1080{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001081
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001082{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1083{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1084{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1085{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1086{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1087{ 8260, "frasl","fraction slash, U+2044 NEW" },
1088
Daniel Veillardb05deb71999-08-10 19:04:08 +00001089{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001090{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1091{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1092{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1093{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1094{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1095{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1096{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1097{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1098{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1099{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1100{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1101{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1102{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1103{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1104{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1105
1106
1107{ 8704, "forall","for all, U+2200 ISOtech" },
1108{ 8706, "part", "partial differential, U+2202 ISOtech" },
1109{ 8707, "exist","there exists, U+2203 ISOtech" },
1110{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1111{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1112{ 8712, "isin", "element of, U+2208 ISOtech" },
1113{ 8713, "notin","not an element of, U+2209 ISOtech" },
1114{ 8715, "ni", "contains as member, U+220B ISOtech" },
1115{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1116{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1117{ 8722, "minus","minus sign, U+2212 ISOtech" },
1118{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1119{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1120{ 8733, "prop", "proportional to, U+221D ISOtech" },
1121{ 8734, "infin","infinity, U+221E ISOtech" },
1122{ 8736, "ang", "angle, U+2220 ISOamso" },
1123{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1124{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1125{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1126{ 8746, "cup", "union = cup, U+222A ISOtech" },
1127{ 8747, "int", "integral, U+222B ISOtech" },
1128{ 8756, "there4","therefore, U+2234 ISOtech" },
1129{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1130{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1131{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1132{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1133{ 8801, "equiv","identical to, U+2261 ISOtech" },
1134{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1135{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1136{ 8834, "sub", "subset of, U+2282 ISOtech" },
1137{ 8835, "sup", "superset of, U+2283 ISOtech" },
1138{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1139{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1140{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1141{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1142{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1143{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1144{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1145{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1146{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1147{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1148{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1149{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1150{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1151{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1152
1153{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1154{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1155{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1156{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1157
1158{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1159{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1160{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1161{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1162{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1163{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1164{ 732, "tilde","small tilde, U+02DC ISOdia" },
1165
1166{ 8194, "ensp", "en space, U+2002 ISOpub" },
1167{ 8195, "emsp", "em space, U+2003 ISOpub" },
1168{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1169{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1170{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1171{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1172{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1173{ 8211, "ndash","en dash, U+2013 ISOpub" },
1174{ 8212, "mdash","em dash, U+2014 ISOpub" },
1175{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1176{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1177{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1178{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1179{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1180{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1181{ 8224, "dagger","dagger, U+2020 ISOpub" },
1182{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1183{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1184{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
Daniel Veillardb05deb71999-08-10 19:04:08 +00001185{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001186{ 8364, "euro", "euro sign, U+20AC NEW" }
1187};
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001188
1189/************************************************************************
1190 * *
1191 * Commodity functions to handle entities *
1192 * *
1193 ************************************************************************/
1194
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> accordingly. The
 * result of xmlRealloc() is first kept in a temporary so that, when
 * the reallocation fails, the original block can still be released
 * instead of being leaked. On failure the enclosing function returns
 * NULL, so the macro may only be used in functions returning a
 * pointer and owning <buffer>.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_tmp;						\
    buffer##_size *= 2;							\
    buffer##_tmp = (xmlChar *)						\
	xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));		\
    if (buffer##_tmp == NULL) {						\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_tmp;						\
}
1206
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001207/**
1208 * htmlEntityLookup:
1209 * @name: the entity name
1210 *
1211 * Lookup the given entity in EntitiesTable
1212 *
1213 * TODO: the linear scan is really ugly, an hash table is really needed.
1214 *
1215 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1216 */
1217htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001218htmlEntityLookup(const xmlChar *name) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001219 int i;
1220
1221 for (i = 0;i < (sizeof(html40EntitiesTable)/
1222 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00001223 if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001224#ifdef DEBUG
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001225 fprintf(stderr,"Found entity %s\n", name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001226#endif
1227 return(&html40EntitiesTable[i]);
1228 }
1229 }
1230 return(NULL);
1231}
1232
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001233/**
1234 * UTF8ToHtml:
1235 * @out: a pointer to an array of bytes to store the result
1236 * @outlen: the length of @out
1237 * @in: a pointer to an array of UTF-8 chars
1238 * @inlen: the length of @in
1239 *
1240 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1241 * plus HTML entities block of chars out.
1242 *
1243 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1244 * The value of @inlen after return is the number of octets consumed
1245 * as the return value is positive, else unpredictiable.
1246 * The value of @outlen after return is the number of octets consumed.
1247 */
1248int
1249UTF8ToHtml(unsigned char* out, int *outlen,
1250 const unsigned char* in, int *inlen) {
1251 const unsigned char* processed = in;
1252 const unsigned char* outend;
1253 const unsigned char* outstart = out;
1254 const unsigned char* instart = in;
1255 const unsigned char* inend;
1256 unsigned int c, d;
1257 int trailing;
1258
1259 if (in == NULL) {
1260 /*
1261 * initialization nothing to do
1262 */
1263 *outlen = 0;
1264 *inlen = 0;
1265 return(0);
1266 }
1267 inend = in + (*inlen);
1268 outend = out + (*outlen);
1269 while (in < inend) {
1270 d = *in++;
1271 if (d < 0x80) { c= d; trailing= 0; }
1272 else if (d < 0xC0) {
1273 /* trailing byte in leading position */
1274 *outlen = out - outstart;
1275 *inlen = processed - instart;
1276 return(-2);
1277 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1278 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1279 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1280 else {
1281 /* no chance for this in Ascii */
1282 *outlen = out - outstart;
1283 *inlen = processed - instart;
1284 return(-2);
1285 }
1286
1287 if (inend - in < trailing) {
1288 break;
1289 }
1290
1291 for ( ; trailing; trailing--) {
1292 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1293 break;
1294 c <<= 6;
1295 c |= d & 0x3F;
1296 }
1297
1298 /* assertion: c is a single UTF-4 value */
1299 if (c < 0x80) {
1300 if (out >= outend)
1301 break;
1302 *out++ = c;
1303 } else {
1304 int i, j, len;
1305 /*
1306 * Try to lookup a predefined HTML entity for it
1307 */
1308
1309 for (i = 0;i < (sizeof(html40EntitiesTable)/
1310 sizeof(html40EntitiesTable[0]));i++) {
1311 if (html40EntitiesTable[i].value == c) {
1312#ifdef DEBUG
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001313 fprintf(stderr,"Found entity %s\n",
1314 html40EntitiesTable[i].name);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001315#endif
1316 goto found_ent;
1317 }
1318 if (html40EntitiesTable[i].value > c)
1319 break;
1320 }
1321
1322 /* no chance for this in Ascii */
1323 *outlen = out - outstart;
1324 *inlen = processed - instart;
1325 return(-2);
1326found_ent:
1327 len = strlen(html40EntitiesTable[i].name);
1328 if (out + 2 + len >= outend)
1329 break;
1330 *out++ = '&';
1331 for (j = 0;j < len;j++)
1332 *out++ = html40EntitiesTable[i].name[j];
1333 *out++ = ';';
1334 }
1335 processed = in;
1336 }
1337 *outlen = out - outstart;
1338 *inlen = processed - instart;
1339 return(0);
1340}
1341
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001342
1343/**
1344 * htmlDecodeEntities:
1345 * @ctxt: the parser context
1346 * @len: the len to decode (in bytes !), -1 for no size limit
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001347 * @end: an end marker xmlChar, 0 if none
1348 * @end2: an end marker xmlChar, 0 if none
1349 * @end3: an end marker xmlChar, 0 if none
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001350 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001351 * Subtitute the HTML entities by their value
1352 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001353 * DEPRECATED !!!!
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001354 *
1355 * Returns A newly allocated string with the substitution done. The caller
1356 * must deallocate it !
1357 */
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001358xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001359htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001360 xmlChar end, xmlChar end2, xmlChar end3) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001361 xmlChar *name = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001362 xmlChar *buffer = NULL;
1363 unsigned int buffer_size = 0;
1364 unsigned int nbchars = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001365 htmlEntityDescPtr ent;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001366 unsigned int max = (unsigned int) len;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001367 int c,l;
1368
1369 if (ctxt->depth > 40) {
1370 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1371 ctxt->sax->error(ctxt->userData,
1372 "Detected entity reference loop\n");
1373 ctxt->wellFormed = 0;
1374 ctxt->disableSAX = 1;
1375 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1376 return(NULL);
1377 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001378
1379 /*
1380 * allocate a translation buffer.
1381 */
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001382 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001383 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001384 if (buffer == NULL) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001385 perror("xmlDecodeEntities: malloc failed");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001386 return(NULL);
1387 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001388
1389 /*
1390 * Ok loop until we reach one of the ending char or a size limit.
1391 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001392 c = CUR_CHAR(l);
1393 while ((nbchars < max) && (c != end) &&
1394 (c != end2) && (c != end3)) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001395
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001396 if (c == 0) break;
1397 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1398 int val = htmlParseCharRef(ctxt);
1399 COPY_BUF(0,buffer,nbchars,val);
1400 NEXTL(l);
1401 } else if ((c == '&') && (ctxt->token != '&')) {
1402 ent = htmlParseEntityRef(ctxt, &name);
1403 if (name != NULL) {
1404 if (ent != NULL) {
1405 int val = ent->value;
1406 COPY_BUF(0,buffer,nbchars,val);
1407 NEXTL(l);
1408 } else {
1409 const xmlChar *cur = name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001410
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001411 buffer[nbchars++] = '&';
1412 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1413 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001414 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001415 while (*cur != 0) {
1416 buffer[nbchars++] = *cur++;
1417 }
1418 buffer[nbchars++] = ';';
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001419 }
1420 }
1421 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001422 COPY_BUF(l,buffer,nbchars,c);
1423 NEXTL(l);
1424 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001425 growBuffer(buffer);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001426 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001427 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001428 c = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001429 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001430 buffer[nbchars++] = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001431 return(buffer);
1432}
1433
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001434/************************************************************************
1435 * *
1436 * Commodity functions to handle streams *
1437 * *
1438 ************************************************************************/
1439
1440/**
1441 * htmlFreeInputStream:
1442 * @input: an htmlParserInputPtr
1443 *
1444 * Free up an input stream.
1445 */
1446void
1447htmlFreeInputStream(htmlParserInputPtr input) {
1448 if (input == NULL) return;
1449
1450 if (input->filename != NULL) xmlFree((char *) input->filename);
1451 if (input->directory != NULL) xmlFree((char *) input->directory);
1452 if ((input->free != NULL) && (input->base != NULL))
1453 input->free((xmlChar *) input->base);
1454 if (input->buf != NULL)
1455 xmlFreeParserInputBuffer(input->buf);
1456 memset(input, -1, sizeof(htmlParserInput));
1457 xmlFree(input);
1458}
1459
1460/**
1461 * htmlNewInputStream:
1462 * @ctxt: an HTML parser context
1463 *
1464 * Create a new input stream structure
1465 * Returns the new input stream or NULL
1466 */
1467htmlParserInputPtr
1468htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1469 htmlParserInputPtr input;
1470
1471 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1472 if (input == NULL) {
1473 ctxt->errNo = XML_ERR_NO_MEMORY;
1474 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1475 ctxt->sax->error(ctxt->userData,
1476 "malloc: couldn't allocate a new input stream\n");
1477 ctxt->errNo = XML_ERR_NO_MEMORY;
1478 return(NULL);
1479 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001480 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001481 input->filename = NULL;
1482 input->directory = NULL;
1483 input->base = NULL;
1484 input->cur = NULL;
1485 input->buf = NULL;
1486 input->line = 1;
1487 input->col = 1;
1488 input->buf = NULL;
1489 input->free = NULL;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001490 input->version = NULL;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001491 input->consumed = 0;
1492 input->length = 0;
1493 return(input);
1494}
1495
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001496
1497/************************************************************************
1498 * *
1499 * Commodity functions, cleanup needed ? *
1500 * *
1501 ************************************************************************/
1502
1503/**
1504 * areBlanks:
1505 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001506 * @str: a xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001507 * @len: the size of @str
1508 *
1509 * Is this a sequence of blank chars that one can ignore ?
1510 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001511 * Returns 1 if ignorable 0 otherwise.
1512 */
1513
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001514static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001515 int i;
1516 xmlNodePtr lastChild;
1517
1518 for (i = 0;i < len;i++)
1519 if (!(IS_BLANK(str[i]))) return(0);
1520
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001521 if (CUR == 0) return(1);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001522 if (CUR != '<') return(0);
1523 if (ctxt->node == NULL) return(0);
1524 lastChild = xmlGetLastChild(ctxt->node);
1525 if (lastChild == NULL) {
1526 if (ctxt->node->content != NULL) return(0);
1527 } else if (xmlNodeIsText(lastChild))
1528 return(0);
1529 return(1);
1530}
1531
1532/**
1533 * htmlHandleEntity:
1534 * @ctxt: an HTML parser context
1535 * @entity: an XML entity pointer.
1536 *
1537 * Default handling of an HTML entity, call the parser with the
1538 * substitution string
1539 */
1540
1541void
1542htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1543 int len;
1544
1545 if (entity->content == NULL) {
1546 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1547 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1548 entity->name);
1549 ctxt->wellFormed = 0;
1550 return;
1551 }
1552 len = xmlStrlen(entity->content);
1553
1554 /*
1555 * Just handle the content as a set of chars.
1556 */
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001557 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001558 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1559 ctxt->sax->characters(ctxt->userData, entity->content, len);
1560
1561}
1562
1563/**
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001564 * htmlNewDocNoDtD:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001565 * @URI: URI for the dtd, or NULL
1566 * @ExternalID: the external ID of the DTD, or NULL
1567 *
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001568 * Returns a new document, do not intialize the DTD if not provided
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001569 */
1570htmlDocPtr
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001571htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001572 xmlDocPtr cur;
1573
1574 /*
1575 * Allocate a new document and fill the fields.
1576 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00001577 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001578 if (cur == NULL) {
1579 fprintf(stderr, "xmlNewDoc : malloc failed\n");
1580 return(NULL);
1581 }
Daniel Veillarde7a5a771999-08-30 13:05:42 +00001582 memset(cur, 0, sizeof(xmlDoc));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001583
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001584 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001585 cur->version = NULL;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001586 cur->intSubset = NULL;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001587 if ((ExternalID != NULL) ||
1588 (URI != NULL))
Daniel Veillard5cb5ab81999-12-21 15:35:29 +00001589 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Daniel Veillardbe803962000-06-28 23:40:59 +00001590 cur->doc = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001591 cur->name = NULL;
Daniel Veillardcf461992000-03-14 18:30:20 +00001592 cur->children = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001593 cur->extSubset = NULL;
1594 cur->oldNs = NULL;
1595 cur->encoding = NULL;
1596 cur->standalone = 1;
1597 cur->compression = 0;
Daniel Veillardc08a2c61999-09-08 21:35:25 +00001598 cur->ids = NULL;
1599 cur->refs = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001600#ifndef XML_WITHOUT_CORBA
1601 cur->_private = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001602#endif
1603 return(cur);
1604}
1605
Daniel Veillardb8f25c92000-08-19 19:52:36 +00001606/**
1607 * htmlNewDoc:
1608 * @URI: URI for the dtd, or NULL
1609 * @ExternalID: the external ID of the DTD, or NULL
1610 *
1611 * Returns a new document
1612 */
1613htmlDocPtr
1614htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1615 if ((URI == NULL) && (ExternalID == NULL))
1616 return(htmlNewDocNoDtD(
1617 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1618 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1619
1620 return(htmlNewDocNoDtD(URI, ExternalID));
1621}
1622
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001623
1624/************************************************************************
1625 * *
1626 * The parser itself *
1627 * Relates to http://www.w3.org/TR/html40 *
1628 * *
1629 ************************************************************************/
1630
1631/************************************************************************
1632 * *
1633 * The parser itself *
1634 * *
1635 ************************************************************************/
1636
1637/**
1638 * htmlParseHTMLName:
1639 * @ctxt: an HTML parser context
1640 *
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001641 * parse an HTML tag or attribute name, note that we convert it to lowercase
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001642 * since HTML names are not case-sensitive.
1643 *
1644 * Returns the Tag Name parsed or NULL
1645 */
1646
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001647xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001648htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001649 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001650 int i = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001651 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001652
1653 if (!IS_LETTER(CUR) && (CUR != '_') &&
1654 (CUR != ':')) return(NULL);
1655
Daniel Veillard5e5c6231999-12-29 12:49:06 +00001656 while ((i < HTML_PARSER_BUFFER_SIZE) &&
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001657 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1658 (CUR == ':') || (CUR == '_'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00001659 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001660 else loc[i] = CUR;
1661 i++;
1662
1663 NEXT;
1664 }
1665
1666 ret = xmlStrndup(loc, i);
1667
1668 return(ret);
1669}
1670
1671/**
1672 * htmlParseName:
1673 * @ctxt: an HTML parser context
1674 *
1675 * parse an HTML name, this routine is case sensistive.
1676 *
1677 * Returns the Name parsed or NULL
1678 */
1679
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001680xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001681htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001682 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001683 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001684
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001685 GROW;
1686 if (!IS_LETTER(CUR) && (CUR != '_')) {
1687 return(NULL);
1688 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001689
1690 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1691 (CUR == '.') || (CUR == '-') ||
1692 (CUR == '_') || (CUR == ':') ||
1693 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001694 (IS_EXTENDER(CUR))) {
1695 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001696 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001697 if (len >= HTML_MAX_NAMELEN) {
1698 fprintf(stderr,
1699 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1700 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1701 (CUR == '.') || (CUR == '-') ||
1702 (CUR == '_') || (CUR == ':') ||
1703 (IS_COMBINING(CUR)) ||
1704 (IS_EXTENDER(CUR)))
1705 NEXT;
1706 break;
1707 }
1708 }
1709 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001710}
1711
1712/**
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001713 * htmlParseHTMLAttribute:
1714 * @ctxt: an HTML parser context
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001715 * @stop: a char stop value
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001716 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001717 * parse an HTML attribute value till the stop (quote), if
1718 * stop is 0 then it stops at the first space
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001719 *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001720 * Returns the attribute parsed or NULL
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001721 */
1722
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001723xmlChar *
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001724htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001725#if 0
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001726 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001727 int len = 0;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001728
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001729 GROW;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001730 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1731 if ((stop == 0) && (IS_BLANK(CUR))) break;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001732 buf[len++] = CUR;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001733 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001734 if (len >= HTML_MAX_NAMELEN) {
1735 fprintf(stderr,
1736 "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1737 while ((!IS_BLANK(CUR)) && (CUR != '<') &&
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001738 (CUR != '>') &&
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001739 (CUR != '\'') && (CUR != '"'))
1740 NEXT;
1741 break;
1742 }
1743 }
1744 return(xmlStrndup(buf, len));
Daniel Veillard71b656e2000-01-05 14:46:17 +00001745#else
1746 xmlChar *buffer = NULL;
1747 int buffer_size = 0;
1748 xmlChar *out = NULL;
1749 xmlChar *name = NULL;
1750
1751 xmlChar *cur = NULL;
1752 htmlEntityDescPtr ent;
1753
1754 /*
1755 * allocate a translation buffer.
1756 */
1757 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1758 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1759 if (buffer == NULL) {
1760 perror("htmlParseHTMLAttribute: malloc failed");
1761 return(NULL);
1762 }
1763 out = buffer;
1764
1765 /*
1766 * Ok loop until we reach one of the ending chars
1767 */
1768 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1769 if ((stop == 0) && (IS_BLANK(CUR))) break;
1770 if (CUR == '&') {
1771 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001772 unsigned int c;
1773 int bits;
1774
1775 c = htmlParseCharRef(ctxt);
1776 if (c < 0x80)
1777 { *out++ = c; bits= -6; }
1778 else if (c < 0x800)
1779 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1780 else if (c < 0x10000)
1781 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1782 else
1783 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1784
1785 for ( ; bits >= 0; bits-= 6) {
1786 *out++ = ((c >> bits) & 0x3F) | 0x80;
1787 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001788 } else {
1789 ent = htmlParseEntityRef(ctxt, &name);
1790 if (name == NULL) {
1791 *out++ = '&';
1792 if (out - buffer > buffer_size - 100) {
1793 int index = out - buffer;
1794
1795 growBuffer(buffer);
1796 out = &buffer[index];
1797 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001798 } else if (ent == NULL) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00001799 *out++ = '&';
1800 cur = name;
1801 while (*cur != 0) {
1802 if (out - buffer > buffer_size - 100) {
1803 int index = out - buffer;
1804
1805 growBuffer(buffer);
1806 out = &buffer[index];
1807 }
1808 *out++ = *cur++;
1809 }
1810 xmlFree(name);
1811 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001812 unsigned int c;
1813 int bits;
1814
Daniel Veillard71b656e2000-01-05 14:46:17 +00001815 if (out - buffer > buffer_size - 100) {
1816 int index = out - buffer;
1817
1818 growBuffer(buffer);
1819 out = &buffer[index];
1820 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001821 c = (xmlChar)ent->value;
1822 if (c < 0x80)
1823 { *out++ = c; bits= -6; }
1824 else if (c < 0x800)
1825 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1826 else if (c < 0x10000)
1827 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1828 else
1829 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1830
1831 for ( ; bits >= 0; bits-= 6) {
1832 *out++ = ((c >> bits) & 0x3F) | 0x80;
1833 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00001834 xmlFree(name);
1835 }
1836 }
1837 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001838 unsigned int c;
1839 int bits;
1840
Daniel Veillard71b656e2000-01-05 14:46:17 +00001841 if (out - buffer > buffer_size - 100) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001842 int index = out - buffer;
1843
1844 growBuffer(buffer);
1845 out = &buffer[index];
1846 }
1847 c = CUR;
1848 if (c < 0x80)
1849 { *out++ = c; bits= -6; }
1850 else if (c < 0x800)
1851 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1852 else if (c < 0x10000)
1853 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1854 else
1855 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1856
1857 for ( ; bits >= 0; bits-= 6) {
1858 *out++ = ((c >> bits) & 0x3F) | 0x80;
Daniel Veillard71b656e2000-01-05 14:46:17 +00001859 }
1860 NEXT;
1861 }
1862 }
1863 *out++ = 0;
1864 return(buffer);
1865#endif
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001866}
1867
1868/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001869 * htmlParseNmtoken:
1870 * @ctxt: an HTML parser context
1871 *
1872 * parse an HTML Nmtoken.
1873 *
1874 * Returns the Nmtoken parsed or NULL
1875 */
1876
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001877xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001878htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001879 xmlChar buf[HTML_MAX_NAMELEN];
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001880 int len = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001881
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001882 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001883 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1884 (CUR == '.') || (CUR == '-') ||
1885 (CUR == '_') || (CUR == ':') ||
1886 (IS_COMBINING(CUR)) ||
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001887 (IS_EXTENDER(CUR))) {
1888 buf[len++] = CUR;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001889 NEXT;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001890 if (len >= HTML_MAX_NAMELEN) {
1891 fprintf(stderr,
1892 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1893 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1894 (CUR == '.') || (CUR == '-') ||
1895 (CUR == '_') || (CUR == ':') ||
1896 (IS_COMBINING(CUR)) ||
1897 (IS_EXTENDER(CUR)))
1898 NEXT;
1899 break;
1900 }
1901 }
1902 return(xmlStrndup(buf, len));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001903}
1904
1905/**
1906 * htmlParseEntityRef:
1907 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001908 * @str: location to store the entity name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001909 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001910 * parse an HTML ENTITY references
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001911 *
1912 * [68] EntityRef ::= '&' Name ';'
1913 *
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001914 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1915 * if non-NULL *str will have to be freed by the caller.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001916 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001917htmlEntityDescPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001918htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1919 xmlChar *name;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001920 htmlEntityDescPtr ent = NULL;
1921 *str = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001922
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001923 if (CUR == '&') {
1924 NEXT;
1925 name = htmlParseName(ctxt);
1926 if (name == NULL) {
1927 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1928 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1929 ctxt->wellFormed = 0;
1930 } else {
Daniel Veillarde2d034d1999-07-27 19:52:06 +00001931 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001932 if (CUR == ';') {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001933 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001934
1935 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001936 * Lookup the entity in the table.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001937 */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001938 ent = htmlEntityLookup(name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00001939 if (ent != NULL) /* OK that's ugly !!! */
1940 NEXT;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001941 } else {
1942 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1943 ctxt->sax->error(ctxt->userData,
1944 "htmlParseEntityRef: expecting ';'\n");
Daniel Veillard71b656e2000-01-05 14:46:17 +00001945 *str = name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001946 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001947 }
1948 }
1949 return(ent);
1950}
1951
1952/**
1953 * htmlParseAttValue:
1954 * @ctxt: an HTML parser context
1955 *
1956 * parse a value for an attribute
1957 * Note: the parser won't do substitution of entities here, this
1958 * will be handled later in xmlStringGetNodeList, unless it was
1959 * asked for ctxt->replaceEntities != 0
1960 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001961 * Returns the AttValue parsed or NULL.
1962 */
1963
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001964xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001965htmlParseAttValue(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00001966 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001967
1968 if (CUR == '"') {
1969 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001970 ret = htmlParseHTMLAttribute(ctxt, '"');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001971 if (CUR != '"') {
1972 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1973 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1974 ctxt->wellFormed = 0;
1975 } else
1976 NEXT;
1977 } else if (CUR == '\'') {
1978 NEXT;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001979 ret = htmlParseHTMLAttribute(ctxt, '\'');
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001980 if (CUR != '\'') {
1981 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1982 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1983 ctxt->wellFormed = 0;
1984 } else
1985 NEXT;
1986 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001987 /*
1988 * That's an HTMLism, the attribute value may not be quoted
1989 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00001990 ret = htmlParseHTMLAttribute(ctxt, 0);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00001991 if (ret == NULL) {
1992 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1993 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1994 ctxt->wellFormed = 0;
1995 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001996 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00001997 return(ret);
1998}
1999
2000/**
2001 * htmlParseSystemLiteral:
2002 * @ctxt: an HTML parser context
2003 *
2004 * parse an HTML Literal
2005 *
2006 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2007 *
2008 * Returns the SystemLiteral parsed or NULL
2009 */
2010
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002011xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002012htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002013 const xmlChar *q;
2014 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002015
2016 if (CUR == '"') {
2017 NEXT;
2018 q = CUR_PTR;
2019 while ((IS_CHAR(CUR)) && (CUR != '"'))
2020 NEXT;
2021 if (!IS_CHAR(CUR)) {
2022 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2023 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2024 ctxt->wellFormed = 0;
2025 } else {
2026 ret = xmlStrndup(q, CUR_PTR - q);
2027 NEXT;
2028 }
2029 } else if (CUR == '\'') {
2030 NEXT;
2031 q = CUR_PTR;
2032 while ((IS_CHAR(CUR)) && (CUR != '\''))
2033 NEXT;
2034 if (!IS_CHAR(CUR)) {
2035 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2036 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2037 ctxt->wellFormed = 0;
2038 } else {
2039 ret = xmlStrndup(q, CUR_PTR - q);
2040 NEXT;
2041 }
2042 } else {
2043 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillardcf461992000-03-14 18:30:20 +00002044 ctxt->sax->error(ctxt->userData,
2045 "SystemLiteral \" or ' expected\n");
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002046 ctxt->wellFormed = 0;
2047 }
2048
2049 return(ret);
2050}
2051
2052/**
2053 * htmlParsePubidLiteral:
2054 * @ctxt: an HTML parser context
2055 *
2056 * parse an HTML public literal
2057 *
2058 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2059 *
2060 * Returns the PubidLiteral parsed or NULL.
2061 */
2062
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002063xmlChar *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002064htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002065 const xmlChar *q;
2066 xmlChar *ret = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002067 /*
2068 * Name ::= (Letter | '_') (NameChar)*
2069 */
2070 if (CUR == '"') {
2071 NEXT;
2072 q = CUR_PTR;
2073 while (IS_PUBIDCHAR(CUR)) NEXT;
2074 if (CUR != '"') {
2075 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2076 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2077 ctxt->wellFormed = 0;
2078 } else {
2079 ret = xmlStrndup(q, CUR_PTR - q);
2080 NEXT;
2081 }
2082 } else if (CUR == '\'') {
2083 NEXT;
2084 q = CUR_PTR;
2085 while ((IS_LETTER(CUR)) && (CUR != '\''))
2086 NEXT;
2087 if (!IS_LETTER(CUR)) {
2088 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2089 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2090 ctxt->wellFormed = 0;
2091 } else {
2092 ret = xmlStrndup(q, CUR_PTR - q);
2093 NEXT;
2094 }
2095 } else {
2096 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2097 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2098 ctxt->wellFormed = 0;
2099 }
2100
2101 return(ret);
2102}
2103
2104/**
2105 * htmlParseCharData:
2106 * @ctxt: an HTML parser context
2107 * @cdata: int indicating whether we are within a CDATA section
2108 *
2109 * parse a CharData section.
2110 * if we are within a CDATA section ']]>' marks an end of section.
2111 *
2112 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2113 */
2114
2115void
2116htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002117 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2118 int nbchar = 0;
2119 int cur, l;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002120
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002121 SHRINK;
2122 cur = CUR_CHAR(l);
2123 while (((cur != '<') || (ctxt->token == '<')) &&
2124 ((cur != '&') || (ctxt->token == '&')) &&
2125 (IS_CHAR(cur))) {
2126 COPY_BUF(l,buf,nbchar,cur);
2127 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2128 /*
2129 * Ok the segment is to be consumed as chars.
2130 */
2131 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2132 if (areBlanks(ctxt, buf, nbchar)) {
2133 if (ctxt->sax->ignorableWhitespace != NULL)
2134 ctxt->sax->ignorableWhitespace(ctxt->userData,
2135 buf, nbchar);
2136 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002137 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002138 if (ctxt->sax->characters != NULL)
2139 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2140 }
2141 }
2142 nbchar = 0;
2143 }
2144 NEXTL(l);
2145 cur = CUR_CHAR(l);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002146 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002147 if (nbchar != 0) {
2148 /*
2149 * Ok the segment is to be consumed as chars.
2150 */
2151 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2152 if (areBlanks(ctxt, buf, nbchar)) {
2153 if (ctxt->sax->ignorableWhitespace != NULL)
2154 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2155 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002156 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002157 if (ctxt->sax->characters != NULL)
2158 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002159 }
2160 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002161 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002162}
2163
2164/**
2165 * htmlParseExternalID:
2166 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002167 * @publicID: a xmlChar** receiving PubidLiteral
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002168 * @strict: indicate whether we should restrict parsing to only
2169 * production [75], see NOTE below
2170 *
2171 * Parse an External ID or a Public ID
2172 *
2173 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2174 * 'PUBLIC' S PubidLiteral S SystemLiteral
2175 *
2176 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2177 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2178 *
2179 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2180 *
2181 * Returns the function returns SystemLiteral and in the second
2182 * case publicID receives PubidLiteral, is strict is off
2183 * it is possible to return NULL and have publicID set.
2184 */
2185
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002186xmlChar *
2187htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2188 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002189
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002190 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2191 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2192 (UPP(4) == 'E') && (UPP(5) == 'M')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002193 SKIP(6);
2194 if (!IS_BLANK(CUR)) {
2195 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2196 ctxt->sax->error(ctxt->userData,
2197 "Space required after 'SYSTEM'\n");
2198 ctxt->wellFormed = 0;
2199 }
2200 SKIP_BLANKS;
2201 URI = htmlParseSystemLiteral(ctxt);
2202 if (URI == NULL) {
2203 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2204 ctxt->sax->error(ctxt->userData,
2205 "htmlParseExternalID: SYSTEM, no URI\n");
2206 ctxt->wellFormed = 0;
2207 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002208 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2209 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2210 (UPP(4) == 'I') && (UPP(5) == 'C')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002211 SKIP(6);
2212 if (!IS_BLANK(CUR)) {
2213 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2214 ctxt->sax->error(ctxt->userData,
2215 "Space required after 'PUBLIC'\n");
2216 ctxt->wellFormed = 0;
2217 }
2218 SKIP_BLANKS;
2219 *publicID = htmlParsePubidLiteral(ctxt);
2220 if (*publicID == NULL) {
2221 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2222 ctxt->sax->error(ctxt->userData,
2223 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2224 ctxt->wellFormed = 0;
2225 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002226 SKIP_BLANKS;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002227 if ((CUR == '"') || (CUR == '\'')) {
2228 URI = htmlParseSystemLiteral(ctxt);
2229 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002230 }
2231 return(URI);
2232}
2233
2234/**
2235 * htmlParseComment:
2236 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002237 *
2238 * Parse an XML (SGML) comment <!-- .... -->
2239 *
2240 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2241 */
2242void
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002243htmlParseComment(htmlParserCtxtPtr ctxt) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002244 xmlChar *buf = NULL;
Daniel Veillard87b95392000-08-12 21:12:04 +00002245 int len;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002246 int size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard87b95392000-08-12 21:12:04 +00002247 int q, ql;
2248 int r, rl;
2249 int cur, l;
2250 xmlParserInputState state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002251
2252 /*
2253 * Check that there is a comment right here.
2254 */
Daniel Veillard87b95392000-08-12 21:12:04 +00002255 if ((RAW != '<') || (NXT(1) != '!') ||
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002256 (NXT(2) != '-') || (NXT(3) != '-')) return;
2257
Daniel Veillard87b95392000-08-12 21:12:04 +00002258 state = ctxt->instate;
2259 ctxt->instate = XML_PARSER_COMMENT;
2260 SHRINK;
2261 SKIP(4);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002262 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2263 if (buf == NULL) {
2264 fprintf(stderr, "malloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002265 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002266 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002267 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002268 q = CUR_CHAR(ql);
2269 NEXTL(ql);
2270 r = CUR_CHAR(rl);
2271 NEXTL(rl);
2272 cur = CUR_CHAR(l);
2273 len = 0;
2274 while (IS_CHAR(cur) &&
2275 ((cur != '>') ||
2276 (r != '-') || (q != '-'))) {
2277 if (len + 5 >= size) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002278 size *= 2;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002279 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002280 if (buf == NULL) {
2281 fprintf(stderr, "realloc of %d byte failed\n", size);
Daniel Veillard87b95392000-08-12 21:12:04 +00002282 ctxt->instate = state;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002283 return;
2284 }
2285 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002286 COPY_BUF(ql,buf,len,q);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002287 q = r;
Daniel Veillard87b95392000-08-12 21:12:04 +00002288 ql = rl;
2289 r = cur;
2290 rl = l;
2291 NEXTL(l);
2292 cur = CUR_CHAR(l);
2293 if (cur == 0) {
2294 SHRINK;
2295 GROW;
2296 cur = CUR_CHAR(l);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002297 }
2298 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002299 buf[len] = 0;
2300 if (!IS_CHAR(cur)) {
2301 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2302 ctxt->sax->error(ctxt->userData,
2303 "Comment not terminated \n<!--%.50s\n", buf);
2304 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2305 ctxt->wellFormed = 0;
2306 xmlFree(buf);
2307 } else {
2308 NEXT;
2309 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2310 (!ctxt->disableSAX))
2311 ctxt->sax->comment(ctxt->userData, buf);
2312 xmlFree(buf);
2313 }
2314 ctxt->instate = state;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002315}
2316
2317/**
2318 * htmlParseCharRef:
2319 * @ctxt: an HTML parser context
2320 *
2321 * parse Reference declarations
2322 *
2323 * [66] CharRef ::= '&#' [0-9]+ ';' |
2324 * '&#x' [0-9a-fA-F]+ ';'
2325 *
2326 * Returns the value parsed (as an int)
2327 */
2328int
2329htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2330 int val = 0;
2331
2332 if ((CUR == '&') && (NXT(1) == '#') &&
2333 (NXT(2) == 'x')) {
2334 SKIP(3);
2335 while (CUR != ';') {
2336 if ((CUR >= '0') && (CUR <= '9'))
2337 val = val * 16 + (CUR - '0');
2338 else if ((CUR >= 'a') && (CUR <= 'f'))
2339 val = val * 16 + (CUR - 'a') + 10;
2340 else if ((CUR >= 'A') && (CUR <= 'F'))
2341 val = val * 16 + (CUR - 'A') + 10;
2342 else {
2343 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2344 ctxt->sax->error(ctxt->userData,
2345 "htmlParseCharRef: invalid hexadecimal value\n");
2346 ctxt->wellFormed = 0;
2347 val = 0;
2348 break;
2349 }
2350 NEXT;
2351 }
2352 if (CUR == ';')
2353 NEXT;
2354 } else if ((CUR == '&') && (NXT(1) == '#')) {
2355 SKIP(2);
2356 while (CUR != ';') {
2357 if ((CUR >= '0') && (CUR <= '9'))
2358 val = val * 10 + (CUR - '0');
2359 else {
2360 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2361 ctxt->sax->error(ctxt->userData,
2362 "htmlParseCharRef: invalid decimal value\n");
2363 ctxt->wellFormed = 0;
2364 val = 0;
2365 break;
2366 }
2367 NEXT;
2368 }
2369 if (CUR == ';')
2370 NEXT;
2371 } else {
2372 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2373 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2374 ctxt->wellFormed = 0;
2375 }
2376 /*
2377 * Check the value IS_CHAR ...
2378 */
2379 if (IS_CHAR(val)) {
2380 return(val);
2381 } else {
2382 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002383 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002384 val);
2385 ctxt->wellFormed = 0;
2386 }
2387 return(0);
2388}
2389
2390
2391/**
2392 * htmlParseDocTypeDecl :
2393 * @ctxt: an HTML parser context
2394 *
2395 * parse a DOCTYPE declaration
2396 *
2397 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2398 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2399 */
2400
2401void
2402htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002403 xmlChar *name;
2404 xmlChar *ExternalID = NULL;
2405 xmlChar *URI = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002406
2407 /*
2408 * We know that '<!DOCTYPE' has been detected.
2409 */
2410 SKIP(9);
2411
2412 SKIP_BLANKS;
2413
2414 /*
2415 * Parse the DOCTYPE name.
2416 */
2417 name = htmlParseName(ctxt);
2418 if (name == NULL) {
2419 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2420 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2421 ctxt->wellFormed = 0;
2422 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002423 /*
2424 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2425 */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002426
2427 SKIP_BLANKS;
2428
2429 /*
2430 * Check for SystemID and ExternalID
2431 */
Daniel Veillarde2d034d1999-07-27 19:52:06 +00002432 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002433 SKIP_BLANKS;
2434
2435 /*
2436 * We should be at the end of the DOCTYPE declaration.
2437 */
2438 if (CUR != '>') {
2439 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2440 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2441 ctxt->wellFormed = 0;
2442 /* We shouldn't try to resynchronize ... */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002443 }
2444 NEXT;
2445
2446 /*
Daniel Veillardd83eb822000-06-30 18:39:56 +00002447 * Create or update the document accordingly to the DOCTYPE
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002448 */
Daniel Veillardd83eb822000-06-30 18:39:56 +00002449 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2450 (!ctxt->disableSAX))
2451 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002452
2453 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002454 * Cleanup, since we don't use all those identifiers
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002455 */
Daniel Veillard6454aec1999-09-02 22:04:43 +00002456 if (URI != NULL) xmlFree(URI);
2457 if (ExternalID != NULL) xmlFree(ExternalID);
2458 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002459}
2460
2461/**
2462 * htmlParseAttribute:
2463 * @ctxt: an HTML parser context
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002464 * @value: a xmlChar ** used to store the value of the attribute
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002465 *
2466 * parse an attribute
2467 *
2468 * [41] Attribute ::= Name Eq AttValue
2469 *
2470 * [25] Eq ::= S? '=' S?
2471 *
2472 * With namespace:
2473 *
2474 * [NS 11] Attribute ::= QName Eq AttValue
2475 *
2476 * Also the case QName == xmlns:??? is handled independently as a namespace
2477 * definition.
2478 *
2479 * Returns the attribute name, and the value in *value.
2480 */
2481
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002482xmlChar *
2483htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002484 xmlChar *name, *val = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002485
2486 *value = NULL;
2487 name = htmlParseName(ctxt);
2488 if (name == NULL) {
2489 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2490 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2491 ctxt->wellFormed = 0;
2492 return(NULL);
2493 }
2494
2495 /*
2496 * read the value
2497 */
2498 SKIP_BLANKS;
2499 if (CUR == '=') {
2500 NEXT;
2501 SKIP_BLANKS;
2502 val = htmlParseAttValue(ctxt);
Daniel Veillardbe803962000-06-28 23:40:59 +00002503 /******
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002504 } else {
Daniel Veillardbe803962000-06-28 23:40:59 +00002505 * TODO : some attribute must have values, some may not
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002506 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002507 ctxt->sax->warning(ctxt->userData,
Daniel Veillardbe803962000-06-28 23:40:59 +00002508 "No value for attribute %s\n", name); */
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002509 }
2510
2511 *value = val;
2512 return(name);
2513}
2514
2515/**
Daniel Veillard365e13b2000-07-02 07:56:37 +00002516 * htmlCheckEncoding:
2517 * @ctxt: an HTML parser context
2518 * @attvalue: the attribute value
2519 *
2520 * Checks an http-equiv attribute from a Meta tag to detect
2521 * the encoding
2522 * If a new encoding is detected the parser is switched to decode
2523 * it and pass UTF8
2524 */
2525void
2526htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2527 const xmlChar *encoding;
2528
2529 if ((ctxt == NULL) || (attvalue == NULL))
2530 return;
2531
Daniel Veillard365e13b2000-07-02 07:56:37 +00002532 encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
2533 if (encoding == NULL)
2534 encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
2535 if (encoding == NULL)
2536 encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
2537 if (encoding != NULL) {
2538 encoding += 8;
2539 } else {
2540 encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
2541 if (encoding == NULL)
2542 encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
2543 if (encoding == NULL)
2544 encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
2545 if (encoding != NULL)
2546 encoding += 9;
2547 }
2548 if (encoding != NULL) {
2549 xmlCharEncoding enc;
2550 xmlCharEncodingHandlerPtr handler;
2551
2552 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2553
2554 if (ctxt->input->encoding != NULL)
2555 xmlFree((xmlChar *) ctxt->input->encoding);
2556 ctxt->input->encoding = xmlStrdup(encoding);
2557
2558 enc = xmlParseCharEncoding((const char *) encoding);
2559 /*
2560 * registered set of known encodings
2561 */
2562 if (enc != XML_CHAR_ENCODING_ERROR) {
2563 xmlSwitchEncoding(ctxt, enc);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002564 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002565 } else {
2566 /*
2567 * fallback for unknown encodings
2568 */
2569 handler = xmlFindCharEncodingHandler((const char *) encoding);
2570 if (handler != NULL) {
2571 xmlSwitchToEncoding(ctxt, handler);
Daniel Veillard87b95392000-08-12 21:12:04 +00002572 ctxt->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002573 } else {
2574 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2575 }
2576 }
Daniel Veillard87b95392000-08-12 21:12:04 +00002577
2578 if ((ctxt->input->buf != NULL) &&
2579 (ctxt->input->buf->encoder != NULL) &&
2580 (ctxt->input->buf->raw != NULL) &&
2581 (ctxt->input->buf->buffer != NULL)) {
2582 int nbchars;
2583 int processed;
2584
2585 /*
2586 * convert as much as possible to the parser reading buffer.
2587 */
2588 processed = ctxt->input->cur - ctxt->input->base;
2589 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2590 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2591 ctxt->input->buf->buffer,
2592 ctxt->input->buf->raw);
2593 if (nbchars < 0) {
2594 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2595 ctxt->sax->error(ctxt->userData,
2596 "htmlCheckEncoding: encoder error\n");
2597 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2598 }
2599 ctxt->input->base =
2600 ctxt->input->cur = ctxt->input->buf->buffer->content;
2601 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002602 }
2603}
2604
2605/**
2606 * htmlCheckMeta:
2607 * @ctxt: an HTML parser context
2608 * @atts: the attributes values
2609 *
2610 * Checks an attributes from a Meta tag
2611 */
2612void
2613htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2614 int i;
2615 const xmlChar *att, *value;
2616 int http = 0;
2617 const xmlChar *content = NULL;
2618
2619 if ((ctxt == NULL) || (atts == NULL))
2620 return;
2621
2622 i = 0;
2623 att = atts[i++];
2624 while (att != NULL) {
2625 value = atts[i++];
2626 if ((value != NULL) &&
2627 ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
2628 (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
2629 (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
2630 ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
2631 (!xmlStrcmp(value, BAD_CAST"content-type")) ||
2632 (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
2633 http = 1;
2634 else if ((value != NULL) &&
2635 ((!xmlStrcmp(att, BAD_CAST"content")) ||
2636 (!xmlStrcmp(att, BAD_CAST"Content")) ||
2637 (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
2638 content = value;
2639 att = atts[i++];
2640 }
2641 if ((http) && (content != NULL))
2642 htmlCheckEncoding(ctxt, content);
2643
2644}
2645
2646/**
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002647 * htmlParseStartTag:
2648 * @ctxt: an HTML parser context
2649 *
2650 * parse a start of tag either for rule element or
2651 * EmptyElement. In both case we don't parse the tag closing chars.
2652 *
2653 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2654 *
2655 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2656 *
2657 * With namespace:
2658 *
2659 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2660 *
2661 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2662 *
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002663 */
2664
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002665void
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002666htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002667 xmlChar *name;
2668 xmlChar *attname;
2669 xmlChar *attvalue;
2670 const xmlChar **atts = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002671 int nbatts = 0;
2672 int maxatts = 0;
Daniel Veillard365e13b2000-07-02 07:56:37 +00002673 int meta = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002674 int i;
2675
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002676 if (CUR != '<') return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002677 NEXT;
2678
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002679 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002680 name = htmlParseHTMLName(ctxt);
2681 if (name == NULL) {
2682 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2683 ctxt->sax->error(ctxt->userData,
2684 "htmlParseStartTag: invalid element name\n");
2685 ctxt->wellFormed = 0;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002686 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002687 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00002688 if (!xmlStrcmp(name, BAD_CAST"meta"))
2689 meta = 1;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002690
2691 /*
2692 * Check for auto-closure of HTML elements.
2693 */
2694 htmlAutoClose(ctxt, name);
2695
2696 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00002697 * Check for implied HTML elements.
2698 */
2699 htmlCheckImplied(ctxt, name);
2700
2701 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002702 * Now parse the attributes, it ends up with the ending
2703 *
2704 * (S Attribute)* S?
2705 */
2706 SKIP_BLANKS;
2707 while ((IS_CHAR(CUR)) &&
2708 (CUR != '>') &&
2709 ((CUR != '/') || (NXT(1) != '>'))) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002710 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002711
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002712 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002713 attname = htmlParseAttribute(ctxt, &attvalue);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002714 if (attname != NULL) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00002715
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002716 /*
2717 * Well formedness requires at most one declaration of an attribute
2718 */
2719 for (i = 0; i < nbatts;i += 2) {
2720 if (!xmlStrcmp(atts[i], attname)) {
2721 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002722 ctxt->sax->error(ctxt->userData,
2723 "Attribute %s redefined\n",
2724 attname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002725 ctxt->wellFormed = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00002726 xmlFree(attname);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002727 if (attvalue != NULL)
2728 xmlFree(attvalue);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002729 goto failed;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002730 }
2731 }
2732
2733 /*
2734 * Add the pair to atts
2735 */
2736 if (atts == NULL) {
2737 maxatts = 10;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002738 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002739 if (atts == NULL) {
2740 fprintf(stderr, "malloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002741 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002742 if (name != NULL) xmlFree(name);
2743 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002744 }
Daniel Veillard51e3b151999-11-12 17:02:31 +00002745 } else if (nbatts + 4 > maxatts) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002746 maxatts *= 2;
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002747 atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002748 if (atts == NULL) {
2749 fprintf(stderr, "realloc of %ld byte failed\n",
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002750 maxatts * (long)sizeof(xmlChar *));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002751 if (name != NULL) xmlFree(name);
2752 return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002753 }
2754 }
2755 atts[nbatts++] = attname;
2756 atts[nbatts++] = attvalue;
2757 atts[nbatts] = NULL;
2758 atts[nbatts + 1] = NULL;
2759 }
2760
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002761failed:
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002762 SKIP_BLANKS;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002763 if (cons == ctxt->nbChars) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002764 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2765 ctxt->sax->error(ctxt->userData,
2766 "htmlParseStartTag: problem parsing attributes\n");
2767 ctxt->wellFormed = 0;
2768 break;
2769 }
2770 }
2771
2772 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00002773 * Handle specific association to the META tag
2774 */
2775 if (meta)
2776 htmlCheckMeta(ctxt, atts);
2777
2778 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002779 * SAX: Start of Element !
2780 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002781 htmlnamePush(ctxt, xmlStrdup(name));
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002782#ifdef DEBUG
2783 fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2784#endif
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002785 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2786 ctxt->sax->startElement(ctxt->userData, name, atts);
2787
2788 if (atts != NULL) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00002789 for (i = 0;i < nbatts;i++) {
2790 if (atts[i] != NULL)
2791 xmlFree((xmlChar *) atts[i]);
2792 }
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00002793 xmlFree((void *) atts);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002794 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002795 if (name != NULL) xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002796}
2797
2798/**
2799 * htmlParseEndTag:
2800 * @ctxt: an HTML parser context
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002801 *
2802 * parse an end of tag
2803 *
2804 * [42] ETag ::= '</' Name S? '>'
2805 *
2806 * With namespace
2807 *
2808 * [NS 9] ETag ::= '</' QName S? '>'
2809 */
2810
2811void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002812htmlParseEndTag(htmlParserCtxtPtr ctxt) {
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002813 xmlChar *name;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002814 xmlChar *oldname;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002815 int i;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002816
2817 if ((CUR != '<') || (NXT(1) != '/')) {
2818 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2819 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2820 ctxt->wellFormed = 0;
2821 return;
2822 }
2823 SKIP(2);
2824
2825 name = htmlParseHTMLName(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002826 if (name == NULL) return;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002827
2828 /*
2829 * We should definitely be at the ending "S? '>'" part
2830 */
2831 SKIP_BLANKS;
2832 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2833 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2834 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2835 ctxt->wellFormed = 0;
2836 } else
2837 NEXT;
2838
2839 /*
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002840 * If the name read is not one of the element in the parsing stack
2841 * then return, it's just an error.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002842 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002843 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2844 if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002845 }
2846 if (i < 0) {
2847 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002848 ctxt->sax->error(ctxt->userData,
2849 "Unexpected end tag : %s\n", name);
Daniel Veillard6454aec1999-09-02 22:04:43 +00002850 xmlFree(name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002851 ctxt->wellFormed = 0;
2852 return;
2853 }
2854
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002855
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002856 /*
2857 * Check for auto-closure of HTML elements.
2858 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002859
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002860 htmlAutoCloseOnClose(ctxt, name);
2861
2862 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002863 * Well formedness constraints, opening and closing must match.
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002864 * With the exception that the autoclose may have popped stuff out
2865 * of the stack.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002866 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002867 if (xmlStrcmp(name, ctxt->name)) {
2868#ifdef DEBUG
2869 fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2870#endif
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002871 if ((ctxt->name != NULL) &&
2872 (xmlStrcmp(ctxt->name, name))) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002873 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2874 ctxt->sax->error(ctxt->userData,
2875 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002876 name, ctxt->name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002877 ctxt->wellFormed = 0;
2878 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002879 }
2880
2881 /*
2882 * SAX: End of Tag
2883 */
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002884 oldname = ctxt->name;
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002885 if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002886 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2887 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00002888 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002889 if (oldname != NULL) {
2890#ifdef DEBUG
2891 fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2892#endif
2893 xmlFree(oldname);
2894#ifdef DEBUG
2895 } else {
2896 fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2897#endif
2898 }
2899 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002900
2901 if (name != NULL)
Daniel Veillard6454aec1999-09-02 22:04:43 +00002902 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002903
2904 return;
2905}
2906
2907
2908/**
2909 * htmlParseReference:
2910 * @ctxt: an HTML parser context
2911 *
2912 * parse and handle entity references in content,
2913 * this will end-up in a call to character() since this is either a
2914 * CharRef, or a predefined entity.
2915 */
2916void
2917htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002918 htmlEntityDescPtr ent;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002919 xmlChar out[6];
Daniel Veillarddd6b3671999-09-23 22:19:22 +00002920 xmlChar *name;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002921 if (CUR != '&') return;
2922
2923 if (NXT(1) == '#') {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002924 unsigned int c;
2925 int bits, i = 0;
2926
2927 c = htmlParseCharRef(ctxt);
2928 if (c < 0x80) { out[i++]= c; bits= -6; }
2929 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2930 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2931 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
2932
2933 for ( ; bits >= 0; bits-= 6) {
2934 out[i++]= ((c >> bits) & 0x3F) | 0x80;
2935 }
2936 out[i] = 0;
2937
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002938 htmlCheckParagraph(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002939 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002940 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002941 } else {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002942 ent = htmlParseEntityRef(ctxt, &name);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002943 if (name == NULL) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002944 htmlCheckParagraph(ctxt);
Daniel Veillard1255ab72000-08-14 15:13:33 +00002945 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2946 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard71b656e2000-01-05 14:46:17 +00002947 return;
2948 }
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002949 if ((ent == NULL) || (ent->value <= 0)) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002950 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002951 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
Daniel Veillardb96e6431999-08-29 21:02:19 +00002952 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002953 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
Daniel Veillard71b656e2000-01-05 14:46:17 +00002954 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002955 }
2956 } else {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002957 unsigned int c;
2958 int bits, i = 0;
2959
2960 c = ent->value;
2961 if (c < 0x80)
2962 { out[i++]= c; bits= -6; }
2963 else if (c < 0x800)
2964 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2965 else if (c < 0x10000)
2966 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2967 else
2968 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
2969
2970 for ( ; bits >= 0; bits-= 6) {
2971 out[i++]= ((c >> bits) & 0x3F) | 0x80;
2972 }
2973 out[i] = 0;
2974
Daniel Veillardb8f25c92000-08-19 19:52:36 +00002975 htmlCheckParagraph(ctxt);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002976 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002977 ctxt->sax->characters(ctxt->userData, out, i);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002978 }
Daniel Veillard6454aec1999-09-02 22:04:43 +00002979 xmlFree(name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002980 }
2981}
2982
2983/**
2984 * htmlParseContent:
2985 * @ctxt: an HTML parser context
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002986 * @name: the node name
Daniel Veillardbe70ff71999-07-05 16:50:46 +00002987 *
2988 * Parse a content: comment, sub-element, reference or text.
2989 *
2990 */
2991
2992void
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002993htmlParseContent(htmlParserCtxtPtr ctxt) {
Daniel Veillard2673d3c1999-10-08 14:37:09 +00002994 xmlChar *currentNode;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002995 int depth;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00002996
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00002997 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00002998 depth = ctxt->nameNr;
2999 while (1) {
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003000 long cons = ctxt->nbChars;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003001
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003002 GROW;
3003 /*
3004 * Our tag or one of it's parent or children is ending.
3005 */
3006 if ((CUR == '<') && (NXT(1) == '/')) {
3007 htmlParseEndTag(ctxt);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003008 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003009 return;
3010 }
3011
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003012 /*
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003013 * Has this node been popped out during parsing of
3014 * the next element
3015 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003016 if ((xmlStrcmp(currentNode, ctxt->name)) &&
3017 (depth >= ctxt->nameNr)) {
3018 if (currentNode != NULL) xmlFree(currentNode);
3019 return;
3020 }
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003021
3022 /*
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003023 * Sometimes DOCTYPE arrives in the middle of the document
3024 */
3025 if ((CUR == '<') && (NXT(1) == '!') &&
3026 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3027 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3028 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3029 (UPP(8) == 'E')) {
3030 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3031 ctxt->sax->error(ctxt->userData,
3032 "Misplaced DOCTYPE declaration\n");
3033 ctxt->wellFormed = 0;
3034 htmlParseDocTypeDecl(ctxt);
3035 }
3036
3037 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003038 * First case : a comment
3039 */
3040 if ((CUR == '<') && (NXT(1) == '!') &&
3041 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003042 htmlParseComment(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003043 }
3044
3045 /*
3046 * Second case : a sub-element.
3047 */
3048 else if (CUR == '<') {
3049 htmlParseElement(ctxt);
3050 }
3051
3052 /*
3053 * Third case : a reference. If if has not been resolved,
3054 * parsing returns it's Name, create the node
3055 */
3056 else if (CUR == '&') {
3057 htmlParseReference(ctxt);
3058 }
3059
3060 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003061 * Fourth : end of the resource
3062 */
3063 else if (CUR == 0) {
3064 htmlAutoClose(ctxt, NULL);
3065 }
3066
3067 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003068 * Last case, text. Note that References are handled directly.
3069 */
3070 else {
3071 htmlParseCharData(ctxt, 0);
3072 }
3073
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003074 if (cons == ctxt->nbChars) {
Daniel Veillard35008381999-10-25 13:15:52 +00003075 if (ctxt->node != NULL) {
3076 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3077 ctxt->sax->error(ctxt->userData,
3078 "detected an error in element content\n");
3079 ctxt->wellFormed = 0;
3080 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003081 break;
3082 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003083
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003084 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003085 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003086 if (currentNode != NULL) xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003087}
3088
3089/**
3090 * htmlParseElement:
3091 * @ctxt: an HTML parser context
3092 *
3093 * parse an HTML element, this is highly recursive
3094 *
3095 * [39] element ::= EmptyElemTag | STag content ETag
3096 *
3097 * [41] Attribute ::= Name Eq AttValue
3098 */
3099
3100void
3101htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003102 xmlChar *name;
Daniel Veillard7d2c2761999-10-11 15:09:51 +00003103 xmlChar *currentNode = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003104 htmlElemDescPtr info;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003105 htmlParserNodeInfo node_info;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003106 xmlChar *oldname;
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003107 int depth = ctxt->nameNr;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003108
3109 /* Capture start position */
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003110 if (ctxt->record_info) {
3111 node_info.begin_pos = ctxt->input->consumed +
3112 (CUR_PTR - ctxt->input->base);
3113 node_info.begin_line = ctxt->input->line;
3114 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003115
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003116 oldname = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003117 htmlParseStartTag(ctxt);
3118 name = ctxt->name;
3119#ifdef DEBUG
3120 if (oldname == NULL)
3121 fprintf(stderr, "Start of element %s\n", name);
3122 else if (name == NULL)
3123 fprintf(stderr, "Start of element failed, was %s\n", oldname);
3124 else
3125 fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
3126#endif
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003127 if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003128 (name == NULL)) {
3129 if (CUR == '>')
3130 NEXT;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003131 if (oldname != NULL)
3132 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003133 return;
3134 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003135 if (oldname != NULL)
3136 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003137
3138 /*
3139 * Lookup the info for that element.
3140 */
3141 info = htmlTagLookup(name);
3142 if (info == NULL) {
3143 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3144 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3145 name);
3146 ctxt->wellFormed = 0;
3147 } else if (info->depr) {
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003148/***************************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003149 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3150 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3151 name);
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003152 ***************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003153 }
3154
3155 /*
3156 * Check for an Empty Element labelled the XML/SGML way
3157 */
3158 if ((CUR == '/') && (NXT(1) == '>')) {
3159 SKIP(2);
3160 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3161 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003162 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003163#ifdef DEBUG
3164 fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
3165#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003166 if (oldname != NULL)
3167 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003168 return;
3169 }
3170
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003171 if (CUR == '>') {
3172 NEXT;
3173 } else {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003174 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard87b95392000-08-12 21:12:04 +00003175 ctxt->sax->error(ctxt->userData,
3176 "Couldn't find end of Start Tag %s\n",
3177 name);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003178 ctxt->wellFormed = 0;
3179
3180 /*
3181 * end of parsing of this node.
3182 */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003183 if (!xmlStrcmp(name, ctxt->name)) {
3184 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003185 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003186#ifdef DEBUG
3187 fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
3188#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003189 if (oldname != NULL)
3190 xmlFree(oldname);
3191 }
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003192
3193 /*
3194 * Capture end position and add node
3195 */
3196 if ( currentNode != NULL && ctxt->record_info ) {
3197 node_info.end_pos = ctxt->input->consumed +
3198 (CUR_PTR - ctxt->input->base);
3199 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003200 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003201 xmlParserAddNodeInfo(ctxt, &node_info);
3202 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003203 return;
3204 }
3205
3206 /*
3207 * Check for an Empty Element from DTD definition
3208 */
3209 if ((info != NULL) && (info->empty)) {
3210 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3211 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003212 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003213#ifdef DEBUG
3214 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3215#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003216 if (oldname != NULL)
3217 xmlFree(oldname);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003218 return;
3219 }
3220
3221 /*
3222 * Parse the content of the element:
3223 */
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003224 currentNode = xmlStrdup(ctxt->name);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003225 depth = ctxt->nameNr;
3226 while (IS_CHAR(CUR)) {
3227 htmlParseContent(ctxt);
3228 if (ctxt->nameNr < depth) break;
3229 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003230
3231 if (!IS_CHAR(CUR)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003232 /************
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003233 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3234 ctxt->sax->error(ctxt->userData,
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003235 "Premature end of data in tag %s\n", currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003236 ctxt->wellFormed = 0;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003237 *************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003238
3239 /*
3240 * end of parsing of this node.
3241 */
3242 nodePop(ctxt);
Daniel Veillard4c3a2031999-11-19 17:46:26 +00003243 oldname = htmlnamePop(ctxt);
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003244#ifdef DEBUG
3245 fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
3246#endif
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003247 if (oldname != NULL)
3248 xmlFree(oldname);
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003249 if (currentNode != NULL)
3250 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003251 return;
3252 }
3253
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003254 /*
3255 * Capture end position and add node
3256 */
3257 if ( currentNode != NULL && ctxt->record_info ) {
3258 node_info.end_pos = ctxt->input->consumed +
3259 (CUR_PTR - ctxt->input->base);
3260 node_info.end_line = ctxt->input->line;
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003261 node_info.node = ctxt->node;
Daniel Veillard1ff7ae31999-09-01 12:19:13 +00003262 xmlParserAddNodeInfo(ctxt, &node_info);
3263 }
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003264 if (currentNode != NULL)
3265 xmlFree(currentNode);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003266}
3267
3268/**
3269 * htmlParseDocument :
3270 * @ctxt: an HTML parser context
3271 *
3272 * parse an HTML document (and build a tree if using the standard SAX
3273 * interface).
3274 *
3275 * Returns 0, -1 in case of error. the parser context is augmented
3276 * as a result of the parsing.
3277 */
3278
3279int
3280htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003281 xmlDtdPtr dtd;
3282
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003283 htmlDefaultSAXHandlerInit();
3284 ctxt->html = 1;
3285
Daniel Veillarde2d034d1999-07-27 19:52:06 +00003286 GROW;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003287 /*
Daniel Veillardb96e6431999-08-29 21:02:19 +00003288 * SAX: beginning of the document processing.
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003289 */
3290 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3291 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3292
3293 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003294 * Wipe out everything which is before the first '<'
3295 */
Daniel Veillard35008381999-10-25 13:15:52 +00003296 SKIP_BLANKS;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003297 if (CUR == 0) {
3298 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3299 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3300 ctxt->wellFormed = 0;
3301 }
3302
Daniel Veillardbe803962000-06-28 23:40:59 +00003303 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3304 ctxt->sax->startDocument(ctxt->userData);
3305
3306
Daniel Veillard35008381999-10-25 13:15:52 +00003307 /*
3308 * Parse possible comments before any content
3309 */
3310 while ((CUR == '<') && (NXT(1) == '!') &&
3311 (NXT(2) == '-') && (NXT(3) == '-')) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003312 htmlParseComment(ctxt);
Daniel Veillard35008381999-10-25 13:15:52 +00003313 SKIP_BLANKS;
3314 }
3315
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003316
3317 /*
3318 * Then possibly doc type declaration(s) and more Misc
3319 * (doctypedecl Misc*)?
3320 */
3321 if ((CUR == '<') && (NXT(1) == '!') &&
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003322 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3323 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3324 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3325 (UPP(8) == 'E')) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003326 htmlParseDocTypeDecl(ctxt);
3327 }
3328 SKIP_BLANKS;
3329
3330 /*
Daniel Veillard87b95392000-08-12 21:12:04 +00003331 * Parse possible comments before any content
3332 */
3333 while ((CUR == '<') && (NXT(1) == '!') &&
3334 (NXT(2) == '-') && (NXT(3) == '-')) {
3335 htmlParseComment(ctxt);
3336 SKIP_BLANKS;
3337 }
3338
3339 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003340 * Time to start parsing the tree itself
3341 */
Daniel Veillard35008381999-10-25 13:15:52 +00003342 htmlParseContent(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003343
3344 /*
Daniel Veillard365e13b2000-07-02 07:56:37 +00003345 * autoclose
3346 */
3347 if (CUR == 0)
3348 htmlAutoClose(ctxt, NULL);
3349
3350
3351 /*
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003352 * SAX: end of the document processing.
3353 */
3354 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3355 ctxt->sax->endDocument(ctxt->userData);
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003356
3357 if (ctxt->myDoc != NULL) {
3358 dtd = xmlGetIntSubset(ctxt->myDoc);
3359 if (dtd == NULL)
3360 ctxt->myDoc->intSubset =
3361 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3362 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3363 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3364 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003365 if (! ctxt->wellFormed) return(-1);
3366 return(0);
3367}
3368
3369
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003370/************************************************************************
3371 * *
3372 * Parser contexts handling *
3373 * *
3374 ************************************************************************/
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003375
3376/**
3377 * xmlInitParserCtxt:
3378 * @ctxt: an HTML parser context
3379 *
3380 * Initialize a parser context
3381 */
3382
3383void
3384htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3385{
3386 htmlSAXHandler *sax;
3387
Daniel Veillard35008381999-10-25 13:15:52 +00003388 if (ctxt == NULL) return;
3389 memset(ctxt, 0, sizeof(htmlParserCtxt));
3390
Daniel Veillard6454aec1999-09-02 22:04:43 +00003391 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003392 if (sax == NULL) {
3393 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3394 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003395 memset(sax, 0, sizeof(htmlSAXHandler));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003396
3397 /* Allocate the Input stack */
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003398 ctxt->inputTab = (htmlParserInputPtr *)
3399 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3400 if (ctxt->inputTab == NULL) {
3401 fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3402 }
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003403 ctxt->inputNr = 0;
3404 ctxt->inputMax = 5;
3405 ctxt->input = NULL;
3406 ctxt->version = NULL;
3407 ctxt->encoding = NULL;
3408 ctxt->standalone = -1;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003409 ctxt->instate = XML_PARSER_START;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003410
3411 /* Allocate the Node stack */
Daniel Veillard6454aec1999-09-02 22:04:43 +00003412 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003413 ctxt->nodeNr = 0;
3414 ctxt->nodeMax = 10;
3415 ctxt->node = NULL;
3416
Daniel Veillard2673d3c1999-10-08 14:37:09 +00003417 /* Allocate the Name stack */
3418 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3419 ctxt->nameNr = 0;
3420 ctxt->nameMax = 10;
3421 ctxt->name = NULL;
3422
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003423 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3424 else {
3425 ctxt->sax = sax;
3426 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3427 }
3428 ctxt->userData = ctxt;
3429 ctxt->myDoc = NULL;
3430 ctxt->wellFormed = 1;
Daniel Veillard5233ffc1999-07-06 22:25:25 +00003431 ctxt->replaceEntities = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003432 ctxt->html = 1;
3433 ctxt->record_info = 0;
Daniel Veillard35008381999-10-25 13:15:52 +00003434 ctxt->validate = 0;
Daniel Veillardaf78a0e1999-12-12 13:03:50 +00003435 ctxt->nbChars = 0;
Daniel Veillarddbfd6411999-12-28 16:35:14 +00003436 ctxt->checkIndex = 0;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003437 xmlInitNodeInfoSeq(&ctxt->node_seq);
3438}
3439
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * The work is delegated entirely to xmlFreeParserCtxt().
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
3453
3454/**
3455 * htmlCreateDocParserCtxt :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003456 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003457 * @encoding: a free form C string describing the HTML document encoding, or NULL
3458 *
3459 * Create a parser context for an HTML document.
3460 *
3461 * Returns the new parser context or NULL
3462 */
3463htmlParserCtxtPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00003464htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003465 htmlParserCtxtPtr ctxt;
3466 htmlParserInputPtr input;
3467 /* htmlCharEncoding enc; */
3468
Daniel Veillard6454aec1999-09-02 22:04:43 +00003469 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003470 if (ctxt == NULL) {
3471 perror("malloc");
3472 return(NULL);
3473 }
3474 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00003475 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003476 if (input == NULL) {
3477 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00003478 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003479 return(NULL);
3480 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00003481 memset(input, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003482
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003483 input->line = 1;
3484 input->col = 1;
3485 input->base = cur;
3486 input->cur = cur;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00003487
3488 inputPush(ctxt, input);
3489 return(ctxt);
3490}
3491
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003492/************************************************************************
3493 * *
3494 * Progressive parsing interfaces *
3495 * *
3496 ************************************************************************/
3497
3498/**
3499 * htmlParseLookupSequence:
3500 * @ctxt: an HTML parser context
3501 * @first: the first char to lookup
3502 * @next: the next char to lookup or zero
3503 * @third: the next char to lookup or zero
3504 *
3505 * Try to find if a sequence (first, next, third) or just (first next) or
3506 * (first) is available in the input stream.
3507 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3508 * to avoid rescanning sequences of bytes, it DOES change the state of the
3509 * parser, do not use liberally.
3510 * This is basically similar to xmlParseLookupSequence()
3511 *
3512 * Returns the index to the current parsing point if the full sequence
3513 * is available, -1 otherwise.
3514 */
3515int
3516htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3517 xmlChar next, xmlChar third) {
3518 int base, len;
3519 htmlParserInputPtr in;
3520 const xmlChar *buf;
3521
3522 in = ctxt->input;
3523 if (in == NULL) return(-1);
3524 base = in->cur - in->base;
3525 if (base < 0) return(-1);
3526 if (ctxt->checkIndex > base)
3527 base = ctxt->checkIndex;
3528 if (in->buf == NULL) {
3529 buf = in->base;
3530 len = in->length;
3531 } else {
3532 buf = in->buf->buffer->content;
3533 len = in->buf->buffer->use;
3534 }
3535 /* take into account the sequence length */
3536 if (third) len -= 2;
3537 else if (next) len --;
3538 for (;base < len;base++) {
3539 if (buf[base] == first) {
3540 if (third != 0) {
3541 if ((buf[base + 1] != next) ||
3542 (buf[base + 2] != third)) continue;
3543 } else if (next != 0) {
3544 if (buf[base + 1] != next) continue;
3545 }
3546 ctxt->checkIndex = 0;
3547#ifdef DEBUG_PUSH
3548 if (next == 0)
3549 fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3550 first, base);
3551 else if (third == 0)
3552 fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3553 first, next, base);
3554 else
3555 fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3556 first, next, third, base);
3557#endif
3558 return(base - (in->cur - in->base));
3559 }
3560 }
3561 ctxt->checkIndex = base;
3562#ifdef DEBUG_PUSH
3563 if (next == 0)
3564 fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3565 else if (third == 0)
3566 fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3567 else
3568 fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3569#endif
3570 return(-1);
3571}
3572
3573/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00003574 * htmlParseTryOrFinish:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003575 * @ctxt: an HTML parser context
Daniel Veillard71b656e2000-01-05 14:46:17 +00003576 * @terminate: last chunk indicator
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003577 *
3578 * Try to progress on parsing
3579 *
3580 * Returns zero if no parsing was possible
3581 */
3582int
Daniel Veillard71b656e2000-01-05 14:46:17 +00003583htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003584 int ret = 0;
3585 htmlParserInputPtr in;
Daniel Veillard365e13b2000-07-02 07:56:37 +00003586 int avail = 0;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003587 xmlChar cur, next;
3588
3589#ifdef DEBUG_PUSH
3590 switch (ctxt->instate) {
3591 case XML_PARSER_EOF:
3592 fprintf(stderr, "HPP: try EOF\n"); break;
3593 case XML_PARSER_START:
3594 fprintf(stderr, "HPP: try START\n"); break;
3595 case XML_PARSER_MISC:
3596 fprintf(stderr, "HPP: try MISC\n");break;
3597 case XML_PARSER_COMMENT:
3598 fprintf(stderr, "HPP: try COMMENT\n");break;
3599 case XML_PARSER_PROLOG:
3600 fprintf(stderr, "HPP: try PROLOG\n");break;
3601 case XML_PARSER_START_TAG:
3602 fprintf(stderr, "HPP: try START_TAG\n");break;
3603 case XML_PARSER_CONTENT:
3604 fprintf(stderr, "HPP: try CONTENT\n");break;
3605 case XML_PARSER_CDATA_SECTION:
3606 fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3607 case XML_PARSER_END_TAG:
3608 fprintf(stderr, "HPP: try END_TAG\n");break;
3609 case XML_PARSER_ENTITY_DECL:
3610 fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3611 case XML_PARSER_ENTITY_VALUE:
3612 fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3613 case XML_PARSER_ATTRIBUTE_VALUE:
3614 fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3615 case XML_PARSER_DTD:
3616 fprintf(stderr, "HPP: try DTD\n");break;
3617 case XML_PARSER_EPILOG:
3618 fprintf(stderr, "HPP: try EPILOG\n");break;
3619 case XML_PARSER_PI:
3620 fprintf(stderr, "HPP: try PI\n");break;
3621 }
3622#endif
3623
3624 while (1) {
3625
3626 in = ctxt->input;
3627 if (in == NULL) break;
3628 if (in->buf == NULL)
3629 avail = in->length - (in->cur - in->base);
3630 else
3631 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard365e13b2000-07-02 07:56:37 +00003632 if ((avail == 0) && (terminate)) {
3633 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00003634 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3635 /*
3636 * SAX: end of the document processing.
3637 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00003638 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00003639 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3640 ctxt->sax->endDocument(ctxt->userData);
3641 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00003642 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003643 if (avail < 1)
3644 goto done;
3645 switch (ctxt->instate) {
3646 case XML_PARSER_EOF:
3647 /*
3648 * Document parsing is done !
3649 */
3650 goto done;
3651 case XML_PARSER_START:
3652 /*
3653 * Very first chars read from the document flow.
3654 */
3655 cur = in->cur[0];
3656 if (IS_BLANK(cur)) {
3657 SKIP_BLANKS;
3658 if (in->buf == NULL)
3659 avail = in->length - (in->cur - in->base);
3660 else
3661 avail = in->buf->buffer->use - (in->cur - in->base);
3662 }
3663 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3664 ctxt->sax->setDocumentLocator(ctxt->userData,
3665 &xmlDefaultSAXLocator);
Daniel Veillardd83eb822000-06-30 18:39:56 +00003666 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3667 (!ctxt->disableSAX))
3668 ctxt->sax->startDocument(ctxt->userData);
3669
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003670 cur = in->cur[0];
3671 next = in->cur[1];
3672 if ((cur == '<') && (next == '!') &&
3673 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3674 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3675 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3676 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003677 if ((!terminate) &&
3678 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003679 goto done;
3680#ifdef DEBUG_PUSH
3681 fprintf(stderr, "HPP: Parsing internal subset\n");
3682#endif
3683 htmlParseDocTypeDecl(ctxt);
3684 ctxt->instate = XML_PARSER_PROLOG;
3685#ifdef DEBUG_PUSH
3686 fprintf(stderr, "HPP: entering PROLOG\n");
3687#endif
3688 } else {
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003689 ctxt->instate = XML_PARSER_MISC;
3690 }
3691#ifdef DEBUG_PUSH
3692 fprintf(stderr, "HPP: entering MISC\n");
3693#endif
3694 break;
3695 case XML_PARSER_MISC:
3696 SKIP_BLANKS;
3697 if (in->buf == NULL)
3698 avail = in->length - (in->cur - in->base);
3699 else
3700 avail = in->buf->buffer->use - (in->cur - in->base);
3701 if (avail < 2)
3702 goto done;
3703 cur = in->cur[0];
3704 next = in->cur[1];
3705 if ((cur == '<') && (next == '!') &&
3706 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003707 if ((!terminate) &&
3708 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003709 goto done;
3710#ifdef DEBUG_PUSH
3711 fprintf(stderr, "HPP: Parsing Comment\n");
3712#endif
3713 htmlParseComment(ctxt);
3714 ctxt->instate = XML_PARSER_MISC;
3715 } else if ((cur == '<') && (next == '!') &&
3716 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3717 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3718 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3719 (UPP(8) == 'E')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003720 if ((!terminate) &&
3721 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003722 goto done;
3723#ifdef DEBUG_PUSH
3724 fprintf(stderr, "HPP: Parsing internal subset\n");
3725#endif
3726 htmlParseDocTypeDecl(ctxt);
3727 ctxt->instate = XML_PARSER_PROLOG;
3728#ifdef DEBUG_PUSH
3729 fprintf(stderr, "HPP: entering PROLOG\n");
3730#endif
3731 } else if ((cur == '<') && (next == '!') &&
3732 (avail < 9)) {
3733 goto done;
3734 } else {
3735 ctxt->instate = XML_PARSER_START_TAG;
3736#ifdef DEBUG_PUSH
3737 fprintf(stderr, "HPP: entering START_TAG\n");
3738#endif
3739 }
3740 break;
3741 case XML_PARSER_PROLOG:
3742 SKIP_BLANKS;
3743 if (in->buf == NULL)
3744 avail = in->length - (in->cur - in->base);
3745 else
3746 avail = in->buf->buffer->use - (in->cur - in->base);
3747 if (avail < 2)
3748 goto done;
3749 cur = in->cur[0];
3750 next = in->cur[1];
3751 if ((cur == '<') && (next == '!') &&
3752 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003753 if ((!terminate) &&
3754 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003755 goto done;
3756#ifdef DEBUG_PUSH
3757 fprintf(stderr, "HPP: Parsing Comment\n");
3758#endif
3759 htmlParseComment(ctxt);
3760 ctxt->instate = XML_PARSER_PROLOG;
3761 } else if ((cur == '<') && (next == '!') &&
3762 (avail < 4)) {
3763 goto done;
3764 } else {
3765 ctxt->instate = XML_PARSER_START_TAG;
3766#ifdef DEBUG_PUSH
3767 fprintf(stderr, "HPP: entering START_TAG\n");
3768#endif
3769 }
3770 break;
3771 case XML_PARSER_EPILOG:
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003772 if (in->buf == NULL)
3773 avail = in->length - (in->cur - in->base);
3774 else
3775 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillard87b95392000-08-12 21:12:04 +00003776 if (avail < 1)
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003777 goto done;
3778 cur = in->cur[0];
Daniel Veillard87b95392000-08-12 21:12:04 +00003779 if (IS_BLANK(cur)) {
3780 htmlParseCharData(ctxt, 0);
3781 goto done;
3782 }
3783 if (avail < 2)
3784 goto done;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003785 next = in->cur[1];
3786 if ((cur == '<') && (next == '!') &&
3787 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00003788 if ((!terminate) &&
3789 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003790 goto done;
3791#ifdef DEBUG_PUSH
3792 fprintf(stderr, "HPP: Parsing Comment\n");
3793#endif
3794 htmlParseComment(ctxt);
3795 ctxt->instate = XML_PARSER_EPILOG;
3796 } else if ((cur == '<') && (next == '!') &&
3797 (avail < 4)) {
3798 goto done;
3799 } else {
3800 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3801 ctxt->sax->error(ctxt->userData,
3802 "Extra content at the end of the document\n");
3803 ctxt->wellFormed = 0;
3804 ctxt->errNo = XML_ERR_DOCUMENT_END;
3805 ctxt->instate = XML_PARSER_EOF;
3806#ifdef DEBUG_PUSH
3807 fprintf(stderr, "HPP: entering EOF\n");
3808#endif
3809 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3810 ctxt->sax->endDocument(ctxt->userData);
3811 goto done;
3812 }
3813 break;
3814 case XML_PARSER_START_TAG: {
3815 xmlChar *name, *oldname;
3816 int depth = ctxt->nameNr;
3817 htmlElemDescPtr info;
3818
3819 if (avail < 2)
3820 goto done;
3821 cur = in->cur[0];
3822 if (cur != '<') {
3823 ctxt->instate = XML_PARSER_CONTENT;
3824#ifdef DEBUG_PUSH
3825 fprintf(stderr, "HPP: entering CONTENT\n");
3826#endif
3827 break;
3828 }
Daniel Veillard71b656e2000-01-05 14:46:17 +00003829 if ((!terminate) &&
3830 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003831 goto done;
3832
3833 oldname = xmlStrdup(ctxt->name);
3834 htmlParseStartTag(ctxt);
3835 name = ctxt->name;
3836#ifdef DEBUG
3837 if (oldname == NULL)
3838 fprintf(stderr, "Start of element %s\n", name);
3839 else if (name == NULL)
3840 fprintf(stderr, "Start of element failed, was %s\n",
3841 oldname);
3842 else
3843 fprintf(stderr, "Start of element %s, was %s\n",
3844 name, oldname);
3845#endif
3846 if (((depth == ctxt->nameNr) &&
3847 (!xmlStrcmp(oldname, ctxt->name))) ||
3848 (name == NULL)) {
3849 if (CUR == '>')
3850 NEXT;
3851 if (oldname != NULL)
3852 xmlFree(oldname);
3853 break;
3854 }
3855 if (oldname != NULL)
3856 xmlFree(oldname);
3857
3858 /*
3859 * Lookup the info for that element.
3860 */
3861 info = htmlTagLookup(name);
3862 if (info == NULL) {
3863 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3864 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3865 name);
3866 ctxt->wellFormed = 0;
3867 } else if (info->depr) {
3868 /***************************
3869 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3870 ctxt->sax->warning(ctxt->userData,
3871 "Tag %s is deprecated\n",
3872 name);
3873 ***************************/
3874 }
3875
3876 /*
3877 * Check for an Empty Element labelled the XML/SGML way
3878 */
3879 if ((CUR == '/') && (NXT(1) == '>')) {
3880 SKIP(2);
3881 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3882 ctxt->sax->endElement(ctxt->userData, name);
3883 oldname = htmlnamePop(ctxt);
3884#ifdef DEBUG
3885 fprintf(stderr,"End of tag the XML way: popping out %s\n",
3886 oldname);
3887#endif
3888 if (oldname != NULL)
3889 xmlFree(oldname);
3890 ctxt->instate = XML_PARSER_CONTENT;
3891#ifdef DEBUG_PUSH
3892 fprintf(stderr, "HPP: entering CONTENT\n");
3893#endif
3894 break;
3895 }
3896
3897 if (CUR == '>') {
3898 NEXT;
3899 } else {
3900 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3901 ctxt->sax->error(ctxt->userData,
3902 "Couldn't find end of Start Tag %s\n",
3903 name);
3904 ctxt->wellFormed = 0;
3905
3906 /*
3907 * end of parsing of this node.
3908 */
3909 if (!xmlStrcmp(name, ctxt->name)) {
3910 nodePop(ctxt);
3911 oldname = htmlnamePop(ctxt);
3912#ifdef DEBUG
3913 fprintf(stderr,
3914 "End of start tag problem: popping out %s\n", oldname);
3915#endif
3916 if (oldname != NULL)
3917 xmlFree(oldname);
3918 }
3919
3920 ctxt->instate = XML_PARSER_CONTENT;
3921#ifdef DEBUG_PUSH
3922 fprintf(stderr, "HPP: entering CONTENT\n");
3923#endif
3924 break;
3925 }
3926
3927 /*
3928 * Check for an Empty Element from DTD definition
3929 */
3930 if ((info != NULL) && (info->empty)) {
3931 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3932 ctxt->sax->endElement(ctxt->userData, name);
3933 oldname = htmlnamePop(ctxt);
3934#ifdef DEBUG
3935 fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3936#endif
3937 if (oldname != NULL)
3938 xmlFree(oldname);
3939 }
3940 ctxt->instate = XML_PARSER_CONTENT;
3941#ifdef DEBUG_PUSH
3942 fprintf(stderr, "HPP: entering CONTENT\n");
3943#endif
3944 break;
3945 }
Daniel Veillard87b95392000-08-12 21:12:04 +00003946 case XML_PARSER_CONTENT: {
3947 long cons;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003948 /*
3949 * Handle preparsed entities and charRef
3950 */
3951 if (ctxt->token != 0) {
Daniel Veillard365e13b2000-07-02 07:56:37 +00003952 xmlChar chr[2] = { 0 , 0 } ;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003953
Daniel Veillard365e13b2000-07-02 07:56:37 +00003954 chr[0] = (xmlChar) ctxt->token;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003955 htmlCheckParagraph(ctxt);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003956 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
Daniel Veillard365e13b2000-07-02 07:56:37 +00003957 ctxt->sax->characters(ctxt->userData, chr, 1);
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003958 ctxt->token = 0;
3959 ctxt->checkIndex = 0;
3960 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00003961 if ((avail == 1) && (terminate)) {
3962 cur = in->cur[0];
3963 if ((cur != '<') && (cur != '&')) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003964 if (ctxt->sax != NULL) {
3965 if (IS_BLANK(cur)) {
3966 if (ctxt->sax->ignorableWhitespace != NULL)
3967 ctxt->sax->ignorableWhitespace(
3968 ctxt->userData, &cur, 1);
3969 } else {
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003970 htmlCheckParagraph(ctxt);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00003971 if (ctxt->sax->characters != NULL)
3972 ctxt->sax->characters(
3973 ctxt->userData, &cur, 1);
3974 }
3975 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00003976 ctxt->token = 0;
3977 ctxt->checkIndex = 0;
3978 NEXT;
3979 }
3980 break;
3981 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00003982 if (avail < 2)
3983 goto done;
3984 cur = in->cur[0];
3985 next = in->cur[1];
Daniel Veillard87b95392000-08-12 21:12:04 +00003986 cons = ctxt->nbChars;
Daniel Veillardb8f25c92000-08-19 19:52:36 +00003987 /*
3988 * Sometimes DOCTYPE arrives in the middle of the document
3989 */
3990 if ((cur == '<') && (next == '!') &&
3991 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3992 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3993 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3994 (UPP(8) == 'E')) {
3995 if ((!terminate) &&
3996 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
3997 goto done;
3998 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3999 ctxt->sax->error(ctxt->userData,
4000 "Misplaced DOCTYPE declaration\n");
4001 ctxt->wellFormed = 0;
4002 htmlParseDocTypeDecl(ctxt);
4003 } else if ((cur == '<') && (next == '!') &&
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004004 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004005 if ((!terminate) &&
4006 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004007 goto done;
4008#ifdef DEBUG_PUSH
4009 fprintf(stderr, "HPP: Parsing Comment\n");
4010#endif
4011 htmlParseComment(ctxt);
4012 ctxt->instate = XML_PARSER_CONTENT;
4013 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4014 goto done;
4015 } else if ((cur == '<') && (next == '/')) {
4016 ctxt->instate = XML_PARSER_END_TAG;
4017 ctxt->checkIndex = 0;
4018#ifdef DEBUG_PUSH
4019 fprintf(stderr, "HPP: entering END_TAG\n");
4020#endif
4021 break;
4022 } else if (cur == '<') {
4023 ctxt->instate = XML_PARSER_START_TAG;
4024 ctxt->checkIndex = 0;
4025#ifdef DEBUG_PUSH
4026 fprintf(stderr, "HPP: entering START_TAG\n");
4027#endif
4028 break;
4029 } else if (cur == '&') {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004030 if ((!terminate) &&
4031 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004032 goto done;
4033#ifdef DEBUG_PUSH
4034 fprintf(stderr, "HPP: Parsing Reference\n");
4035#endif
4036 /* TODO: check generation of subtrees if noent !!! */
4037 htmlParseReference(ctxt);
4038 } else {
4039 /* TODO Avoid the extra copy, handle directly !!!!!! */
4040 /*
4041 * Goal of the following test is :
4042 * - minimize calls to the SAX 'character' callback
4043 * when they are mergeable
4044 */
4045 if ((ctxt->inputNr == 1) &&
4046 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
Daniel Veillard71b656e2000-01-05 14:46:17 +00004047 if ((!terminate) &&
4048 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004049 goto done;
4050 }
4051 ctxt->checkIndex = 0;
4052#ifdef DEBUG_PUSH
4053 fprintf(stderr, "HPP: Parsing char data\n");
4054#endif
4055 htmlParseCharData(ctxt, 0);
4056 }
Daniel Veillard87b95392000-08-12 21:12:04 +00004057 if (cons == ctxt->nbChars) {
4058 if (ctxt->node != NULL) {
4059 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4060 ctxt->sax->error(ctxt->userData,
4061 "detected an error in element content\n");
4062 ctxt->wellFormed = 0;
4063 NEXT;
4064 }
4065 break;
4066 }
4067
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004068 break;
Daniel Veillard87b95392000-08-12 21:12:04 +00004069 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004070 case XML_PARSER_END_TAG:
4071 if (avail < 2)
4072 goto done;
Daniel Veillard71b656e2000-01-05 14:46:17 +00004073 if ((!terminate) &&
4074 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004075 goto done;
4076 htmlParseEndTag(ctxt);
4077 if (ctxt->nameNr == 0) {
4078 ctxt->instate = XML_PARSER_EPILOG;
4079 } else {
4080 ctxt->instate = XML_PARSER_CONTENT;
4081 }
4082 ctxt->checkIndex = 0;
4083#ifdef DEBUG_PUSH
4084 fprintf(stderr, "HPP: entering CONTENT\n");
4085#endif
4086 break;
4087 case XML_PARSER_CDATA_SECTION:
4088 fprintf(stderr, "HPP: internal error, state == CDATA\n");
4089 ctxt->instate = XML_PARSER_CONTENT;
4090 ctxt->checkIndex = 0;
4091#ifdef DEBUG_PUSH
4092 fprintf(stderr, "HPP: entering CONTENT\n");
4093#endif
4094 break;
4095 case XML_PARSER_DTD:
4096 fprintf(stderr, "HPP: internal error, state == DTD\n");
4097 ctxt->instate = XML_PARSER_CONTENT;
4098 ctxt->checkIndex = 0;
4099#ifdef DEBUG_PUSH
4100 fprintf(stderr, "HPP: entering CONTENT\n");
4101#endif
4102 break;
4103 case XML_PARSER_COMMENT:
4104 fprintf(stderr, "HPP: internal error, state == COMMENT\n");
4105 ctxt->instate = XML_PARSER_CONTENT;
4106 ctxt->checkIndex = 0;
4107#ifdef DEBUG_PUSH
4108 fprintf(stderr, "HPP: entering CONTENT\n");
4109#endif
4110 break;
4111 case XML_PARSER_PI:
4112 fprintf(stderr, "HPP: internal error, state == PI\n");
4113 ctxt->instate = XML_PARSER_CONTENT;
4114 ctxt->checkIndex = 0;
4115#ifdef DEBUG_PUSH
4116 fprintf(stderr, "HPP: entering CONTENT\n");
4117#endif
4118 break;
4119 case XML_PARSER_ENTITY_DECL:
4120 fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
4121 ctxt->instate = XML_PARSER_CONTENT;
4122 ctxt->checkIndex = 0;
4123#ifdef DEBUG_PUSH
4124 fprintf(stderr, "HPP: entering CONTENT\n");
4125#endif
4126 break;
4127 case XML_PARSER_ENTITY_VALUE:
4128 fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
4129 ctxt->instate = XML_PARSER_CONTENT;
4130 ctxt->checkIndex = 0;
4131#ifdef DEBUG_PUSH
4132 fprintf(stderr, "HPP: entering DTD\n");
4133#endif
4134 break;
4135 case XML_PARSER_ATTRIBUTE_VALUE:
4136 fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4137 ctxt->instate = XML_PARSER_START_TAG;
4138 ctxt->checkIndex = 0;
4139#ifdef DEBUG_PUSH
4140 fprintf(stderr, "HPP: entering START_TAG\n");
4141#endif
4142 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00004143 case XML_PARSER_SYSTEM_LITERAL:
4144 fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4145 ctxt->instate = XML_PARSER_CONTENT;
4146 ctxt->checkIndex = 0;
4147#ifdef DEBUG_PUSH
4148 fprintf(stderr, "HPP: entering CONTENT\n");
4149#endif
4150 break;
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004151 }
4152 }
4153done:
Daniel Veillard365e13b2000-07-02 07:56:37 +00004154 if ((avail == 0) && (terminate)) {
4155 htmlAutoClose(ctxt, NULL);
Daniel Veillard87b95392000-08-12 21:12:04 +00004156 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4157 /*
4158 * SAX: end of the document processing.
4159 */
Daniel Veillard365e13b2000-07-02 07:56:37 +00004160 ctxt->instate = XML_PARSER_EOF;
Daniel Veillard87b95392000-08-12 21:12:04 +00004161 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4162 ctxt->sax->endDocument(ctxt->userData);
4163 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00004164 }
Daniel Veillardb8f25c92000-08-19 19:52:36 +00004165 if ((ctxt->myDoc != NULL) &&
4166 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4167 (ctxt->instate == XML_PARSER_EPILOG))) {
4168 xmlDtdPtr dtd;
4169 dtd = xmlGetIntSubset(ctxt->myDoc);
4170 if (dtd == NULL)
4171 ctxt->myDoc->intSubset =
4172 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4173 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4174 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4175 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004176#ifdef DEBUG_PUSH
4177 fprintf(stderr, "HPP: done %d\n", ret);
4178#endif
4179 return(ret);
4180}
4181
4182/**
Daniel Veillard71b656e2000-01-05 14:46:17 +00004183 * htmlParseTry:
4184 * @ctxt: an HTML parser context
4185 *
4186 * Try to progress on parsing
4187 *
4188 * Returns zero if no parsing was possible
4189 */
4190int
4191htmlParseTry(htmlParserCtxtPtr ctxt) {
4192 return(htmlParseTryOrFinish(ctxt, 0));
4193}
4194
4195/**
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004196 * htmlParseChunk:
4197 * @ctxt: an XML parser context
4198 * @chunk: an char array
4199 * @size: the size in byte of the chunk
4200 * @terminate: last chunk indicator
4201 *
4202 * Parse a Chunk of memory
4203 *
4204 * Returns zero if no error, the xmlParserErrors otherwise.
4205 */
4206int
4207htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4208 int terminate) {
4209 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4210 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4211 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4212 int cur = ctxt->input->cur - ctxt->input->base;
4213
4214 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4215 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4216 ctxt->input->cur = ctxt->input->base + cur;
4217#ifdef DEBUG_PUSH
4218 fprintf(stderr, "HPP: pushed %d\n", size);
4219#endif
4220
Daniel Veillardd0f7f742000-02-02 17:42:48 +00004221 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4222 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004223 } else if (ctxt->instate != XML_PARSER_EOF) {
4224 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
Daniel Veillard71b656e2000-01-05 14:46:17 +00004225 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard52402ce2000-08-22 23:36:12 +00004226 }
Daniel Veillard5e5c6231999-12-29 12:49:06 +00004227 if (terminate) {
4228 if ((ctxt->instate != XML_PARSER_EOF) &&
4229 (ctxt->instate != XML_PARSER_EPILOG) &&
4230 (ctxt->instate != XML_PARSER_MISC)) {
4231 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4232 ctxt->sax->error(ctxt->userData,
4233 "Extra content at the end of the document\n");
4234 ctxt->wellFormed = 0;
4235 ctxt->errNo = XML_ERR_DOCUMENT_END;
4236 }
4237 if (ctxt->instate != XML_PARSER_EOF) {
4238 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4239 ctxt->sax->endDocument(ctxt->userData);
4240 }
4241 ctxt->instate = XML_PARSER_EOF;
4242 }
4243 return((xmlParserErrors) ctxt->errNo);
4244}
4245
4246/************************************************************************
4247 * *
4248 * User entry points *
4249 * *
4250 ************************************************************************/
4251
4252/**
4253 * htmlCreatePushParserCtxt :
4254 * @sax: a SAX handler
4255 * @user_data: The user data returned on SAX callbacks
4256 * @chunk: a pointer to an array of chars
4257 * @size: number of chars in the array
4258 * @filename: an optional file name or URI
4259 * @enc: an optional encoding
4260 *
4261 * Create a parser context for using the HTML parser in push mode
4262 * To allow content encoding detection, @size should be >= 4
4263 * The value of @filename is used for fetching external entities
4264 * and error/warning reports.
4265 *
4266 * Returns the new parser context or NULL
4267 */
4268htmlParserCtxtPtr
4269htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4270 const char *chunk, int size, const char *filename,
4271 xmlCharEncoding enc) {
4272 htmlParserCtxtPtr ctxt;
4273 htmlParserInputPtr inputStream;
4274 xmlParserInputBufferPtr buf;
4275
4276 buf = xmlAllocParserInputBuffer(enc);
4277 if (buf == NULL) return(NULL);
4278
4279 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4280 if (ctxt == NULL) {
4281 xmlFree(buf);
4282 return(NULL);
4283 }
4284 memset(ctxt, 0, sizeof(htmlParserCtxt));
4285 htmlInitParserCtxt(ctxt);
4286 if (sax != NULL) {
4287 if (ctxt->sax != &htmlDefaultSAXHandler)
4288 xmlFree(ctxt->sax);
4289 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4290 if (ctxt->sax == NULL) {
4291 xmlFree(buf);
4292 xmlFree(ctxt);
4293 return(NULL);
4294 }
4295 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4296 if (user_data != NULL)
4297 ctxt->userData = user_data;
4298 }
4299 if (filename == NULL) {
4300 ctxt->directory = NULL;
4301 } else {
4302 ctxt->directory = xmlParserGetDirectory(filename);
4303 }
4304
4305 inputStream = htmlNewInputStream(ctxt);
4306 if (inputStream == NULL) {
4307 xmlFreeParserCtxt(ctxt);
4308 return(NULL);
4309 }
4310
4311 if (filename == NULL)
4312 inputStream->filename = NULL;
4313 else
4314 inputStream->filename = xmlMemStrdup(filename);
4315 inputStream->buf = buf;
4316 inputStream->base = inputStream->buf->buffer->content;
4317 inputStream->cur = inputStream->buf->buffer->content;
4318
4319 inputPush(ctxt, inputStream);
4320
4321 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4322 (ctxt->input->buf != NULL)) {
4323 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4324#ifdef DEBUG_PUSH
4325 fprintf(stderr, "HPP: pushed %d\n", size);
4326#endif
4327 }
4328
4329 return(ctxt);
4330}
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004331
4332/**
4333 * htmlSAXParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004334 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004335 * @encoding: a free form C string describing the HTML document encoding, or NULL
4336 * @sax: the SAX handler block
4337 * @userData: if using SAX, this pointer will be provided on callbacks.
4338 *
4339 * parse an HTML in-memory document and build a tree.
4340 * It use the given SAX function block to handle the parsing callback.
4341 * If sax is NULL, fallback to the default DOM tree building routines.
4342 *
4343 * Returns the resulting document tree
4344 */
4345
4346htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004347htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004348 htmlDocPtr ret;
4349 htmlParserCtxtPtr ctxt;
4350
4351 if (cur == NULL) return(NULL);
4352
4353
4354 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4355 if (ctxt == NULL) return(NULL);
4356 if (sax != NULL) {
4357 ctxt->sax = sax;
4358 ctxt->userData = userData;
4359 }
4360
4361 htmlParseDocument(ctxt);
4362 ret = ctxt->myDoc;
4363 if (sax != NULL) {
4364 ctxt->sax = NULL;
4365 ctxt->userData = NULL;
4366 }
4367 htmlFreeParserCtxt(ctxt);
4368
4369 return(ret);
4370}
4371
4372/**
4373 * htmlParseDoc :
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004374 * @cur: a pointer to an array of xmlChar
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004375 * @encoding: a free form C string describing the HTML document encoding, or NULL
4376 *
4377 * parse an HTML in-memory document and build a tree.
4378 *
4379 * Returns the resulting document tree
4380 */
4381
4382htmlDocPtr
Daniel Veillarddd6b3671999-09-23 22:19:22 +00004383htmlParseDoc(xmlChar *cur, const char *encoding) {
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004384 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4385}
4386
4387
4388/**
4389 * htmlCreateFileParserCtxt :
4390 * @filename: the filename
4391 * @encoding: a free form C string describing the HTML document encoding, or NULL
4392 *
4393 * Create a parser context for a file content.
4394 * Automatic support for ZLIB/Compress compressed document is provided
4395 * by default if found at compile-time.
4396 *
4397 * Returns the new parser context or NULL
4398 */
4399htmlParserCtxtPtr
4400htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4401{
4402 htmlParserCtxtPtr ctxt;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004403 htmlParserInputPtr inputStream;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004404 xmlParserInputBufferPtr buf;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004405 /* htmlCharEncoding enc; */
4406
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004407 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4408 if (buf == NULL) return(NULL);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004409
Daniel Veillard6454aec1999-09-02 22:04:43 +00004410 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004411 if (ctxt == NULL) {
4412 perror("malloc");
4413 return(NULL);
4414 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004415 memset(ctxt, 0, sizeof(htmlParserCtxt));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004416 htmlInitParserCtxt(ctxt);
Daniel Veillard6454aec1999-09-02 22:04:43 +00004417 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004418 if (inputStream == NULL) {
4419 perror("malloc");
Daniel Veillard6454aec1999-09-02 22:04:43 +00004420 xmlFree(ctxt);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004421 return(NULL);
4422 }
Daniel Veillard7c1206f1999-10-14 09:10:25 +00004423 memset(inputStream, 0, sizeof(htmlParserInput));
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004424
Daniel Veillard6454aec1999-09-02 22:04:43 +00004425 inputStream->filename = xmlMemStrdup(filename);
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004426 inputStream->line = 1;
4427 inputStream->col = 1;
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004428 inputStream->buf = buf;
Daniel Veillard35008381999-10-25 13:15:52 +00004429 inputStream->directory = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004430
Daniel Veillarde2d034d1999-07-27 19:52:06 +00004431 inputStream->base = inputStream->buf->buffer->content;
4432 inputStream->cur = inputStream->buf->buffer->content;
4433 inputStream->free = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004434
4435 inputPush(ctxt, inputStream);
4436 return(ctxt);
4437}
4438
4439/**
4440 * htmlSAXParseFile :
4441 * @filename: the filename
4442 * @encoding: a free form C string describing the HTML document encoding, or NULL
4443 * @sax: the SAX handler block
4444 * @userData: if using SAX, this pointer will be provided on callbacks.
4445 *
4446 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4447 * compressed document is provided by default if found at compile-time.
4448 * It use the given SAX function block to handle the parsing callback.
4449 * If sax is NULL, fallback to the default DOM tree building routines.
4450 *
4451 * Returns the resulting document tree
4452 */
4453
4454htmlDocPtr
4455htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4456 void *userData) {
4457 htmlDocPtr ret;
4458 htmlParserCtxtPtr ctxt;
Daniel Veillard87b95392000-08-12 21:12:04 +00004459 htmlSAXHandlerPtr oldsax = NULL;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004460
4461 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4462 if (ctxt == NULL) return(NULL);
4463 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004464 oldsax = ctxt->sax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004465 ctxt->sax = sax;
4466 ctxt->userData = userData;
4467 }
4468
4469 htmlParseDocument(ctxt);
4470
4471 ret = ctxt->myDoc;
4472 if (sax != NULL) {
Daniel Veillard87b95392000-08-12 21:12:04 +00004473 ctxt->sax = oldsax;
Daniel Veillardbe70ff71999-07-05 16:50:46 +00004474 ctxt->userData = NULL;
4475 }
4476 htmlFreeParserCtxt(ctxt);
4477
4478 return(ret);
4479}
4480
4481/**
4482 * htmlParseFile :
4483 * @filename: the filename
4484 * @encoding: a free form C string describing the HTML document encoding, or NULL
4485 *
4486 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4487 * compressed document is provided by default if found at compile-time.
4488 *
4489 * Returns the resulting document tree
4490 */
4491
4492htmlDocPtr
4493htmlParseFile(const char *filename, const char *encoding) {
4494 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4495}
Daniel Veillard361d8452000-04-03 19:48:13 +00004496
4497#endif /* LIBXML_HTML_ENABLED */