blob: 10f851660725f5d6df0fc906bf3ac674d4112fa7 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillardf403d292003-10-05 13:51:35 +0000112 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000113 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000114 XML_ERR_ERROR, NULL, 0,
115 (const char *) str1, (const char *) str2,
116 NULL, 0, 0,
117 msg, str1, str2);
118 ctxt->wellFormed = 0;
119}
120
121/**
122 * htmlParseErrInt:
123 * @ctxt: an HTML parser context
124 * @error: the error number
125 * @msg: the error message
126 * @val: integer info
127 *
128 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
129 */
130static void
131htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
132 const char *msg, int val)
133{
Daniel Veillard157fee02003-10-31 10:36:03 +0000134 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
135 (ctxt->instate == XML_PARSER_EOF))
136 return;
Daniel Veillardf403d292003-10-05 13:51:35 +0000137 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000138 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000139 XML_ERR_ERROR, NULL, 0, NULL, NULL,
140 NULL, val, 0, msg, val);
141 ctxt->wellFormed = 0;
142}
143
144/************************************************************************
145 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000146 * Parser stacks related functions and macros *
147 * *
148 ************************************************************************/
149
Daniel Veillard1c732d22002-11-30 11:22:59 +0000150/**
151 * htmlnamePush:
152 * @ctxt: an HTML parser context
153 * @value: the element name
154 *
155 * Pushes a new element name on top of the name stack
156 *
157 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000158 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000159static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000160htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000161{
162 if (ctxt->nameNr >= ctxt->nameMax) {
163 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000165 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166 ctxt->nameMax *
167 sizeof(ctxt->nameTab[0]));
168 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000169 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 return (0);
171 }
172 }
173 ctxt->nameTab[ctxt->nameNr] = value;
174 ctxt->name = value;
175 return (ctxt->nameNr++);
176}
177/**
178 * htmlnamePop:
179 * @ctxt: an HTML parser context
180 *
181 * Pops the top element name from the name stack
182 *
183 * Returns the name just removed
184 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000185static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000186htmlnamePop(htmlParserCtxtPtr ctxt)
187{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000188 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000189
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190 if (ctxt->nameNr <= 0)
191 return (0);
192 ctxt->nameNr--;
193 if (ctxt->nameNr < 0)
194 return (0);
195 if (ctxt->nameNr > 0)
196 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
197 else
198 ctxt->name = NULL;
199 ret = ctxt->nameTab[ctxt->nameNr];
200 ctxt->nameTab[ctxt->nameNr] = 0;
201 return (ret);
202}
Owen Taylor3473f882001-02-23 17:55:21 +0000203
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. They all expect a variable named ctxt (the parser
 * context) to be in scope where they are used.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them. They work on raw xmlChar units and should only be used to compare
 * against or skip over ASCII content:
 *
 *   CUR_PTR   the current pointer to the xmlChar to be parsed.
 *   CUR       the xmlChar at the current position, as an int; no
 *   CURRENT   multi-byte decoding is done (both names expand the same).
 *   RAW       -1 when ctxt->token is set, the current xmlChar otherwise.
 *   NXT(n)    the n'th next xmlChar.
 *   UPPER     the current xmlChar converted with toupper().
 *   UPP(n)    the n'th next xmlChar converted with toupper().
 *   SKIP(n)   advance the input pointer, the column and nbChars by n.
 *             The line counter is not touched, so this must only be used
 *             to skip strings without newlines. n is evaluated more than
 *             once: do not pass an expression with side effects.
 *
 * Statement macros (use them as a statement followed by ';'):
 *
 *   SHRINK    call xmlParserInputShrink() when a lot of input has been
 *             consumed and little is left ahead.
 *   GROW      call xmlParserInputGrow() when not in progressive mode and
 *             less than INPUT_CHUNK is left ahead.
 *   NEXTL(l)  step over the current character of l xmlChars, updating
 *             line/col and nbChars and clearing ctxt->token.
 *   COPY_BUF(l,b,i,v)
 *             store character v, whose encoded length is l, in buffer b
 *             at index i and advance i; xmlCopyChar() is used when l != 1.
 *
 * Shorthands for function calls:
 *
 *   NEXT            xmlNextChar(ctxt)
 *   SKIP_BLANKS     htmlSkipBlankChars(ctxt)
 *   CUR_CHAR(l)     htmlCurrentChar(ctxt, &l)
 *   CUR_SCHAR(s,l)  xmlStringCurrentChar(ctxt, s, &l)
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK do {                                                     \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&     \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))        \
        xmlParserInputShrink(ctxt->input);                              \
  } while (0)

#define GROW do {                                                       \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))            \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                   \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
285
286/**
287 * htmlCurrentChar:
288 * @ctxt: the HTML parser context
289 * @len: pointer to the length of the char read
290 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000291 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000292 * bytes in the input buffer. Implement the end of line normalization:
293 * 2.11 End-of-Line Handling
294 * If the encoding is unspecified, in the case we find an ISO-Latin-1
295 * char, then the encoding converter is plugged in automatically.
296 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000297 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000298 */
299
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000300static int
Owen Taylor3473f882001-02-23 17:55:21 +0000301htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
302 if (ctxt->instate == XML_PARSER_EOF)
303 return(0);
304
305 if (ctxt->token != 0) {
306 *len = 0;
307 return(ctxt->token);
308 }
309 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
310 /*
311 * We are supposed to handle UTF8, check it's valid
312 * From rfc2044: encoding of the Unicode values on UTF-8:
313 *
314 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
315 * 0000 0000-0000 007F 0xxxxxxx
316 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
317 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
318 *
319 * Check for the 0x110000 limit too
320 */
321 const unsigned char *cur = ctxt->input->cur;
322 unsigned char c;
323 unsigned int val;
324
325 c = *cur;
326 if (c & 0x80) {
327 if (cur[1] == 0)
328 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
329 if ((cur[1] & 0xc0) != 0x80)
330 goto encoding_error;
331 if ((c & 0xe0) == 0xe0) {
332
333 if (cur[2] == 0)
334 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
335 if ((cur[2] & 0xc0) != 0x80)
336 goto encoding_error;
337 if ((c & 0xf0) == 0xf0) {
338 if (cur[3] == 0)
339 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
340 if (((c & 0xf8) != 0xf0) ||
341 ((cur[3] & 0xc0) != 0x80))
342 goto encoding_error;
343 /* 4-byte code */
344 *len = 4;
345 val = (cur[0] & 0x7) << 18;
346 val |= (cur[1] & 0x3f) << 12;
347 val |= (cur[2] & 0x3f) << 6;
348 val |= cur[3] & 0x3f;
349 } else {
350 /* 3-byte code */
351 *len = 3;
352 val = (cur[0] & 0xf) << 12;
353 val |= (cur[1] & 0x3f) << 6;
354 val |= cur[2] & 0x3f;
355 }
356 } else {
357 /* 2-byte code */
358 *len = 2;
359 val = (cur[0] & 0x1f) << 6;
360 val |= cur[1] & 0x3f;
361 }
362 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000363 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
364 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000365 }
366 return(val);
367 } else {
368 /* 1-byte code */
369 *len = 1;
370 return((int) *ctxt->input->cur);
371 }
372 }
373 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000374 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000375 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000376 * XML constructs only use < 128 chars
377 */
378 *len = 1;
379 if ((int) *ctxt->input->cur < 0x80)
380 return((int) *ctxt->input->cur);
381
382 /*
383 * Humm this is bad, do an automatic flow conversion
384 */
385 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
386 ctxt->charset = XML_CHAR_ENCODING_UTF8;
387 return(xmlCurrentChar(ctxt, len));
388
389encoding_error:
390 /*
391 * If we detect an UTF8 error that probably mean that the
392 * input encoding didn't get properly advertized in the
393 * declaration header. Report the error and switch the encoding
394 * to ISO-Latin-1 (if you don't like this policy, just declare the
395 * encoding !)
396 */
Daniel Veillardf403d292003-10-05 13:51:35 +0000397 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
398 "Input is not proper UTF-8, indicate encoding !\n",
399 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000400 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000401 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
402 ctxt->input->cur[0], ctxt->input->cur[1],
403 ctxt->input->cur[2], ctxt->input->cur[3]);
404 }
405
406 ctxt->charset = XML_CHAR_ENCODING_8859_1;
407 *len = 1;
408 return((int) *ctxt->input->cur);
409}
410
411/**
Owen Taylor3473f882001-02-23 17:55:21 +0000412 * htmlSkipBlankChars:
413 * @ctxt: the HTML parser context
414 *
415 * skip all blanks character found at that point in the input streams.
416 *
417 * Returns the number of space chars skipped
418 */
419
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000420static int
Owen Taylor3473f882001-02-23 17:55:21 +0000421htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
422 int res = 0;
423
William M. Brack76e95df2003-10-18 16:20:14 +0000424 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000425 if ((*ctxt->input->cur == 0) &&
426 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
427 xmlPopInput(ctxt);
428 } else {
429 if (*(ctxt->input->cur) == '\n') {
430 ctxt->input->line++; ctxt->input->col = 1;
431 } else ctxt->input->col++;
432 ctxt->input->cur++;
433 ctxt->nbChars++;
434 if (*ctxt->input->cur == 0)
435 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
436 }
437 res++;
438 }
439 return(res);
440}
441
442
443
444/************************************************************************
445 * *
446 * The list of HTML elements and their properties *
447 * *
448 ************************************************************************/
449
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000462
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each group macro expands to a comma-separated list of element names and
 * has an NB_xxx companion giving the number of names in the list. Groups
 * built from other groups must separate them with commas: two string
 * literals that end up side by side are silently concatenated by the
 * compiler into a single, bogus element name.
 * PCDATA and MODIFIER are empty on purpose and therefore take no comma.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 15
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of the elements in each content group */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
496
497
/*
 * ... and for HTML Attributes
 *
 * Each group macro expands to a comma-separated list of attribute names
 * and has an NB_xxx companion giving the number of names in the list.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated lists of the attributes in each group */
static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;
517
518
519/* Other declarations that should go inline ... */
520static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
521 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
522 "tabindex", "onfocus", "onblur", NULL } ;
523static const char* target_attr[] = { "target", NULL } ;
524static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
525static const char* alt_attr[] = { "alt", NULL } ;
526static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
527static const char* href_attrs[] = { "href", NULL } ;
528static const char* clear_attrs[] = { "clear", NULL } ;
529static const char* inline_p[] = { INLINE, "p", NULL } ;
530static const char* flow_param[] = { FLOW, "param", NULL } ;
531static const char* applet_attrs[] = { COREATTRS , "codebase",
532 "archive", "alt", "name", "height", "width", "align",
533 "hspace", "vspace", NULL } ;
534static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
535 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
536static const char* basefont_attrs[] =
537 { "id", "size", "color", "face", NULL } ;
538static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
539static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
540static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
541static const char* body_depr[] = { "background", "bgcolor", "text",
542 "link", "vlink", "alink", NULL } ;
543static const char* button_attrs[] = { ATTRS, "name", "value", "type",
544 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
545
546
547static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
548static const char* col_elt[] = { "col", NULL } ;
549static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
550static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
551static const char* dl_contents[] = { "dt", "dd", NULL } ;
552static const char* compact_attr[] = { "compact", NULL } ;
553static const char* label_attr[] = { "label", NULL } ;
554static const char* fieldset_contents[] = { FLOW, "legend" } ;
555static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
556static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
557static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
558static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
559static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
560static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
561static const char* head_attrs[] = { I18N, "profile", NULL } ;
562static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
563static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
564static const char* version_attr[] = { "version", NULL } ;
565static const char* html_content[] = { "head", "body", "frameset", NULL } ;
566static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
567static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
568static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
569static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
570static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
571static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
572static const char* align_attr[] = { "align", NULL } ;
573static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
574static const char* map_contents[] = { BLOCK, "area", NULL } ;
575static const char* name_attr[] = { "name", NULL } ;
576static const char* action_attr[] = { "action", NULL } ;
577static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
578static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
579static const char* content_attr[] = { "content", NULL } ;
580static const char* type_attr[] = { "type", NULL } ;
581static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
582static const char* object_contents[] = { FLOW, "param", NULL } ;
583static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
584static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
585static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
586static const char* option_elt[] = { "option", NULL } ;
587static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
588static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
589static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
590static const char* width_attr[] = { "width", NULL } ;
591static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
592static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
593static const char* language_attr[] = { "language", NULL } ;
594static const char* select_content[] = { "optgroup", "option", NULL } ;
595static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
596static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
597static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
598static const char* table_depr[] = { "align", "bgcolor", NULL } ;
599static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
600static const char* tr_elt[] = { "tr", NULL } ;
601static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
602static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
603static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
604static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
605static const char* tr_contents[] = { "th", "td", NULL } ;
606static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
607static const char* li_elt[] = { "li", NULL } ;
608static const char* ul_depr[] = { "type", "compact", NULL} ;
609static const char* dir_attr[] = { "dir", NULL} ;
610
611#define DECL (const char**)
612
Daniel Veillard22090732001-07-16 00:06:07 +0000613static const htmlElemDesc
614html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000615{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
616 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
617},
618{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
619 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
620},
621{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
622 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
623},
624{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
625 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
626},
627{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
628 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
629},
630{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
631 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
632},
633{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
634 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
635},
636{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
637 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
638},
639{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
640 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
641},
642{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
643 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
644},
645{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
646 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
647},
648{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
649 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
650},
651{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
652 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
653},
654{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
655 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
656},
657{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
658 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
659},
660{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
661 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
662},
663{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
664 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
665},
666{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
667 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
668},
669{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
670 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
671},
672{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
673 EMPTY , NULL , DECL col_attrs , NULL, NULL
674},
675{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
676 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
677},
678{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
679 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
680},
681{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
682 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
683},
684{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
685 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
686},
687{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
688 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
689},
690{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
691 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
692},
693{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
694 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
695},
696{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
697 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
698},
699{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
700 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
701},
702{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
703 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
704},
705{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
706 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
707},
708{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
709 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
710},
711{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
712 EMPTY, NULL, NULL, DECL frame_attrs, NULL
713},
714{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
715 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
716},
717{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
718 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
719},
720{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
721 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
722},
723{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
724 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
725},
726{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
727 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
728},
729{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
730 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
731},
732{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
736 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
737},
738{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
739 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
740},
741{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
742 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
743},
744{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
745 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
746},
747{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
748 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
749},
750{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
751 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
752},
753{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
754 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
755},
756{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
757 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
758},
759{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
760 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
761},
762{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
763 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
764},
765{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
766 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
767},
768{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
769 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
770},
771{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
772 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
773},
774{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
775 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
776},
777{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
778 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
779},
780{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
781 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
782},
783{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
784 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
785},
786{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
787 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
788},
789{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
790 DECL html_flow, "div", DECL html_attrs, NULL, NULL
791},
792{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
793 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
794},
795{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
796 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
797},
798{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
799 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
800},
801{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
802 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
803},
804{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
805 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
806},
807{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
808 EMPTY, NULL, DECL param_attrs, NULL, name_attr
809},
810{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
811 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
812},
813{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
814 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
815},
816{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
817 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
818},
819{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
820 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
821},
822{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
823 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
824},
825{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
826 DECL select_content, NULL, DECL select_attrs, NULL, NULL
827},
828{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
829 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
830},
831{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
832 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
833},
834{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
835 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
836},
837{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
838 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
839},
840{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
841 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
842},
843{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
847 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848},
849{ "table", 0, 0, 0, 0, 0, 0, 0, "",
850 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
851},
852{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
853 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
854},
855{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
856 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
857},
858{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
859 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
860},
861{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
862 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
863},
864{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
865 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
866},
867{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
871 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
872},
873{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
874 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
875},
876{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
877 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
878},
879{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
880 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
881},
882{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
883 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
884},
885{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
886 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
887}
Owen Taylor3473f882001-02-23 17:55:21 +0000888};
889
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups. Within a group
 * the first string is the name of a newly opened tag and the following
 * strings are the names of open elements which that tag closes implicitly.
 * A NULL found where a group would start ends the whole table.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL
};
950
/*
 * NULL-terminated list of HTML elements which are not supposed to hold
 * character data directly: when one of them is the current element and
 * text has to be inserted, a "p" element is implied first.
 *
 * TODO: extend this list by reading the HTML SGML DTD on implied
 *       paragraphs.
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
964
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * NOTE: when adding new ones, check htmlIsScriptAttribute() since
 *       it assumes every name starts with 'on'.
 *
 * The table has no NULL terminator: users must bound their iteration
 * with sizeof(htmlScriptAttributes) / sizeof(htmlScriptAttributes[0]).
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
990
/*
 * Priorities used by the HTML parser to cope with broken pages.
 *
 * Each element gets an "end tag" priority; an end tag is only allowed
 * to close open elements whose priority is lower than or equal to its
 * own. This is how the parser decides what to do with extra end tags.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* priority of the end tag of that element */
} elementPriority;

/*
 * The table is scanned linearly; the final entry with a NULL name both
 * terminates the scan and carries the priority of every unlisted element.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001018
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group, unused slots are NULL. Filled in lazily by
 * htmlInitAutoClose(); the flag below records that this has been done.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1021
1022/************************************************************************
1023 * *
1024 * functions to handle HTML specific data *
1025 * *
1026 ************************************************************************/
1027
1028/**
1029 * htmlInitAutoClose:
1030 *
1031 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1032 * This is not reentrant. Call xmlInitParser() once before processing in
1033 * case of use in multithreaded programs.
1034 */
1035void
1036htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001037 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001038
1039 if (htmlStartCloseIndexinitialized) return;
1040
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001041 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1042 indx = 0;
1043 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1044 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001045 while (htmlStartClose[i] != NULL) i++;
1046 i++;
1047 }
1048 htmlStartCloseIndexinitialized = 1;
1049}
1050
1051/**
1052 * htmlTagLookup:
1053 * @tag: The tag name in lowercase
1054 *
1055 * Lookup the HTML tag in the ElementTable
1056 *
1057 * Returns the related htmlElemDescPtr or NULL if not found.
1058 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001059const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001060htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001061 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001062
1063 for (i = 0; i < (sizeof(html40ElementTable) /
1064 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001065 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001066 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001067 }
1068 return(NULL);
1069}
1070
1071/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001072 * htmlGetEndPriority:
1073 * @name: The name of the element to look up the priority for.
1074 *
1075 * Return value: The "endtag" priority.
1076 **/
1077static int
1078htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001079 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001080
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001081 while ((htmlEndPriority[i].name != NULL) &&
1082 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1083 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001084
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001085 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001086}
1087
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001088
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001089/**
Owen Taylor3473f882001-02-23 17:55:21 +00001090 * htmlCheckAutoClose:
1091 * @newtag: The new tag name
1092 * @oldtag: The old tag name
1093 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001094 * Checks whether the new tag is one of the registered valid tags for
1095 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001096 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1097 *
1098 * Returns 0 if no, 1 if yes.
1099 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001100static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001101htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1102{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001103 int i, indx;
1104 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001105
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001106 if (htmlStartCloseIndexinitialized == 0)
1107 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001108
1109 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001110 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001111 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001112 if (closed == NULL)
1113 return (0);
1114 if (xmlStrEqual(BAD_CAST * closed, newtag))
1115 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001116 }
1117
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001118 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001119 i++;
1120 while (htmlStartClose[i] != NULL) {
1121 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001122 return (1);
1123 }
1124 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001125 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001126 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001127}
1128
1129/**
1130 * htmlAutoCloseOnClose:
1131 * @ctxt: an HTML parser context
1132 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001133 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001134 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001135 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001136 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001137static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001138htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1139{
1140 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001141 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001142
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001143 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001144
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001145 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001146
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001147 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1148 break;
1149 /*
1150 * A missplaced endtag can only close elements with lower
1151 * or equal priority, so if we find an element with higher
1152 * priority before we find an element with
1153 * matching name, we just ignore this endtag
1154 */
1155 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1156 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001157 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001158 if (i < 0)
1159 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001160
1161 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001162 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001163 if ((info != NULL) && (info->endTag == 3)) {
1164 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1165 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001166 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001167 }
1168 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1169 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001170 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001171 }
1172}
1173
1174/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001175 * htmlAutoCloseOnEnd:
1176 * @ctxt: an HTML parser context
1177 *
1178 * Close all remaining tags at the end of the stream
1179 */
1180static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001181htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1182{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001183 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001184
William M. Brack899e64a2003-09-26 18:03:42 +00001185 if (ctxt->nameNr == 0)
1186 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001188 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1189 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001190 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001191 }
1192}
1193
1194/**
Owen Taylor3473f882001-02-23 17:55:21 +00001195 * htmlAutoClose:
1196 * @ctxt: an HTML parser context
1197 * @newtag: The new tag name or NULL
1198 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001199 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001200 * The list is kept in htmlStartClose array. This function is
1201 * called when a new tag has been detected and generates the
1202 * appropriates closes if possible/needed.
1203 * If newtag is NULL this mean we are at the end of the resource
1204 * and we should check
1205 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001206static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001207htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1208{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001209 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001210 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001211 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1212 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001213 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001214 }
1215 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001216 htmlAutoCloseOnEnd(ctxt);
1217 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001218 }
1219 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001220 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1221 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1222 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001223 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1224 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001225 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001226 }
Owen Taylor3473f882001-02-23 17:55:21 +00001227}
1228
1229/**
1230 * htmlAutoCloseTag:
1231 * @doc: the HTML document
1232 * @name: The tag name
1233 * @elem: the HTML element
1234 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001235 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001236 * The list is kept in htmlStartClose array. This function checks
1237 * if the element or one of it's children would autoclose the
1238 * given tag.
1239 *
1240 * Returns 1 if autoclose, 0 otherwise
1241 */
1242int
1243htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1244 htmlNodePtr child;
1245
1246 if (elem == NULL) return(1);
1247 if (xmlStrEqual(name, elem->name)) return(0);
1248 if (htmlCheckAutoClose(elem->name, name)) return(1);
1249 child = elem->children;
1250 while (child != NULL) {
1251 if (htmlAutoCloseTag(doc, name, child)) return(1);
1252 child = child->next;
1253 }
1254 return(0);
1255}
1256
1257/**
1258 * htmlIsAutoClosed:
1259 * @doc: the HTML document
1260 * @elem: the HTML element
1261 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001262 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001263 * The list is kept in htmlStartClose array. This function checks
1264 * if a tag is autoclosed by one of it's child
1265 *
1266 * Returns 1 if autoclosed, 0 otherwise
1267 */
1268int
1269htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1270 htmlNodePtr child;
1271
1272 if (elem == NULL) return(1);
1273 child = elem->children;
1274 while (child != NULL) {
1275 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1276 child = child->next;
1277 }
1278 return(0);
1279}
1280
1281/**
1282 * htmlCheckImplied:
1283 * @ctxt: an HTML parser context
1284 * @newtag: The new tag name
1285 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001286 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001287 * called when a new tag has been detected and generates the
1288 * appropriates implicit tags if missing
1289 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001290static void
Owen Taylor3473f882001-02-23 17:55:21 +00001291htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1292 if (!htmlOmittedDefaultValue)
1293 return;
1294 if (xmlStrEqual(newtag, BAD_CAST"html"))
1295 return;
1296 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001298 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1299 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1300 }
1301 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1302 return;
1303 if ((ctxt->nameNr <= 1) &&
1304 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1305 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1306 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1307 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1308 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1309 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1310 /*
1311 * dropped OBJECT ... i you put it first BODY will be
1312 * assumed !
1313 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001314 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001315 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1316 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1317 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1318 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1319 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1320 int i;
1321 for (i = 0;i < ctxt->nameNr;i++) {
1322 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1323 return;
1324 }
1325 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1326 return;
1327 }
1328 }
1329
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001330 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001331 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1332 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1333 }
1334}
1335
1336/**
1337 * htmlCheckParagraph
1338 * @ctxt: an HTML parser context
1339 *
1340 * Check whether a p element need to be implied before inserting
1341 * characters in the current element.
1342 *
1343 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1344 * in case of error.
1345 */
1346
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001347static int
Owen Taylor3473f882001-02-23 17:55:21 +00001348htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1349 const xmlChar *tag;
1350 int i;
1351
1352 if (ctxt == NULL)
1353 return(-1);
1354 tag = ctxt->name;
1355 if (tag == NULL) {
1356 htmlAutoClose(ctxt, BAD_CAST"p");
1357 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001358 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001359 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1360 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1361 return(1);
1362 }
1363 if (!htmlOmittedDefaultValue)
1364 return(0);
1365 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1366 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001367 htmlAutoClose(ctxt, BAD_CAST"p");
1368 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001369 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001370 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1371 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1372 return(1);
1373 }
1374 }
1375 return(0);
1376}
1377
1378/**
1379 * htmlIsScriptAttribute:
1380 * @name: an attribute name
1381 *
1382 * Check if an attribute is of content type Script
1383 *
1384 * Returns 1 is the attribute is a script 0 otherwise
1385 */
1386int
1387htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001388 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001389
1390 if (name == NULL)
1391 return(0);
1392 /*
1393 * all script attributes start with 'on'
1394 */
1395 if ((name[0] != 'o') || (name[1] != 'n'))
1396 return(0);
1397 for (i = 0;
1398 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1399 i++) {
1400 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1401 return(1);
1402 }
1403 return(0);
1404}
1405
1406/************************************************************************
1407 * *
1408 * The list of HTML predefined entities *
1409 * *
1410 ************************************************************************/
1411
1412
Daniel Veillard22090732001-07-16 00:06:07 +00001413static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001414/*
1415 * the 4 absolute ones, plus apostrophe.
1416 */
1417{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1418{ 38, "amp", "ampersand, U+0026 ISOnum" },
1419{ 39, "apos", "single quote" },
1420{ 60, "lt", "less-than sign, U+003C ISOnum" },
1421{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1422
1423/*
1424 * A bunch still in the 128-255 range
1425 * Replacing them depend really on the charset used.
1426 */
1427{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1428{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1429{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1430{ 163, "pound","pound sign, U+00A3 ISOnum" },
1431{ 164, "curren","currency sign, U+00A4 ISOnum" },
1432{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1433{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1434{ 167, "sect", "section sign, U+00A7 ISOnum" },
1435{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1436{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1437{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1438{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1439{ 172, "not", "not sign, U+00AC ISOnum" },
1440{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1441{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1442{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1443{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1444{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1445{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1446{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1447{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1448{ 181, "micro","micro sign, U+00B5 ISOnum" },
1449{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1450{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1451{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1452{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1453{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1454{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1455{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1456{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1457{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1458{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1459{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1460{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1461{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1462{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1463{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1464{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1465{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1466{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1467{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1468{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1469{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1470{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1471{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1472{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1473{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1474{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1475{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1476{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1477{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1478{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1479{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1480{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1481{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1482{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1483{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1484{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1485{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1486{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1487{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1488{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1489{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1490{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1491{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1492{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1493{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1494{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1495{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1496{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1497{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1498{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1499{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1500{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1501{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1502{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1503{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1504{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1505{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1506{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1507{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1508{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1509{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1510{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1511{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1512{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1513{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1514{ 247, "divide","division sign, U+00F7 ISOnum" },
1515{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1516{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1517{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1518{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1519{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1520{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1521{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1522{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1523
1524{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1525{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1526{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1527{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1528{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1529
1530/*
1531 * Anything below should really be kept as entities references
1532 */
1533{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1534
1535{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1536{ 732, "tilde","small tilde, U+02DC ISOdia" },
1537
1538{ 913, "Alpha","greek capital letter alpha, U+0391" },
1539{ 914, "Beta", "greek capital letter beta, U+0392" },
1540{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1541{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1542{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1543{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1544{ 919, "Eta", "greek capital letter eta, U+0397" },
1545{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1546{ 921, "Iota", "greek capital letter iota, U+0399" },
1547{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001548{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001549{ 924, "Mu", "greek capital letter mu, U+039C" },
1550{ 925, "Nu", "greek capital letter nu, U+039D" },
1551{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1552{ 927, "Omicron","greek capital letter omicron, U+039F" },
1553{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1554{ 929, "Rho", "greek capital letter rho, U+03A1" },
1555{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1556{ 932, "Tau", "greek capital letter tau, U+03A4" },
1557{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1558{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1559{ 935, "Chi", "greek capital letter chi, U+03A7" },
1560{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1561{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1562
1563{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1564{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1565{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1566{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1567{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1568{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1569{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1570{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1571{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1572{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1573{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1574{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1575{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1576{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1577{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1578{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1579{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1580{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1581{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1582{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1583{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1584{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1585{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1586{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1587{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1588{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1589{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1590{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1591
1592{ 8194, "ensp", "en space, U+2002 ISOpub" },
1593{ 8195, "emsp", "em space, U+2003 ISOpub" },
1594{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1595{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1596{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1597{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1598{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1599{ 8211, "ndash","en dash, U+2013 ISOpub" },
1600{ 8212, "mdash","em dash, U+2014 ISOpub" },
1601{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1602{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1603{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1604{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1605{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1606{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1607{ 8224, "dagger","dagger, U+2020 ISOpub" },
1608{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1609
1610{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1611{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1612
1613{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1614
1615{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1616{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1617
1618{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1619{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1620
1621{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1622{ 8260, "frasl","fraction slash, U+2044 NEW" },
1623
1624{ 8364, "euro", "euro sign, U+20AC NEW" },
1625
1626{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1627{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1628{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1629{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1630{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1631{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1632{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1633{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1634{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1635{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1636{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1637{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1638{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1639{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1640{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1641{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1642
1643{ 8704, "forall","for all, U+2200 ISOtech" },
1644{ 8706, "part", "partial differential, U+2202 ISOtech" },
1645{ 8707, "exist","there exists, U+2203 ISOtech" },
1646{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1647{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1648{ 8712, "isin", "element of, U+2208 ISOtech" },
1649{ 8713, "notin","not an element of, U+2209 ISOtech" },
1650{ 8715, "ni", "contains as member, U+220B ISOtech" },
1651{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001652{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001653{ 8722, "minus","minus sign, U+2212 ISOtech" },
1654{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1655{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1656{ 8733, "prop", "proportional to, U+221D ISOtech" },
1657{ 8734, "infin","infinity, U+221E ISOtech" },
1658{ 8736, "ang", "angle, U+2220 ISOamso" },
1659{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1660{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1661{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1662{ 8746, "cup", "union = cup, U+222A ISOtech" },
1663{ 8747, "int", "integral, U+222B ISOtech" },
1664{ 8756, "there4","therefore, U+2234 ISOtech" },
1665{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1666{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1667{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1668{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1669{ 8801, "equiv","identical to, U+2261 ISOtech" },
1670{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1671{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1672{ 8834, "sub", "subset of, U+2282 ISOtech" },
1673{ 8835, "sup", "superset of, U+2283 ISOtech" },
1674{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1675{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1676{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1677{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1678{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1679{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1680{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1681{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1682{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1683{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1684{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1685{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1686{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1687{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1688
1689{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1690{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1691{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1692{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1693
1694};
1695
1696/************************************************************************
1697 * *
1698 * Commodity functions to handle entities *
1699 * *
1700 ************************************************************************/
1701
/*
 * growBuffer:
 * @buffer: name of the xmlChar * variable holding the buffer to enlarge
 *
 * Macro doubling the capacity of @buffer. It expects a companion variable
 * named <buffer>_size holding the current capacity, and a variable named
 * ctxt for error reporting, both visible where the macro is expanded.
 *
 * If the reallocation fails, the error is reported, the old buffer is
 * freed and the enclosing function returns NULL, so the macro can only be
 * used inside functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {							\
	buffer = tmp;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1716
1717/**
1718 * htmlEntityLookup:
1719 * @name: the entity name
1720 *
1721 * Lookup the given entity in EntitiesTable
1722 *
1723 * TODO: the linear scan is really ugly, an hash table is really needed.
1724 *
1725 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1726 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001727const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001728htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001729 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001730
1731 for (i = 0;i < (sizeof(html40EntitiesTable)/
1732 sizeof(html40EntitiesTable[0]));i++) {
1733 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001734 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001735 }
1736 }
1737 return(NULL);
1738}
1739
1740/**
1741 * htmlEntityValueLookup:
1742 * @value: the entity's unicode value
1743 *
1744 * Lookup the given entity in EntitiesTable
1745 *
1746 * TODO: the linear scan is really ugly, an hash table is really needed.
1747 *
1748 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1749 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001750const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001751htmlEntityValueLookup(unsigned int value) {
1752 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001753
1754 for (i = 0;i < (sizeof(html40EntitiesTable)/
1755 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001756 if (html40EntitiesTable[i].value >= value) {
1757 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001758 break;
William M. Brack78637da2003-07-31 14:47:38 +00001759 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001760 }
Owen Taylor3473f882001-02-23 17:55:21 +00001761 }
1762 return(NULL);
1763}
1764
1765/**
1766 * UTF8ToHtml:
1767 * @out: a pointer to an array of bytes to store the result
1768 * @outlen: the length of @out
1769 * @in: a pointer to an array of UTF-8 chars
1770 * @inlen: the length of @in
1771 *
1772 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1773 * plus HTML entities block of chars out.
1774 *
1775 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1776 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001777 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001778 * The value of @outlen after return is the number of octets consumed.
1779 */
1780int
1781UTF8ToHtml(unsigned char* out, int *outlen,
1782 const unsigned char* in, int *inlen) {
1783 const unsigned char* processed = in;
1784 const unsigned char* outend;
1785 const unsigned char* outstart = out;
1786 const unsigned char* instart = in;
1787 const unsigned char* inend;
1788 unsigned int c, d;
1789 int trailing;
1790
1791 if (in == NULL) {
1792 /*
1793 * initialization nothing to do
1794 */
1795 *outlen = 0;
1796 *inlen = 0;
1797 return(0);
1798 }
1799 inend = in + (*inlen);
1800 outend = out + (*outlen);
1801 while (in < inend) {
1802 d = *in++;
1803 if (d < 0x80) { c= d; trailing= 0; }
1804 else if (d < 0xC0) {
1805 /* trailing byte in leading position */
1806 *outlen = out - outstart;
1807 *inlen = processed - instart;
1808 return(-2);
1809 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1810 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1811 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1812 else {
1813 /* no chance for this in Ascii */
1814 *outlen = out - outstart;
1815 *inlen = processed - instart;
1816 return(-2);
1817 }
1818
1819 if (inend - in < trailing) {
1820 break;
1821 }
1822
1823 for ( ; trailing; trailing--) {
1824 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1825 break;
1826 c <<= 6;
1827 c |= d & 0x3F;
1828 }
1829
1830 /* assertion: c is a single UTF-4 value */
1831 if (c < 0x80) {
1832 if (out + 1 >= outend)
1833 break;
1834 *out++ = c;
1835 } else {
1836 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001837 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001838
1839 /*
1840 * Try to lookup a predefined HTML entity for it
1841 */
1842
1843 ent = htmlEntityValueLookup(c);
1844 if (ent == NULL) {
1845 /* no chance for this in Ascii */
1846 *outlen = out - outstart;
1847 *inlen = processed - instart;
1848 return(-2);
1849 }
1850 len = strlen(ent->name);
1851 if (out + 2 + len >= outend)
1852 break;
1853 *out++ = '&';
1854 memcpy(out, ent->name, len);
1855 out += len;
1856 *out++ = ';';
1857 }
1858 processed = in;
1859 }
1860 *outlen = out - outstart;
1861 *inlen = processed - instart;
1862 return(0);
1863}
1864
1865/**
1866 * htmlEncodeEntities:
1867 * @out: a pointer to an array of bytes to store the result
1868 * @outlen: the length of @out
1869 * @in: a pointer to an array of UTF-8 chars
1870 * @inlen: the length of @in
1871 * @quoteChar: the quote character to escape (' or ") or zero.
1872 *
1873 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1874 * plus HTML entities block of chars out.
1875 *
1876 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1877 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001878 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001879 * The value of @outlen after return is the number of octets consumed.
1880 */
1881int
1882htmlEncodeEntities(unsigned char* out, int *outlen,
1883 const unsigned char* in, int *inlen, int quoteChar) {
1884 const unsigned char* processed = in;
1885 const unsigned char* outend = out + (*outlen);
1886 const unsigned char* outstart = out;
1887 const unsigned char* instart = in;
1888 const unsigned char* inend = in + (*inlen);
1889 unsigned int c, d;
1890 int trailing;
1891
1892 while (in < inend) {
1893 d = *in++;
1894 if (d < 0x80) { c= d; trailing= 0; }
1895 else if (d < 0xC0) {
1896 /* trailing byte in leading position */
1897 *outlen = out - outstart;
1898 *inlen = processed - instart;
1899 return(-2);
1900 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1901 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1902 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1903 else {
1904 /* no chance for this in Ascii */
1905 *outlen = out - outstart;
1906 *inlen = processed - instart;
1907 return(-2);
1908 }
1909
1910 if (inend - in < trailing)
1911 break;
1912
1913 while (trailing--) {
1914 if (((d= *in++) & 0xC0) != 0x80) {
1915 *outlen = out - outstart;
1916 *inlen = processed - instart;
1917 return(-2);
1918 }
1919 c <<= 6;
1920 c |= d & 0x3F;
1921 }
1922
1923 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001924 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1925 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001926 if (out >= outend)
1927 break;
1928 *out++ = c;
1929 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001930 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001931 const char *cp;
1932 char nbuf[16];
1933 int len;
1934
1935 /*
1936 * Try to lookup a predefined HTML entity for it
1937 */
1938 ent = htmlEntityValueLookup(c);
1939 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001940 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001941 cp = nbuf;
1942 }
1943 else
1944 cp = ent->name;
1945 len = strlen(cp);
1946 if (out + 2 + len > outend)
1947 break;
1948 *out++ = '&';
1949 memcpy(out, cp, len);
1950 out += len;
1951 *out++ = ';';
1952 }
1953 processed = in;
1954 }
1955 *outlen = out - outstart;
1956 *inlen = processed - instart;
1957 return(0);
1958}
1959
Owen Taylor3473f882001-02-23 17:55:21 +00001960/************************************************************************
1961 * *
1962 * Commodity functions to handle streams *
1963 * *
1964 ************************************************************************/
1965
1966/**
Owen Taylor3473f882001-02-23 17:55:21 +00001967 * htmlNewInputStream:
1968 * @ctxt: an HTML parser context
1969 *
1970 * Create a new input stream structure
1971 * Returns the new input stream or NULL
1972 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001973static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001974htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1975 htmlParserInputPtr input;
1976
1977 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1978 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001979 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001980 return(NULL);
1981 }
1982 memset(input, 0, sizeof(htmlParserInput));
1983 input->filename = NULL;
1984 input->directory = NULL;
1985 input->base = NULL;
1986 input->cur = NULL;
1987 input->buf = NULL;
1988 input->line = 1;
1989 input->col = 1;
1990 input->buf = NULL;
1991 input->free = NULL;
1992 input->version = NULL;
1993 input->consumed = 0;
1994 input->length = 0;
1995 return(input);
1996}
1997
1998
1999/************************************************************************
2000 * *
2001 * Commodity functions, cleanup needed ? *
2002 * *
2003 ************************************************************************/
/*
 * allowPCData:
 *
 * All the tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a",
    "abbr",
    "acronym",
    "address",
    "applet",
    "b",
    "bdo",
    "big",
    "blockquote",
    "body",
    "button",
    "caption",
    "center",
    "cite",
    "code",
    "dd",
    "del",
    "dfn",
    "div",
    "dt",
    "em",
    "font",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "i",
    "iframe",
    "ins",
    "kbd",
    "label",
    "legend",
    "li",
    "noframes",
    "noscript",
    "object",
    "p",
    "pre",
    "q",
    "s",
    "samp",
    "small",
    "span",
    "strike",
    "strong",
    "td",
    "th",
    "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002018
2019/**
2020 * areBlanks:
2021 * @ctxt: an HTML parser context
2022 * @str: a xmlChar *
2023 * @len: the size of @str
2024 *
2025 * Is this a sequence of blank chars that one can ignore ?
2026 *
2027 * Returns 1 if ignorable 0 otherwise.
2028 */
2029
2030static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002031 unsigned int i;
2032 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002033 xmlNodePtr lastChild;
2034
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002035 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002036 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002037
2038 if (CUR == 0) return(1);
2039 if (CUR != '<') return(0);
2040 if (ctxt->name == NULL)
2041 return(1);
2042 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2043 return(1);
2044 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2045 return(1);
2046 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2047 return(1);
2048 if (ctxt->node == NULL) return(0);
2049 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002050 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2051 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002052 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002053 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2054 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002055 /* keep ws in constructs like ...<b> </b>...
2056 for all tags "b" allowing PCDATA */
2057 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2058 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2059 return(0);
2060 }
2061 }
Owen Taylor3473f882001-02-23 17:55:21 +00002062 } else if (xmlNodeIsText(lastChild)) {
2063 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002064 } else {
2065 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2066 for all tags "p" allowing PCDATA */
2067 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2068 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2069 return(0);
2070 }
2071 }
Owen Taylor3473f882001-02-23 17:55:21 +00002072 }
2073 return(1);
2074}
2075
2076/**
Owen Taylor3473f882001-02-23 17:55:21 +00002077 * htmlNewDocNoDtD:
2078 * @URI: URI for the dtd, or NULL
2079 * @ExternalID: the external ID of the DTD, or NULL
2080 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002081 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2082 * are NULL
2083 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002084 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002085 */
2086htmlDocPtr
2087htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2088 xmlDocPtr cur;
2089
2090 /*
2091 * Allocate a new document and fill the fields.
2092 */
2093 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2094 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002095 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002096 return(NULL);
2097 }
2098 memset(cur, 0, sizeof(xmlDoc));
2099
2100 cur->type = XML_HTML_DOCUMENT_NODE;
2101 cur->version = NULL;
2102 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002103 cur->doc = cur;
2104 cur->name = NULL;
2105 cur->children = NULL;
2106 cur->extSubset = NULL;
2107 cur->oldNs = NULL;
2108 cur->encoding = NULL;
2109 cur->standalone = 1;
2110 cur->compression = 0;
2111 cur->ids = NULL;
2112 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002113 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002114 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002115 if ((ExternalID != NULL) ||
2116 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002117 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002118 return(cur);
2119}
2120
2121/**
2122 * htmlNewDoc:
2123 * @URI: URI for the dtd, or NULL
2124 * @ExternalID: the external ID of the DTD, or NULL
2125 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002126 * Creates a new HTML document
2127 *
Owen Taylor3473f882001-02-23 17:55:21 +00002128 * Returns a new document
2129 */
2130htmlDocPtr
2131htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2132 if ((URI == NULL) && (ExternalID == NULL))
2133 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002134 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2135 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002136
2137 return(htmlNewDocNoDtD(URI, ExternalID));
2138}
2139
2140
2141/************************************************************************
2142 * *
2143 * The parser itself *
2144 * Relates to http://www.w3.org/TR/html40 *
2145 * *
2146 ************************************************************************/
2147
2148/************************************************************************
2149 * *
2150 * The parser itself *
2151 * *
2152 ************************************************************************/
2153
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002154static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002155
Owen Taylor3473f882001-02-23 17:55:21 +00002156/**
2157 * htmlParseHTMLName:
2158 * @ctxt: an HTML parser context
2159 *
2160 * parse an HTML tag or attribute name, note that we convert it to lowercase
2161 * since HTML names are not case-sensitive.
2162 *
2163 * Returns the Tag Name parsed or NULL
2164 */
2165
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002166static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002167htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002168 int i = 0;
2169 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2170
William M. Brack76e95df2003-10-18 16:20:14 +00002171 if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002172 (CUR != ':')) return(NULL);
2173
2174 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brack76e95df2003-10-18 16:20:14 +00002175 ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002176 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2177 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2178 else loc[i] = CUR;
2179 i++;
2180
2181 NEXT;
2182 }
2183
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002184 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002185}
2186
2187/**
2188 * htmlParseName:
2189 * @ctxt: an HTML parser context
2190 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002191 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002192 *
2193 * Returns the Name parsed or NULL
2194 */
2195
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002196static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002197htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002198 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002199 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002200 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002201
2202 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002203
2204 /*
2205 * Accelerator for simple ASCII names
2206 */
2207 in = ctxt->input->cur;
2208 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2209 ((*in >= 0x41) && (*in <= 0x5A)) ||
2210 (*in == '_') || (*in == ':')) {
2211 in++;
2212 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2213 ((*in >= 0x41) && (*in <= 0x5A)) ||
2214 ((*in >= 0x30) && (*in <= 0x39)) ||
2215 (*in == '_') || (*in == '-') ||
2216 (*in == ':') || (*in == '.'))
2217 in++;
2218 if ((*in > 0) && (*in < 0x80)) {
2219 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002220 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002221 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002222 ctxt->nbChars += count;
2223 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002224 return(ret);
2225 }
2226 }
2227 return(htmlParseNameComplex(ctxt));
2228}
2229
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002230static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002231htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002232 int len = 0, l;
2233 int c;
2234 int count = 0;
2235
2236 /*
2237 * Handler for more complex cases
2238 */
2239 GROW;
2240 c = CUR_CHAR(l);
2241 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2242 (!IS_LETTER(c) && (c != '_') &&
2243 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002244 return(NULL);
2245 }
2246
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002247 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2248 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2249 (c == '.') || (c == '-') ||
2250 (c == '_') || (c == ':') ||
2251 (IS_COMBINING(c)) ||
2252 (IS_EXTENDER(c)))) {
2253 if (count++ > 100) {
2254 count = 0;
2255 GROW;
2256 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002257 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002258 NEXTL(l);
2259 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002260 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002261 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002262}
2263
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002264
Owen Taylor3473f882001-02-23 17:55:21 +00002265/**
2266 * htmlParseHTMLAttribute:
2267 * @ctxt: an HTML parser context
2268 * @stop: a char stop value
2269 *
2270 * parse an HTML attribute value till the stop (quote), if
2271 * stop is 0 then it stops at the first space
2272 *
2273 * Returns the attribute parsed or NULL
2274 */
2275
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002276static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002277htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2278 xmlChar *buffer = NULL;
2279 int buffer_size = 0;
2280 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002281 const xmlChar *name = NULL;
2282 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002283 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002284
2285 /*
2286 * allocate a translation buffer.
2287 */
2288 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002289 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002290 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002291 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002292 return(NULL);
2293 }
2294 out = buffer;
2295
2296 /*
2297 * Ok loop until we reach one of the ending chars
2298 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002299 while ((CUR != 0) && (CUR != stop)) {
2300 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002301 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002302 if (CUR == '&') {
2303 if (NXT(1) == '#') {
2304 unsigned int c;
2305 int bits;
2306
2307 c = htmlParseCharRef(ctxt);
2308 if (c < 0x80)
2309 { *out++ = c; bits= -6; }
2310 else if (c < 0x800)
2311 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2312 else if (c < 0x10000)
2313 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2314 else
2315 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2316
2317 for ( ; bits >= 0; bits-= 6) {
2318 *out++ = ((c >> bits) & 0x3F) | 0x80;
2319 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002320
2321 if (out - buffer > buffer_size - 100) {
2322 int indx = out - buffer;
2323
2324 growBuffer(buffer);
2325 out = &buffer[indx];
2326 }
Owen Taylor3473f882001-02-23 17:55:21 +00002327 } else {
2328 ent = htmlParseEntityRef(ctxt, &name);
2329 if (name == NULL) {
2330 *out++ = '&';
2331 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002332 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002333
2334 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002335 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002336 }
2337 } else if (ent == NULL) {
2338 *out++ = '&';
2339 cur = name;
2340 while (*cur != 0) {
2341 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002342 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002343
2344 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002345 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002346 }
2347 *out++ = *cur++;
2348 }
Owen Taylor3473f882001-02-23 17:55:21 +00002349 } else {
2350 unsigned int c;
2351 int bits;
2352
2353 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002354 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002355
2356 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002357 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002358 }
2359 c = (xmlChar)ent->value;
2360 if (c < 0x80)
2361 { *out++ = c; bits= -6; }
2362 else if (c < 0x800)
2363 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2364 else if (c < 0x10000)
2365 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2366 else
2367 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2368
2369 for ( ; bits >= 0; bits-= 6) {
2370 *out++ = ((c >> bits) & 0x3F) | 0x80;
2371 }
Owen Taylor3473f882001-02-23 17:55:21 +00002372 }
2373 }
2374 } else {
2375 unsigned int c;
2376 int bits, l;
2377
2378 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002379 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002380
2381 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002382 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002383 }
2384 c = CUR_CHAR(l);
2385 if (c < 0x80)
2386 { *out++ = c; bits= -6; }
2387 else if (c < 0x800)
2388 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2389 else if (c < 0x10000)
2390 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2391 else
2392 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2393
2394 for ( ; bits >= 0; bits-= 6) {
2395 *out++ = ((c >> bits) & 0x3F) | 0x80;
2396 }
2397 NEXT;
2398 }
2399 }
2400 *out++ = 0;
2401 return(buffer);
2402}
2403
2404/**
Owen Taylor3473f882001-02-23 17:55:21 +00002405 * htmlParseEntityRef:
2406 * @ctxt: an HTML parser context
2407 * @str: location to store the entity name
2408 *
2409 * parse an HTML ENTITY references
2410 *
2411 * [68] EntityRef ::= '&' Name ';'
2412 *
2413 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2414 * if non-NULL *str will have to be freed by the caller.
2415 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002416const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002417htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2418 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002419 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002420 *str = NULL;
2421
2422 if (CUR == '&') {
2423 NEXT;
2424 name = htmlParseName(ctxt);
2425 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002426 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2427 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002428 } else {
2429 GROW;
2430 if (CUR == ';') {
2431 *str = name;
2432
2433 /*
2434 * Lookup the entity in the table.
2435 */
2436 ent = htmlEntityLookup(name);
2437 if (ent != NULL) /* OK that's ugly !!! */
2438 NEXT;
2439 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002440 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2441 "htmlParseEntityRef: expecting ';'\n",
2442 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002443 *str = name;
2444 }
2445 }
2446 }
2447 return(ent);
2448}
2449
2450/**
2451 * htmlParseAttValue:
2452 * @ctxt: an HTML parser context
2453 *
2454 * parse a value for an attribute
2455 * Note: the parser won't do substitution of entities here, this
2456 * will be handled later in xmlStringGetNodeList, unless it was
2457 * asked for ctxt->replaceEntities != 0
2458 *
2459 * Returns the AttValue parsed or NULL.
2460 */
2461
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002462static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002463htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2464 xmlChar *ret = NULL;
2465
2466 if (CUR == '"') {
2467 NEXT;
2468 ret = htmlParseHTMLAttribute(ctxt, '"');
2469 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002470 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2471 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002472 } else
2473 NEXT;
2474 } else if (CUR == '\'') {
2475 NEXT;
2476 ret = htmlParseHTMLAttribute(ctxt, '\'');
2477 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002478 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2479 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002480 } else
2481 NEXT;
2482 } else {
2483 /*
2484 * That's an HTMLism, the attribute value may not be quoted
2485 */
2486 ret = htmlParseHTMLAttribute(ctxt, 0);
2487 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002488 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2489 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002490 }
2491 }
2492 return(ret);
2493}
2494
2495/**
2496 * htmlParseSystemLiteral:
2497 * @ctxt: an HTML parser context
2498 *
2499 * parse an HTML Literal
2500 *
2501 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2502 *
2503 * Returns the SystemLiteral parsed or NULL
2504 */
2505
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002506static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002507htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2508 const xmlChar *q;
2509 xmlChar *ret = NULL;
2510
2511 if (CUR == '"') {
2512 NEXT;
2513 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002514 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002515 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002516 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002517 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2518 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002519 } else {
2520 ret = xmlStrndup(q, CUR_PTR - q);
2521 NEXT;
2522 }
2523 } else if (CUR == '\'') {
2524 NEXT;
2525 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002526 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002527 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002528 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002529 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2530 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002531 } else {
2532 ret = xmlStrndup(q, CUR_PTR - q);
2533 NEXT;
2534 }
2535 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002536 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2537 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002538 }
2539
2540 return(ret);
2541}
2542
2543/**
2544 * htmlParsePubidLiteral:
2545 * @ctxt: an HTML parser context
2546 *
2547 * parse an HTML public literal
2548 *
2549 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2550 *
2551 * Returns the PubidLiteral parsed or NULL.
2552 */
2553
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002554static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002555htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2556 const xmlChar *q;
2557 xmlChar *ret = NULL;
2558 /*
2559 * Name ::= (Letter | '_') (NameChar)*
2560 */
2561 if (CUR == '"') {
2562 NEXT;
2563 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002564 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002565 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002566 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2567 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002568 } else {
2569 ret = xmlStrndup(q, CUR_PTR - q);
2570 NEXT;
2571 }
2572 } else if (CUR == '\'') {
2573 NEXT;
2574 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002575 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002576 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002577 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002578 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2579 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002580 } else {
2581 ret = xmlStrndup(q, CUR_PTR - q);
2582 NEXT;
2583 }
2584 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002585 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2586 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002587 }
2588
2589 return(ret);
2590}
2591
2592/**
2593 * htmlParseScript:
2594 * @ctxt: an HTML parser context
2595 *
2596 * parse the content of an HTML SCRIPT or STYLE element
2597 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2598 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2599 * http://www.w3.org/TR/html4/types.html#type-script
2600 * http://www.w3.org/TR/html4/types.html#h-6.15
2601 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2602 *
2603 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2604 * element and the value of intrinsic event attributes. User agents must
2605 * not evaluate script data as HTML markup but instead must pass it on as
2606 * data to a script engine.
2607 * NOTES:
2608 * - The content is passed like CDATA
2609 * - the attributes for style and scripting "onXXX" are also described
2610 * as CDATA but SGML allows entities references in attributes so their
2611 * processing is identical as other attributes
2612 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002613static void
Owen Taylor3473f882001-02-23 17:55:21 +00002614htmlParseScript(htmlParserCtxtPtr ctxt) {
2615 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2616 int nbchar = 0;
2617 xmlChar cur;
2618
2619 SHRINK;
2620 cur = CUR;
William M. Brack76e95df2003-10-18 16:20:14 +00002621 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002622 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2623 (NXT(3) == '-')) {
2624 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2625 if (ctxt->sax->cdataBlock!= NULL) {
2626 /*
2627 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2628 */
2629 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002630 } else if (ctxt->sax->characters != NULL) {
2631 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002632 }
2633 }
2634 nbchar = 0;
2635 htmlParseComment(ctxt);
2636 cur = CUR;
2637 continue;
2638 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002639 /*
2640 * One should break here, the specification is clear:
2641 * Authors should therefore escape "</" within the content.
2642 * Escape mechanisms are specific to each scripting or
2643 * style sheet language.
2644 */
2645 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2646 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2647 break; /* while */
2648 }
2649 buf[nbchar++] = cur;
2650 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2651 if (ctxt->sax->cdataBlock!= NULL) {
2652 /*
2653 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2654 */
2655 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002656 } else if (ctxt->sax->characters != NULL) {
2657 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002658 }
2659 nbchar = 0;
2660 }
2661 NEXT;
2662 cur = CUR;
2663 }
William M. Brack76e95df2003-10-18 16:20:14 +00002664 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002665 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2666 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002667 NEXT;
2668 }
2669
2670 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2671 if (ctxt->sax->cdataBlock!= NULL) {
2672 /*
2673 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2674 */
2675 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002676 } else if (ctxt->sax->characters != NULL) {
2677 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002678 }
2679 }
2680}
2681
2682
2683/**
2684 * htmlParseCharData:
2685 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002686 *
2687 * parse a CharData section.
2688 * if we are within a CDATA section ']]>' marks an end of section.
2689 *
2690 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2691 */
2692
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002693static void
2694htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002695 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2696 int nbchar = 0;
2697 int cur, l;
2698
2699 SHRINK;
2700 cur = CUR_CHAR(l);
2701 while (((cur != '<') || (ctxt->token == '<')) &&
2702 ((cur != '&') || (ctxt->token == '&')) &&
2703 (IS_CHAR(cur))) {
2704 COPY_BUF(l,buf,nbchar,cur);
2705 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2706 /*
2707 * Ok the segment is to be consumed as chars.
2708 */
2709 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2710 if (areBlanks(ctxt, buf, nbchar)) {
2711 if (ctxt->sax->ignorableWhitespace != NULL)
2712 ctxt->sax->ignorableWhitespace(ctxt->userData,
2713 buf, nbchar);
2714 } else {
2715 htmlCheckParagraph(ctxt);
2716 if (ctxt->sax->characters != NULL)
2717 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2718 }
2719 }
2720 nbchar = 0;
2721 }
2722 NEXTL(l);
2723 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002724 if (cur == 0) {
2725 SHRINK;
2726 GROW;
2727 cur = CUR_CHAR(l);
2728 }
Owen Taylor3473f882001-02-23 17:55:21 +00002729 }
2730 if (nbchar != 0) {
2731 /*
2732 * Ok the segment is to be consumed as chars.
2733 */
2734 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2735 if (areBlanks(ctxt, buf, nbchar)) {
2736 if (ctxt->sax->ignorableWhitespace != NULL)
2737 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2738 } else {
2739 htmlCheckParagraph(ctxt);
2740 if (ctxt->sax->characters != NULL)
2741 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2742 }
2743 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002744 } else {
2745 /*
2746 * Loop detection
2747 */
2748 if (cur == 0)
2749 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002750 }
2751}
2752
2753/**
2754 * htmlParseExternalID:
2755 * @ctxt: an HTML parser context
2756 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002757 *
2758 * Parse an External ID or a Public ID
2759 *
Owen Taylor3473f882001-02-23 17:55:21 +00002760 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2761 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2762 *
2763 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2764 *
2765 * Returns the function returns SystemLiteral and in the second
2766 * case publicID receives PubidLiteral, is strict is off
2767 * it is possible to return NULL and have publicID set.
2768 */
2769
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002770static xmlChar *
2771htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002772 xmlChar *URI = NULL;
2773
2774 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2775 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2776 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2777 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002778 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002779 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2780 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002781 }
2782 SKIP_BLANKS;
2783 URI = htmlParseSystemLiteral(ctxt);
2784 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002785 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2786 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002787 }
2788 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2789 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2790 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2791 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002792 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002793 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2794 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002795 }
2796 SKIP_BLANKS;
2797 *publicID = htmlParsePubidLiteral(ctxt);
2798 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002799 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2800 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2801 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002802 }
2803 SKIP_BLANKS;
2804 if ((CUR == '"') || (CUR == '\'')) {
2805 URI = htmlParseSystemLiteral(ctxt);
2806 }
2807 }
2808 return(URI);
2809}
2810
2811/**
2812 * htmlParseComment:
2813 * @ctxt: an HTML parser context
2814 *
2815 * Parse an XML (SGML) comment <!-- .... -->
2816 *
2817 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2818 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002819static void
Owen Taylor3473f882001-02-23 17:55:21 +00002820htmlParseComment(htmlParserCtxtPtr ctxt) {
2821 xmlChar *buf = NULL;
2822 int len;
2823 int size = HTML_PARSER_BUFFER_SIZE;
2824 int q, ql;
2825 int r, rl;
2826 int cur, l;
2827 xmlParserInputState state;
2828
2829 /*
2830 * Check that there is a comment right here.
2831 */
2832 if ((RAW != '<') || (NXT(1) != '!') ||
2833 (NXT(2) != '-') || (NXT(3) != '-')) return;
2834
2835 state = ctxt->instate;
2836 ctxt->instate = XML_PARSER_COMMENT;
2837 SHRINK;
2838 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002839 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002840 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002841 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002842 ctxt->instate = state;
2843 return;
2844 }
2845 q = CUR_CHAR(ql);
2846 NEXTL(ql);
2847 r = CUR_CHAR(rl);
2848 NEXTL(rl);
2849 cur = CUR_CHAR(l);
2850 len = 0;
2851 while (IS_CHAR(cur) &&
2852 ((cur != '>') ||
2853 (r != '-') || (q != '-'))) {
2854 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00002855 xmlChar *tmp;
2856
Owen Taylor3473f882001-02-23 17:55:21 +00002857 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00002858 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2859 if (tmp == NULL) {
2860 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00002861 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002862 ctxt->instate = state;
2863 return;
2864 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00002865 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00002866 }
2867 COPY_BUF(ql,buf,len,q);
2868 q = r;
2869 ql = rl;
2870 r = cur;
2871 rl = l;
2872 NEXTL(l);
2873 cur = CUR_CHAR(l);
2874 if (cur == 0) {
2875 SHRINK;
2876 GROW;
2877 cur = CUR_CHAR(l);
2878 }
2879 }
2880 buf[len] = 0;
2881 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002882 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
2883 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002884 xmlFree(buf);
2885 } else {
2886 NEXT;
2887 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2888 (!ctxt->disableSAX))
2889 ctxt->sax->comment(ctxt->userData, buf);
2890 xmlFree(buf);
2891 }
2892 ctxt->instate = state;
2893}
2894
2895/**
2896 * htmlParseCharRef:
2897 * @ctxt: an HTML parser context
2898 *
2899 * parse Reference declarations
2900 *
2901 * [66] CharRef ::= '&#' [0-9]+ ';' |
2902 * '&#x' [0-9a-fA-F]+ ';'
2903 *
2904 * Returns the value parsed (as an int)
2905 */
2906int
2907htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2908 int val = 0;
2909
2910 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00002911 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002912 SKIP(3);
2913 while (CUR != ';') {
2914 if ((CUR >= '0') && (CUR <= '9'))
2915 val = val * 16 + (CUR - '0');
2916 else if ((CUR >= 'a') && (CUR <= 'f'))
2917 val = val * 16 + (CUR - 'a') + 10;
2918 else if ((CUR >= 'A') && (CUR <= 'F'))
2919 val = val * 16 + (CUR - 'A') + 10;
2920 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002921 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
2922 "htmlParseCharRef: invalid hexadecimal value\n",
2923 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002924 return(0);
2925 }
2926 NEXT;
2927 }
2928 if (CUR == ';')
2929 NEXT;
2930 } else if ((CUR == '&') && (NXT(1) == '#')) {
2931 SKIP(2);
2932 while (CUR != ';') {
2933 if ((CUR >= '0') && (CUR <= '9'))
2934 val = val * 10 + (CUR - '0');
2935 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002936 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
2937 "htmlParseCharRef: invalid decimal value\n",
2938 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002939 return(0);
2940 }
2941 NEXT;
2942 }
2943 if (CUR == ';')
2944 NEXT;
2945 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002946 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
2947 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002948 }
2949 /*
2950 * Check the value IS_CHAR ...
2951 */
2952 if (IS_CHAR(val)) {
2953 return(val);
2954 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002955 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2956 "htmlParseCharRef: invalid xmlChar value %d\n",
2957 val);
Owen Taylor3473f882001-02-23 17:55:21 +00002958 }
2959 return(0);
2960}
2961
2962
2963/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002964 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002965 * @ctxt: an HTML parser context
2966 *
2967 * parse a DOCTYPE declaration
2968 *
2969 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2970 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2971 */
2972
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002973static void
Owen Taylor3473f882001-02-23 17:55:21 +00002974htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002975 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00002976 xmlChar *ExternalID = NULL;
2977 xmlChar *URI = NULL;
2978
2979 /*
2980 * We know that '<!DOCTYPE' has been detected.
2981 */
2982 SKIP(9);
2983
2984 SKIP_BLANKS;
2985
2986 /*
2987 * Parse the DOCTYPE name.
2988 */
2989 name = htmlParseName(ctxt);
2990 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002991 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2992 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
2993 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002994 }
2995 /*
2996 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2997 */
2998
2999 SKIP_BLANKS;
3000
3001 /*
3002 * Check for SystemID and ExternalID
3003 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003004 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003005 SKIP_BLANKS;
3006
3007 /*
3008 * We should be at the end of the DOCTYPE declaration.
3009 */
3010 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003011 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3012 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003013 /* We shouldn't try to resynchronize ... */
3014 }
3015 NEXT;
3016
3017 /*
3018 * Create or update the document accordingly to the DOCTYPE
3019 */
3020 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3021 (!ctxt->disableSAX))
3022 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3023
3024 /*
3025 * Cleanup, since we don't use all those identifiers
3026 */
3027 if (URI != NULL) xmlFree(URI);
3028 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003029}
3030
3031/**
3032 * htmlParseAttribute:
3033 * @ctxt: an HTML parser context
3034 * @value: a xmlChar ** used to store the value of the attribute
3035 *
3036 * parse an attribute
3037 *
3038 * [41] Attribute ::= Name Eq AttValue
3039 *
3040 * [25] Eq ::= S? '=' S?
3041 *
3042 * With namespace:
3043 *
3044 * [NS 11] Attribute ::= QName Eq AttValue
3045 *
3046 * Also the case QName == xmlns:??? is handled independently as a namespace
3047 * definition.
3048 *
3049 * Returns the attribute name, and the value in *value.
3050 */
3051
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003052static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003053htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003054 const xmlChar *name;
3055 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003056
3057 *value = NULL;
3058 name = htmlParseHTMLName(ctxt);
3059 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003060 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3061 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003062 return(NULL);
3063 }
3064
3065 /*
3066 * read the value
3067 */
3068 SKIP_BLANKS;
3069 if (CUR == '=') {
3070 NEXT;
3071 SKIP_BLANKS;
3072 val = htmlParseAttValue(ctxt);
3073 /******
3074 } else {
3075 * TODO : some attribute must have values, some may not
3076 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3077 ctxt->sax->warning(ctxt->userData,
3078 "No value for attribute %s\n", name); */
3079 }
3080
3081 *value = val;
3082 return(name);
3083}
3084
3085/**
3086 * htmlCheckEncoding:
3087 * @ctxt: an HTML parser context
3088 * @attvalue: the attribute value
3089 *
3090 * Checks an http-equiv attribute from a Meta tag to detect
3091 * the encoding
3092 * If a new encoding is detected the parser is switched to decode
3093 * it and pass UTF8
3094 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003095static void
Owen Taylor3473f882001-02-23 17:55:21 +00003096htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3097 const xmlChar *encoding;
3098
3099 if ((ctxt == NULL) || (attvalue == NULL))
3100 return;
3101
3102 /* do not change encoding */
3103 if (ctxt->input->encoding != NULL)
3104 return;
3105
3106 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3107 if (encoding != NULL) {
3108 encoding += 8;
3109 } else {
3110 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3111 if (encoding != NULL)
3112 encoding += 9;
3113 }
3114 if (encoding != NULL) {
3115 xmlCharEncoding enc;
3116 xmlCharEncodingHandlerPtr handler;
3117
3118 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3119
3120 if (ctxt->input->encoding != NULL)
3121 xmlFree((xmlChar *) ctxt->input->encoding);
3122 ctxt->input->encoding = xmlStrdup(encoding);
3123
3124 enc = xmlParseCharEncoding((const char *) encoding);
3125 /*
3126 * registered set of known encodings
3127 */
3128 if (enc != XML_CHAR_ENCODING_ERROR) {
3129 xmlSwitchEncoding(ctxt, enc);
3130 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3131 } else {
3132 /*
3133 * fallback for unknown encodings
3134 */
3135 handler = xmlFindCharEncodingHandler((const char *) encoding);
3136 if (handler != NULL) {
3137 xmlSwitchToEncoding(ctxt, handler);
3138 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3139 } else {
3140 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3141 }
3142 }
3143
3144 if ((ctxt->input->buf != NULL) &&
3145 (ctxt->input->buf->encoder != NULL) &&
3146 (ctxt->input->buf->raw != NULL) &&
3147 (ctxt->input->buf->buffer != NULL)) {
3148 int nbchars;
3149 int processed;
3150
3151 /*
3152 * convert as much as possible to the parser reading buffer.
3153 */
3154 processed = ctxt->input->cur - ctxt->input->base;
3155 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3156 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3157 ctxt->input->buf->buffer,
3158 ctxt->input->buf->raw);
3159 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003160 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3161 "htmlCheckEncoding: encoder error\n",
3162 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003163 }
3164 ctxt->input->base =
3165 ctxt->input->cur = ctxt->input->buf->buffer->content;
3166 }
3167 }
3168}
3169
3170/**
3171 * htmlCheckMeta:
3172 * @ctxt: an HTML parser context
3173 * @atts: the attributes values
3174 *
3175 * Checks an attributes from a Meta tag
3176 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003177static void
Owen Taylor3473f882001-02-23 17:55:21 +00003178htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3179 int i;
3180 const xmlChar *att, *value;
3181 int http = 0;
3182 const xmlChar *content = NULL;
3183
3184 if ((ctxt == NULL) || (atts == NULL))
3185 return;
3186
3187 i = 0;
3188 att = atts[i++];
3189 while (att != NULL) {
3190 value = atts[i++];
3191 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3192 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3193 http = 1;
3194 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3195 content = value;
3196 att = atts[i++];
3197 }
3198 if ((http) && (content != NULL))
3199 htmlCheckEncoding(ctxt, content);
3200
3201}
3202
3203/**
3204 * htmlParseStartTag:
3205 * @ctxt: an HTML parser context
3206 *
3207 * parse a start of tag either for rule element or
3208 * EmptyElement. In both case we don't parse the tag closing chars.
3209 *
3210 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3211 *
3212 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3213 *
3214 * With namespace:
3215 *
3216 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3217 *
3218 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3219 *
3220 */
3221
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003222static void
Owen Taylor3473f882001-02-23 17:55:21 +00003223htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003224 const xmlChar *name;
3225 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003226 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003227 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003228 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003229 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003230 int meta = 0;
3231 int i;
3232
3233 if (CUR != '<') return;
3234 NEXT;
3235
3236 GROW;
3237 name = htmlParseHTMLName(ctxt);
3238 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003239 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3240 "htmlParseStartTag: invalid element name\n",
3241 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003242 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003243 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003244 NEXT;
3245 return;
3246 }
3247 if (xmlStrEqual(name, BAD_CAST"meta"))
3248 meta = 1;
3249
3250 /*
3251 * Check for auto-closure of HTML elements.
3252 */
3253 htmlAutoClose(ctxt, name);
3254
3255 /*
3256 * Check for implied HTML elements.
3257 */
3258 htmlCheckImplied(ctxt, name);
3259
3260 /*
3261 * Avoid html at any level > 0, head at any level != 1
3262 * or any attempt to recurse body
3263 */
3264 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003265 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3266 "htmlParseStartTag: misplaced <html> tag\n",
3267 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003268 return;
3269 }
3270 if ((ctxt->nameNr != 1) &&
3271 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003272 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3273 "htmlParseStartTag: misplaced <head> tag\n",
3274 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003275 return;
3276 }
3277 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003278 int indx;
3279 for (indx = 0;indx < ctxt->nameNr;indx++) {
3280 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003281 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3282 "htmlParseStartTag: misplaced <body> tag\n",
3283 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003284 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3285 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003286 return;
3287 }
3288 }
3289 }
3290
3291 /*
3292 * Now parse the attributes, it ends up with the ending
3293 *
3294 * (S Attribute)* S?
3295 */
3296 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003297 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003298 (CUR != '>') &&
3299 ((CUR != '/') || (NXT(1) != '>'))) {
3300 long cons = ctxt->nbChars;
3301
3302 GROW;
3303 attname = htmlParseAttribute(ctxt, &attvalue);
3304 if (attname != NULL) {
3305
3306 /*
3307 * Well formedness requires at most one declaration of an attribute
3308 */
3309 for (i = 0; i < nbatts;i += 2) {
3310 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003311 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3312 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003313 if (attvalue != NULL)
3314 xmlFree(attvalue);
3315 goto failed;
3316 }
3317 }
3318
3319 /*
3320 * Add the pair to atts
3321 */
3322 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003323 maxatts = 22; /* allow for 10 attrs by default */
3324 atts = (const xmlChar **)
3325 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003326 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003327 htmlErrMemory(ctxt, NULL);
3328 if (attvalue != NULL)
3329 xmlFree(attvalue);
3330 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003331 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003332 ctxt->atts = atts;
3333 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003334 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003335 const xmlChar **n;
3336
Owen Taylor3473f882001-02-23 17:55:21 +00003337 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003338 n = (const xmlChar **) xmlRealloc((void *) atts,
3339 maxatts * sizeof(const xmlChar *));
3340 if (n == NULL) {
3341 htmlErrMemory(ctxt, NULL);
3342 if (attvalue != NULL)
3343 xmlFree(attvalue);
3344 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003345 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003346 atts = n;
3347 ctxt->atts = atts;
3348 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003349 }
3350 atts[nbatts++] = attname;
3351 atts[nbatts++] = attvalue;
3352 atts[nbatts] = NULL;
3353 atts[nbatts + 1] = NULL;
3354 }
3355 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003356 if (attvalue != NULL)
3357 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003358 /* Dump the bogus attribute string up to the next blank or
3359 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003360 while ((IS_CHAR_CH(CUR)) &&
3361 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003362 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003363 NEXT;
3364 }
3365
3366failed:
3367 SKIP_BLANKS;
3368 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003369 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3370 "htmlParseStartTag: problem parsing attributes\n",
3371 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003372 break;
3373 }
3374 }
3375
3376 /*
3377 * Handle specific association to the META tag
3378 */
3379 if (meta)
3380 htmlCheckMeta(ctxt, atts);
3381
3382 /*
3383 * SAX: Start of Element !
3384 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003385 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003386 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3387 if (nbatts != 0)
3388 ctxt->sax->startElement(ctxt->userData, name, atts);
3389 else
3390 ctxt->sax->startElement(ctxt->userData, name, NULL);
3391 }
Owen Taylor3473f882001-02-23 17:55:21 +00003392
3393 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003394 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003395 if (atts[i] != NULL)
3396 xmlFree((xmlChar *) atts[i]);
3397 }
Owen Taylor3473f882001-02-23 17:55:21 +00003398 }
Owen Taylor3473f882001-02-23 17:55:21 +00003399}
3400
3401/**
3402 * htmlParseEndTag:
3403 * @ctxt: an HTML parser context
3404 *
3405 * parse an end of tag
3406 *
3407 * [42] ETag ::= '</' Name S? '>'
3408 *
3409 * With namespace
3410 *
3411 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003412 *
3413 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003414 */
3415
Daniel Veillardf420ac52001-07-04 16:04:09 +00003416static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003417htmlParseEndTag(htmlParserCtxtPtr ctxt)
3418{
3419 const xmlChar *name;
3420 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003421 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003422
3423 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003424 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3425 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003426 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003427 }
3428 SKIP(2);
3429
3430 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003431 if (name == NULL)
3432 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003433
3434 /*
3435 * We should definitely be at the ending "S? '>'" part
3436 */
3437 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003438 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003439 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3440 "End tag : expected '>'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003441 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003442 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003443
3444 /*
3445 * If the name read is not one of the element in the parsing stack
3446 * then return, it's just an error.
3447 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003448 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3449 if (xmlStrEqual(name, ctxt->nameTab[i]))
3450 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003451 }
3452 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003453 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3454 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003455 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003456 }
3457
3458
3459 /*
3460 * Check for auto-closure of HTML elements.
3461 */
3462
3463 htmlAutoCloseOnClose(ctxt, name);
3464
3465 /*
3466 * Well formedness constraints, opening and closing must match.
3467 * With the exception that the autoclose may have popped stuff out
3468 * of the stack.
3469 */
3470 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003471 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003472 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3473 "Opening and ending tag mismatch: %s and %s\n",
3474 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003475 }
3476 }
3477
3478 /*
3479 * SAX: End of Tag
3480 */
3481 oldname = ctxt->name;
3482 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003483 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3484 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003485 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003486 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003487 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003488 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003489 }
3490
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003491 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003492}
3493
3494
3495/**
3496 * htmlParseReference:
3497 * @ctxt: an HTML parser context
3498 *
3499 * parse and handle entity references in content,
3500 * this will end-up in a call to character() since this is either a
3501 * CharRef, or a predefined entity.
3502 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003503static void
Owen Taylor3473f882001-02-23 17:55:21 +00003504htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003505 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003506 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003507 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003508 if (CUR != '&') return;
3509
3510 if (NXT(1) == '#') {
3511 unsigned int c;
3512 int bits, i = 0;
3513
3514 c = htmlParseCharRef(ctxt);
3515 if (c == 0)
3516 return;
3517
3518 if (c < 0x80) { out[i++]= c; bits= -6; }
3519 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3520 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3521 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3522
3523 for ( ; bits >= 0; bits-= 6) {
3524 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3525 }
3526 out[i] = 0;
3527
3528 htmlCheckParagraph(ctxt);
3529 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3530 ctxt->sax->characters(ctxt->userData, out, i);
3531 } else {
3532 ent = htmlParseEntityRef(ctxt, &name);
3533 if (name == NULL) {
3534 htmlCheckParagraph(ctxt);
3535 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3536 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3537 return;
3538 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003539 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003540 htmlCheckParagraph(ctxt);
3541 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3542 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3543 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3544 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3545 }
3546 } else {
3547 unsigned int c;
3548 int bits, i = 0;
3549
3550 c = ent->value;
3551 if (c < 0x80)
3552 { out[i++]= c; bits= -6; }
3553 else if (c < 0x800)
3554 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3555 else if (c < 0x10000)
3556 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3557 else
3558 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3559
3560 for ( ; bits >= 0; bits-= 6) {
3561 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3562 }
3563 out[i] = 0;
3564
3565 htmlCheckParagraph(ctxt);
3566 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3567 ctxt->sax->characters(ctxt->userData, out, i);
3568 }
Owen Taylor3473f882001-02-23 17:55:21 +00003569 }
3570}
3571
3572/**
3573 * htmlParseContent:
3574 * @ctxt: an HTML parser context
3575 * @name: the node name
3576 *
3577 * Parse a content: comment, sub-element, reference or text.
3578 *
3579 */
3580
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003581static void
Owen Taylor3473f882001-02-23 17:55:21 +00003582htmlParseContent(htmlParserCtxtPtr ctxt) {
3583 xmlChar *currentNode;
3584 int depth;
3585
3586 currentNode = xmlStrdup(ctxt->name);
3587 depth = ctxt->nameNr;
3588 while (1) {
3589 long cons = ctxt->nbChars;
3590
3591 GROW;
3592 /*
3593 * Our tag or one of it's parent or children is ending.
3594 */
3595 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003596 if (htmlParseEndTag(ctxt) &&
3597 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3598 if (currentNode != NULL)
3599 xmlFree(currentNode);
3600 return;
3601 }
3602 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003603 }
3604
3605 /*
3606 * Has this node been popped out during parsing of
3607 * the next element
3608 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003609 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3610 (!xmlStrEqual(currentNode, ctxt->name)))
3611 {
Owen Taylor3473f882001-02-23 17:55:21 +00003612 if (currentNode != NULL) xmlFree(currentNode);
3613 return;
3614 }
3615
Daniel Veillardf9533d12001-03-03 10:04:57 +00003616 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3617 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003618 /*
3619 * Handle SCRIPT/STYLE separately
3620 */
3621 htmlParseScript(ctxt);
3622 } else {
3623 /*
3624 * Sometimes DOCTYPE arrives in the middle of the document
3625 */
3626 if ((CUR == '<') && (NXT(1) == '!') &&
3627 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3628 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3629 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3630 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003631 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3632 "Misplaced DOCTYPE declaration\n",
3633 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003634 htmlParseDocTypeDecl(ctxt);
3635 }
3636
3637 /*
3638 * First case : a comment
3639 */
3640 if ((CUR == '<') && (NXT(1) == '!') &&
3641 (NXT(2) == '-') && (NXT(3) == '-')) {
3642 htmlParseComment(ctxt);
3643 }
3644
3645 /*
3646 * Second case : a sub-element.
3647 */
3648 else if (CUR == '<') {
3649 htmlParseElement(ctxt);
3650 }
3651
3652 /*
3653 * Third case : a reference. If if has not been resolved,
3654 * parsing returns it's Name, create the node
3655 */
3656 else if (CUR == '&') {
3657 htmlParseReference(ctxt);
3658 }
3659
3660 /*
3661 * Fourth : end of the resource
3662 */
3663 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003664 htmlAutoCloseOnEnd(ctxt);
3665 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003666 }
3667
3668 /*
3669 * Last case, text. Note that References are handled directly.
3670 */
3671 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003672 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003673 }
3674
3675 if (cons == ctxt->nbChars) {
3676 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003677 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3678 "detected an error in element content\n",
3679 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003680 }
3681 break;
3682 }
3683 }
3684 GROW;
3685 }
3686 if (currentNode != NULL) xmlFree(currentNode);
3687}
3688
3689/**
3690 * htmlParseElement:
3691 * @ctxt: an HTML parser context
3692 *
3693 * parse an HTML element, this is highly recursive
3694 *
3695 * [39] element ::= EmptyElemTag | STag content ETag
3696 *
3697 * [41] Attribute ::= Name Eq AttValue
3698 */
3699
3700void
3701htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003702 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003703 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003704 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003705 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003706 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003707 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003708 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003709
3710 /* Capture start position */
3711 if (ctxt->record_info) {
3712 node_info.begin_pos = ctxt->input->consumed +
3713 (CUR_PTR - ctxt->input->base);
3714 node_info.begin_line = ctxt->input->line;
3715 }
3716
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003717 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003718 htmlParseStartTag(ctxt);
3719 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003720 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3721 (name == NULL)) {
3722 if (CUR == '>')
3723 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003724 return;
3725 }
Owen Taylor3473f882001-02-23 17:55:21 +00003726
3727 /*
3728 * Lookup the info for that element.
3729 */
3730 info = htmlTagLookup(name);
3731 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003732 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3733 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003734 }
3735
3736 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003737 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003738 */
3739 if ((CUR == '/') && (NXT(1) == '>')) {
3740 SKIP(2);
3741 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3742 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003743 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003744 return;
3745 }
3746
3747 if (CUR == '>') {
3748 NEXT;
3749 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003750 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3751 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003752
3753 /*
3754 * end of parsing of this node.
3755 */
3756 if (xmlStrEqual(name, ctxt->name)) {
3757 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003758 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003759 }
3760
3761 /*
3762 * Capture end position and add node
3763 */
3764 if ( currentNode != NULL && ctxt->record_info ) {
3765 node_info.end_pos = ctxt->input->consumed +
3766 (CUR_PTR - ctxt->input->base);
3767 node_info.end_line = ctxt->input->line;
3768 node_info.node = ctxt->node;
3769 xmlParserAddNodeInfo(ctxt, &node_info);
3770 }
3771 return;
3772 }
3773
3774 /*
3775 * Check for an Empty Element from DTD definition
3776 */
3777 if ((info != NULL) && (info->empty)) {
3778 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3779 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003780 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003781 return;
3782 }
3783
3784 /*
3785 * Parse the content of the element:
3786 */
3787 currentNode = xmlStrdup(ctxt->name);
3788 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003789 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003790 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003791 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003792 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003793 if (ctxt->nameNr < depth) break;
3794 }
3795
Owen Taylor3473f882001-02-23 17:55:21 +00003796 /*
3797 * Capture end position and add node
3798 */
3799 if ( currentNode != NULL && ctxt->record_info ) {
3800 node_info.end_pos = ctxt->input->consumed +
3801 (CUR_PTR - ctxt->input->base);
3802 node_info.end_line = ctxt->input->line;
3803 node_info.node = ctxt->node;
3804 xmlParserAddNodeInfo(ctxt, &node_info);
3805 }
William M. Brack76e95df2003-10-18 16:20:14 +00003806 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003807 htmlAutoCloseOnEnd(ctxt);
3808 }
3809
Owen Taylor3473f882001-02-23 17:55:21 +00003810 if (currentNode != NULL)
3811 xmlFree(currentNode);
3812}
3813
3814/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003815 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003816 * @ctxt: an HTML parser context
3817 *
3818 * parse an HTML document (and build a tree if using the standard SAX
3819 * interface).
3820 *
3821 * Returns 0, -1 in case of error. the parser context is augmented
3822 * as a result of the parsing.
3823 */
3824
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003825int
Owen Taylor3473f882001-02-23 17:55:21 +00003826htmlParseDocument(htmlParserCtxtPtr ctxt) {
3827 xmlDtdPtr dtd;
3828
Daniel Veillardd0463562001-10-13 09:15:48 +00003829 xmlInitParser();
3830
Owen Taylor3473f882001-02-23 17:55:21 +00003831 htmlDefaultSAXHandlerInit();
3832 ctxt->html = 1;
3833
3834 GROW;
3835 /*
3836 * SAX: beginning of the document processing.
3837 */
3838 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3839 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3840
3841 /*
3842 * Wipe out everything which is before the first '<'
3843 */
3844 SKIP_BLANKS;
3845 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003846 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3847 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003848 }
3849
3850 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3851 ctxt->sax->startDocument(ctxt->userData);
3852
3853
3854 /*
3855 * Parse possible comments before any content
3856 */
3857 while ((CUR == '<') && (NXT(1) == '!') &&
3858 (NXT(2) == '-') && (NXT(3) == '-')) {
3859 htmlParseComment(ctxt);
3860 SKIP_BLANKS;
3861 }
3862
3863
3864 /*
3865 * Then possibly doc type declaration(s) and more Misc
3866 * (doctypedecl Misc*)?
3867 */
3868 if ((CUR == '<') && (NXT(1) == '!') &&
3869 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3870 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3871 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3872 (UPP(8) == 'E')) {
3873 htmlParseDocTypeDecl(ctxt);
3874 }
3875 SKIP_BLANKS;
3876
3877 /*
3878 * Parse possible comments before any content
3879 */
3880 while ((CUR == '<') && (NXT(1) == '!') &&
3881 (NXT(2) == '-') && (NXT(3) == '-')) {
3882 htmlParseComment(ctxt);
3883 SKIP_BLANKS;
3884 }
3885
3886 /*
3887 * Time to start parsing the tree itself
3888 */
3889 htmlParseContent(ctxt);
3890
3891 /*
3892 * autoclose
3893 */
3894 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003895 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003896
3897
3898 /*
3899 * SAX: end of the document processing.
3900 */
3901 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3902 ctxt->sax->endDocument(ctxt->userData);
3903
3904 if (ctxt->myDoc != NULL) {
3905 dtd = xmlGetIntSubset(ctxt->myDoc);
3906 if (dtd == NULL)
3907 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00003908 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00003909 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3910 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3911 }
3912 if (! ctxt->wellFormed) return(-1);
3913 return(0);
3914}
3915
3916
3917/************************************************************************
3918 * *
3919 * Parser contexts handling *
3920 * *
3921 ************************************************************************/
3922
3923/**
William M. Brackedb65a72004-02-06 07:36:04 +00003924 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00003925 * @ctxt: an HTML parser context
3926 *
3927 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00003928 *
3929 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00003930 */
3931
Daniel Veillardf403d292003-10-05 13:51:35 +00003932static int
Owen Taylor3473f882001-02-23 17:55:21 +00003933htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3934{
3935 htmlSAXHandler *sax;
3936
Daniel Veillardf403d292003-10-05 13:51:35 +00003937 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003938 memset(ctxt, 0, sizeof(htmlParserCtxt));
3939
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003940 ctxt->dict = xmlDictCreate();
3941 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003942 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3943 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003944 }
Owen Taylor3473f882001-02-23 17:55:21 +00003945 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3946 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003947 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3948 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003949 }
3950 else
3951 memset(sax, 0, sizeof(htmlSAXHandler));
3952
3953 /* Allocate the Input stack */
3954 ctxt->inputTab = (htmlParserInputPtr *)
3955 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3956 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003957 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003958 ctxt->inputNr = 0;
3959 ctxt->inputMax = 0;
3960 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003961 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003962 }
3963 ctxt->inputNr = 0;
3964 ctxt->inputMax = 5;
3965 ctxt->input = NULL;
3966 ctxt->version = NULL;
3967 ctxt->encoding = NULL;
3968 ctxt->standalone = -1;
3969 ctxt->instate = XML_PARSER_START;
3970
3971 /* Allocate the Node stack */
3972 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3973 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003974 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003975 ctxt->nodeNr = 0;
3976 ctxt->nodeMax = 0;
3977 ctxt->node = NULL;
3978 ctxt->inputNr = 0;
3979 ctxt->inputMax = 0;
3980 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003981 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003982 }
3983 ctxt->nodeNr = 0;
3984 ctxt->nodeMax = 10;
3985 ctxt->node = NULL;
3986
3987 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003988 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003989 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003990 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003991 ctxt->nameNr = 0;
3992 ctxt->nameMax = 10;
3993 ctxt->name = NULL;
3994 ctxt->nodeNr = 0;
3995 ctxt->nodeMax = 0;
3996 ctxt->node = NULL;
3997 ctxt->inputNr = 0;
3998 ctxt->inputMax = 0;
3999 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004000 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004001 }
4002 ctxt->nameNr = 0;
4003 ctxt->nameMax = 10;
4004 ctxt->name = NULL;
4005
Daniel Veillard092643b2003-09-25 14:29:29 +00004006 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004007 else {
4008 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004009 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004010 }
4011 ctxt->userData = ctxt;
4012 ctxt->myDoc = NULL;
4013 ctxt->wellFormed = 1;
4014 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004015 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004016 ctxt->html = 1;
William M. Brackedb65a72004-02-06 07:36:04 +00004017 ctxt->vctxt.userData = ctxt;
4018 ctxt->vctxt.error = xmlParserValidityError;
4019 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004020 ctxt->record_info = 0;
4021 ctxt->validate = 0;
4022 ctxt->nbChars = 0;
4023 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004024 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004025 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004026 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004027}
4028
4029/**
4030 * htmlFreeParserCtxt:
4031 * @ctxt: an HTML parser context
4032 *
4033 * Free all the memory used by a parser context. However the parsed
4034 * document in ctxt->myDoc is not freed.
4035 */
4036
4037void
4038htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4039{
4040 xmlFreeParserCtxt(ctxt);
4041}
4042
4043/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004044 * htmlNewParserCtxt:
4045 *
4046 * Allocate and initialize a new parser context.
4047 *
4048 * Returns the xmlParserCtxtPtr or NULL
4049 */
4050
4051static htmlParserCtxtPtr
4052htmlNewParserCtxt(void)
4053{
4054 xmlParserCtxtPtr ctxt;
4055
4056 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4057 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004058 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004059 return(NULL);
4060 }
4061 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004062 if (htmlInitParserCtxt(ctxt) < 0) {
4063 htmlFreeParserCtxt(ctxt);
4064 return(NULL);
4065 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004066 return(ctxt);
4067}
4068
4069/**
4070 * htmlCreateMemoryParserCtxt:
4071 * @buffer: a pointer to a char array
4072 * @size: the size of the array
4073 *
4074 * Create a parser context for an HTML in-memory document.
4075 *
4076 * Returns the new parser context or NULL
4077 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004078htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004079htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4080 xmlParserCtxtPtr ctxt;
4081 xmlParserInputPtr input;
4082 xmlParserInputBufferPtr buf;
4083
4084 if (buffer == NULL)
4085 return(NULL);
4086 if (size <= 0)
4087 return(NULL);
4088
4089 ctxt = htmlNewParserCtxt();
4090 if (ctxt == NULL)
4091 return(NULL);
4092
4093 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4094 if (buf == NULL) return(NULL);
4095
4096 input = xmlNewInputStream(ctxt);
4097 if (input == NULL) {
4098 xmlFreeParserCtxt(ctxt);
4099 return(NULL);
4100 }
4101
4102 input->filename = NULL;
4103 input->buf = buf;
4104 input->base = input->buf->buffer->content;
4105 input->cur = input->buf->buffer->content;
4106 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4107
4108 inputPush(ctxt, input);
4109 return(ctxt);
4110}
4111
4112/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004113 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004114 * @cur: a pointer to an array of xmlChar
4115 * @encoding: a free form C string describing the HTML document encoding, or NULL
4116 *
4117 * Create a parser context for an HTML document.
4118 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004119 * TODO: check the need to add encoding handling there
4120 *
Owen Taylor3473f882001-02-23 17:55:21 +00004121 * Returns the new parser context or NULL
4122 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004123static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004124htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004125 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004126 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004127
Daniel Veillard1d995272002-07-22 16:43:32 +00004128 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004129 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004130 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004131 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4132
4133 if (encoding != NULL) {
4134 xmlCharEncoding enc;
4135 xmlCharEncodingHandlerPtr handler;
4136
4137 if (ctxt->input->encoding != NULL)
4138 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004139 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004140
4141 enc = xmlParseCharEncoding(encoding);
4142 /*
4143 * registered set of known encodings
4144 */
4145 if (enc != XML_CHAR_ENCODING_ERROR) {
4146 xmlSwitchEncoding(ctxt, enc);
4147 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004148 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4149 "Unsupported encoding %s\n",
4150 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004151 }
4152 } else {
4153 /*
4154 * fallback for unknown encodings
4155 */
4156 handler = xmlFindCharEncodingHandler((const char *) encoding);
4157 if (handler != NULL) {
4158 xmlSwitchToEncoding(ctxt, handler);
4159 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004160 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4161 "Unsupported encoding %s\n",
4162 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004163 }
4164 }
4165 }
4166 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004167}
4168
Daniel Veillard73b013f2003-09-30 12:36:01 +00004169#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004170/************************************************************************
4171 * *
4172 * Progressive parsing interfaces *
4173 * *
4174 ************************************************************************/
4175
4176/**
4177 * htmlParseLookupSequence:
4178 * @ctxt: an HTML parser context
4179 * @first: the first char to lookup
4180 * @next: the next char to lookup or zero
4181 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004182 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004183 *
4184 * Try to find if a sequence (first, next, third) or just (first next) or
4185 * (first) is available in the input stream.
4186 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4187 * to avoid rescanning sequences of bytes, it DOES change the state of the
4188 * parser, do not use liberally.
4189 * This is basically similar to xmlParseLookupSequence()
4190 *
4191 * Returns the index to the current parsing point if the full sequence
4192 * is available, -1 otherwise.
4193 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004194static int
Owen Taylor3473f882001-02-23 17:55:21 +00004195htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004196 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004197 int base, len;
4198 htmlParserInputPtr in;
4199 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004200 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004201
4202 in = ctxt->input;
4203 if (in == NULL) return(-1);
4204 base = in->cur - in->base;
4205 if (base < 0) return(-1);
4206 if (ctxt->checkIndex > base)
4207 base = ctxt->checkIndex;
4208 if (in->buf == NULL) {
4209 buf = in->base;
4210 len = in->length;
4211 } else {
4212 buf = in->buf->buffer->content;
4213 len = in->buf->buffer->use;
4214 }
4215 /* take into account the sequence length */
4216 if (third) len -= 2;
4217 else if (next) len --;
4218 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004219 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004220 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4221 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4222 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004223 /* do not increment past <! - some people use <!--> */
4224 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004225 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004226 }
4227 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004228 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004229 return(-1);
4230 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4231 (buf[base + 2] == '>')) {
4232 incomment = 0;
4233 base += 2;
4234 }
4235 continue;
4236 }
Owen Taylor3473f882001-02-23 17:55:21 +00004237 if (buf[base] == first) {
4238 if (third != 0) {
4239 if ((buf[base + 1] != next) ||
4240 (buf[base + 2] != third)) continue;
4241 } else if (next != 0) {
4242 if (buf[base + 1] != next) continue;
4243 }
4244 ctxt->checkIndex = 0;
4245#ifdef DEBUG_PUSH
4246 if (next == 0)
4247 xmlGenericError(xmlGenericErrorContext,
4248 "HPP: lookup '%c' found at %d\n",
4249 first, base);
4250 else if (third == 0)
4251 xmlGenericError(xmlGenericErrorContext,
4252 "HPP: lookup '%c%c' found at %d\n",
4253 first, next, base);
4254 else
4255 xmlGenericError(xmlGenericErrorContext,
4256 "HPP: lookup '%c%c%c' found at %d\n",
4257 first, next, third, base);
4258#endif
4259 return(base - (in->cur - in->base));
4260 }
4261 }
4262 ctxt->checkIndex = base;
4263#ifdef DEBUG_PUSH
4264 if (next == 0)
4265 xmlGenericError(xmlGenericErrorContext,
4266 "HPP: lookup '%c' failed\n", first);
4267 else if (third == 0)
4268 xmlGenericError(xmlGenericErrorContext,
4269 "HPP: lookup '%c%c' failed\n", first, next);
4270 else
4271 xmlGenericError(xmlGenericErrorContext,
4272 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4273#endif
4274 return(-1);
4275}
4276
4277/**
4278 * htmlParseTryOrFinish:
4279 * @ctxt: an HTML parser context
4280 * @terminate: last chunk indicator
4281 *
4282 * Try to progress on parsing
4283 *
4284 * Returns zero if no parsing was possible
4285 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004286static int
Owen Taylor3473f882001-02-23 17:55:21 +00004287htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4288 int ret = 0;
4289 htmlParserInputPtr in;
4290 int avail = 0;
4291 xmlChar cur, next;
4292
4293#ifdef DEBUG_PUSH
4294 switch (ctxt->instate) {
4295 case XML_PARSER_EOF:
4296 xmlGenericError(xmlGenericErrorContext,
4297 "HPP: try EOF\n"); break;
4298 case XML_PARSER_START:
4299 xmlGenericError(xmlGenericErrorContext,
4300 "HPP: try START\n"); break;
4301 case XML_PARSER_MISC:
4302 xmlGenericError(xmlGenericErrorContext,
4303 "HPP: try MISC\n");break;
4304 case XML_PARSER_COMMENT:
4305 xmlGenericError(xmlGenericErrorContext,
4306 "HPP: try COMMENT\n");break;
4307 case XML_PARSER_PROLOG:
4308 xmlGenericError(xmlGenericErrorContext,
4309 "HPP: try PROLOG\n");break;
4310 case XML_PARSER_START_TAG:
4311 xmlGenericError(xmlGenericErrorContext,
4312 "HPP: try START_TAG\n");break;
4313 case XML_PARSER_CONTENT:
4314 xmlGenericError(xmlGenericErrorContext,
4315 "HPP: try CONTENT\n");break;
4316 case XML_PARSER_CDATA_SECTION:
4317 xmlGenericError(xmlGenericErrorContext,
4318 "HPP: try CDATA_SECTION\n");break;
4319 case XML_PARSER_END_TAG:
4320 xmlGenericError(xmlGenericErrorContext,
4321 "HPP: try END_TAG\n");break;
4322 case XML_PARSER_ENTITY_DECL:
4323 xmlGenericError(xmlGenericErrorContext,
4324 "HPP: try ENTITY_DECL\n");break;
4325 case XML_PARSER_ENTITY_VALUE:
4326 xmlGenericError(xmlGenericErrorContext,
4327 "HPP: try ENTITY_VALUE\n");break;
4328 case XML_PARSER_ATTRIBUTE_VALUE:
4329 xmlGenericError(xmlGenericErrorContext,
4330 "HPP: try ATTRIBUTE_VALUE\n");break;
4331 case XML_PARSER_DTD:
4332 xmlGenericError(xmlGenericErrorContext,
4333 "HPP: try DTD\n");break;
4334 case XML_PARSER_EPILOG:
4335 xmlGenericError(xmlGenericErrorContext,
4336 "HPP: try EPILOG\n");break;
4337 case XML_PARSER_PI:
4338 xmlGenericError(xmlGenericErrorContext,
4339 "HPP: try PI\n");break;
4340 case XML_PARSER_SYSTEM_LITERAL:
4341 xmlGenericError(xmlGenericErrorContext,
4342 "HPP: try SYSTEM_LITERAL\n");break;
4343 }
4344#endif
4345
4346 while (1) {
4347
4348 in = ctxt->input;
4349 if (in == NULL) break;
4350 if (in->buf == NULL)
4351 avail = in->length - (in->cur - in->base);
4352 else
4353 avail = in->buf->buffer->use - (in->cur - in->base);
4354 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004355 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004356 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4357 /*
4358 * SAX: end of the document processing.
4359 */
4360 ctxt->instate = XML_PARSER_EOF;
4361 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4362 ctxt->sax->endDocument(ctxt->userData);
4363 }
4364 }
4365 if (avail < 1)
4366 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004367 cur = in->cur[0];
4368 if (cur == 0) {
4369 SKIP(1);
4370 continue;
4371 }
4372
Owen Taylor3473f882001-02-23 17:55:21 +00004373 switch (ctxt->instate) {
4374 case XML_PARSER_EOF:
4375 /*
4376 * Document parsing is done !
4377 */
4378 goto done;
4379 case XML_PARSER_START:
4380 /*
4381 * Very first chars read from the document flow.
4382 */
4383 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004384 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004385 SKIP_BLANKS;
4386 if (in->buf == NULL)
4387 avail = in->length - (in->cur - in->base);
4388 else
4389 avail = in->buf->buffer->use - (in->cur - in->base);
4390 }
4391 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4392 ctxt->sax->setDocumentLocator(ctxt->userData,
4393 &xmlDefaultSAXLocator);
4394 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4395 (!ctxt->disableSAX))
4396 ctxt->sax->startDocument(ctxt->userData);
4397
4398 cur = in->cur[0];
4399 next = in->cur[1];
4400 if ((cur == '<') && (next == '!') &&
4401 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4402 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4403 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4404 (UPP(8) == 'E')) {
4405 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004406 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004407 goto done;
4408#ifdef DEBUG_PUSH
4409 xmlGenericError(xmlGenericErrorContext,
4410 "HPP: Parsing internal subset\n");
4411#endif
4412 htmlParseDocTypeDecl(ctxt);
4413 ctxt->instate = XML_PARSER_PROLOG;
4414#ifdef DEBUG_PUSH
4415 xmlGenericError(xmlGenericErrorContext,
4416 "HPP: entering PROLOG\n");
4417#endif
4418 } else {
4419 ctxt->instate = XML_PARSER_MISC;
4420 }
4421#ifdef DEBUG_PUSH
4422 xmlGenericError(xmlGenericErrorContext,
4423 "HPP: entering MISC\n");
4424#endif
4425 break;
4426 case XML_PARSER_MISC:
4427 SKIP_BLANKS;
4428 if (in->buf == NULL)
4429 avail = in->length - (in->cur - in->base);
4430 else
4431 avail = in->buf->buffer->use - (in->cur - in->base);
4432 if (avail < 2)
4433 goto done;
4434 cur = in->cur[0];
4435 next = in->cur[1];
4436 if ((cur == '<') && (next == '!') &&
4437 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4438 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004439 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004440 goto done;
4441#ifdef DEBUG_PUSH
4442 xmlGenericError(xmlGenericErrorContext,
4443 "HPP: Parsing Comment\n");
4444#endif
4445 htmlParseComment(ctxt);
4446 ctxt->instate = XML_PARSER_MISC;
4447 } else if ((cur == '<') && (next == '!') &&
4448 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4449 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4450 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4451 (UPP(8) == 'E')) {
4452 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004453 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004454 goto done;
4455#ifdef DEBUG_PUSH
4456 xmlGenericError(xmlGenericErrorContext,
4457 "HPP: Parsing internal subset\n");
4458#endif
4459 htmlParseDocTypeDecl(ctxt);
4460 ctxt->instate = XML_PARSER_PROLOG;
4461#ifdef DEBUG_PUSH
4462 xmlGenericError(xmlGenericErrorContext,
4463 "HPP: entering PROLOG\n");
4464#endif
4465 } else if ((cur == '<') && (next == '!') &&
4466 (avail < 9)) {
4467 goto done;
4468 } else {
4469 ctxt->instate = XML_PARSER_START_TAG;
4470#ifdef DEBUG_PUSH
4471 xmlGenericError(xmlGenericErrorContext,
4472 "HPP: entering START_TAG\n");
4473#endif
4474 }
4475 break;
4476 case XML_PARSER_PROLOG:
4477 SKIP_BLANKS;
4478 if (in->buf == NULL)
4479 avail = in->length - (in->cur - in->base);
4480 else
4481 avail = in->buf->buffer->use - (in->cur - in->base);
4482 if (avail < 2)
4483 goto done;
4484 cur = in->cur[0];
4485 next = in->cur[1];
4486 if ((cur == '<') && (next == '!') &&
4487 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4488 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004489 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004490 goto done;
4491#ifdef DEBUG_PUSH
4492 xmlGenericError(xmlGenericErrorContext,
4493 "HPP: Parsing Comment\n");
4494#endif
4495 htmlParseComment(ctxt);
4496 ctxt->instate = XML_PARSER_PROLOG;
4497 } else if ((cur == '<') && (next == '!') &&
4498 (avail < 4)) {
4499 goto done;
4500 } else {
4501 ctxt->instate = XML_PARSER_START_TAG;
4502#ifdef DEBUG_PUSH
4503 xmlGenericError(xmlGenericErrorContext,
4504 "HPP: entering START_TAG\n");
4505#endif
4506 }
4507 break;
4508 case XML_PARSER_EPILOG:
4509 if (in->buf == NULL)
4510 avail = in->length - (in->cur - in->base);
4511 else
4512 avail = in->buf->buffer->use - (in->cur - in->base);
4513 if (avail < 1)
4514 goto done;
4515 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004516 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004517 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004518 goto done;
4519 }
4520 if (avail < 2)
4521 goto done;
4522 next = in->cur[1];
4523 if ((cur == '<') && (next == '!') &&
4524 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4525 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004526 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004527 goto done;
4528#ifdef DEBUG_PUSH
4529 xmlGenericError(xmlGenericErrorContext,
4530 "HPP: Parsing Comment\n");
4531#endif
4532 htmlParseComment(ctxt);
4533 ctxt->instate = XML_PARSER_EPILOG;
4534 } else if ((cur == '<') && (next == '!') &&
4535 (avail < 4)) {
4536 goto done;
4537 } else {
4538 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004539 ctxt->wellFormed = 0;
4540 ctxt->instate = XML_PARSER_EOF;
4541#ifdef DEBUG_PUSH
4542 xmlGenericError(xmlGenericErrorContext,
4543 "HPP: entering EOF\n");
4544#endif
4545 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4546 ctxt->sax->endDocument(ctxt->userData);
4547 goto done;
4548 }
4549 break;
4550 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004551 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004552 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004553 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004554
4555 if (avail < 2)
4556 goto done;
4557 cur = in->cur[0];
4558 if (cur != '<') {
4559 ctxt->instate = XML_PARSER_CONTENT;
4560#ifdef DEBUG_PUSH
4561 xmlGenericError(xmlGenericErrorContext,
4562 "HPP: entering CONTENT\n");
4563#endif
4564 break;
4565 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004566 if (in->cur[1] == '/') {
4567 ctxt->instate = XML_PARSER_END_TAG;
4568 ctxt->checkIndex = 0;
4569#ifdef DEBUG_PUSH
4570 xmlGenericError(xmlGenericErrorContext,
4571 "HPP: entering END_TAG\n");
4572#endif
4573 break;
4574 }
Owen Taylor3473f882001-02-23 17:55:21 +00004575 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004576 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004577 goto done;
4578
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004579 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004580 htmlParseStartTag(ctxt);
4581 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004582 if (((depth == ctxt->nameNr) &&
4583 (xmlStrEqual(oldname, ctxt->name))) ||
4584 (name == NULL)) {
4585 if (CUR == '>')
4586 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004587 break;
4588 }
Owen Taylor3473f882001-02-23 17:55:21 +00004589
4590 /*
4591 * Lookup the info for that element.
4592 */
4593 info = htmlTagLookup(name);
4594 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004595 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4596 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004597 }
4598
4599 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004600 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004601 */
4602 if ((CUR == '/') && (NXT(1) == '>')) {
4603 SKIP(2);
4604 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4605 ctxt->sax->endElement(ctxt->userData, name);
4606 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004607 ctxt->instate = XML_PARSER_CONTENT;
4608#ifdef DEBUG_PUSH
4609 xmlGenericError(xmlGenericErrorContext,
4610 "HPP: entering CONTENT\n");
4611#endif
4612 break;
4613 }
4614
4615 if (CUR == '>') {
4616 NEXT;
4617 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004618 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4619 "Couldn't find end of Start Tag %s\n",
4620 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004621
4622 /*
4623 * end of parsing of this node.
4624 */
4625 if (xmlStrEqual(name, ctxt->name)) {
4626 nodePop(ctxt);
4627 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004628 }
4629
4630 ctxt->instate = XML_PARSER_CONTENT;
4631#ifdef DEBUG_PUSH
4632 xmlGenericError(xmlGenericErrorContext,
4633 "HPP: entering CONTENT\n");
4634#endif
4635 break;
4636 }
4637
4638 /*
4639 * Check for an Empty Element from DTD definition
4640 */
4641 if ((info != NULL) && (info->empty)) {
4642 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4643 ctxt->sax->endElement(ctxt->userData, name);
4644 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004645 }
4646 ctxt->instate = XML_PARSER_CONTENT;
4647#ifdef DEBUG_PUSH
4648 xmlGenericError(xmlGenericErrorContext,
4649 "HPP: entering CONTENT\n");
4650#endif
4651 break;
4652 }
4653 case XML_PARSER_CONTENT: {
4654 long cons;
4655 /*
4656 * Handle preparsed entities and charRef
4657 */
4658 if (ctxt->token != 0) {
4659 xmlChar chr[2] = { 0 , 0 } ;
4660
4661 chr[0] = (xmlChar) ctxt->token;
4662 htmlCheckParagraph(ctxt);
4663 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4664 ctxt->sax->characters(ctxt->userData, chr, 1);
4665 ctxt->token = 0;
4666 ctxt->checkIndex = 0;
4667 }
4668 if ((avail == 1) && (terminate)) {
4669 cur = in->cur[0];
4670 if ((cur != '<') && (cur != '&')) {
4671 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004672 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004673 if (ctxt->sax->ignorableWhitespace != NULL)
4674 ctxt->sax->ignorableWhitespace(
4675 ctxt->userData, &cur, 1);
4676 } else {
4677 htmlCheckParagraph(ctxt);
4678 if (ctxt->sax->characters != NULL)
4679 ctxt->sax->characters(
4680 ctxt->userData, &cur, 1);
4681 }
4682 }
4683 ctxt->token = 0;
4684 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004685 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004686 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004687 }
Owen Taylor3473f882001-02-23 17:55:21 +00004688 }
4689 if (avail < 2)
4690 goto done;
4691 cur = in->cur[0];
4692 next = in->cur[1];
4693 cons = ctxt->nbChars;
4694 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4695 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4696 /*
4697 * Handle SCRIPT/STYLE separately
4698 */
4699 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004700 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004701 goto done;
4702 htmlParseScript(ctxt);
4703 if ((cur == '<') && (next == '/')) {
4704 ctxt->instate = XML_PARSER_END_TAG;
4705 ctxt->checkIndex = 0;
4706#ifdef DEBUG_PUSH
4707 xmlGenericError(xmlGenericErrorContext,
4708 "HPP: entering END_TAG\n");
4709#endif
4710 break;
4711 }
4712 } else {
4713 /*
4714 * Sometimes DOCTYPE arrives in the middle of the document
4715 */
4716 if ((cur == '<') && (next == '!') &&
4717 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4718 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4719 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4720 (UPP(8) == 'E')) {
4721 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004722 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004723 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004724 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4725 "Misplaced DOCTYPE declaration\n",
4726 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004727 htmlParseDocTypeDecl(ctxt);
4728 } else if ((cur == '<') && (next == '!') &&
4729 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4730 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004731 (htmlParseLookupSequence(
4732 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004733 goto done;
4734#ifdef DEBUG_PUSH
4735 xmlGenericError(xmlGenericErrorContext,
4736 "HPP: Parsing Comment\n");
4737#endif
4738 htmlParseComment(ctxt);
4739 ctxt->instate = XML_PARSER_CONTENT;
4740 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4741 goto done;
4742 } else if ((cur == '<') && (next == '/')) {
4743 ctxt->instate = XML_PARSER_END_TAG;
4744 ctxt->checkIndex = 0;
4745#ifdef DEBUG_PUSH
4746 xmlGenericError(xmlGenericErrorContext,
4747 "HPP: entering END_TAG\n");
4748#endif
4749 break;
4750 } else if (cur == '<') {
4751 ctxt->instate = XML_PARSER_START_TAG;
4752 ctxt->checkIndex = 0;
4753#ifdef DEBUG_PUSH
4754 xmlGenericError(xmlGenericErrorContext,
4755 "HPP: entering START_TAG\n");
4756#endif
4757 break;
4758 } else if (cur == '&') {
4759 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004760 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004761 goto done;
4762#ifdef DEBUG_PUSH
4763 xmlGenericError(xmlGenericErrorContext,
4764 "HPP: Parsing Reference\n");
4765#endif
4766 /* TODO: check generation of subtrees if noent !!! */
4767 htmlParseReference(ctxt);
4768 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004769 /*
4770 * check that the text sequence is complete
4771 * before handing out the data to the parser
4772 * to avoid problems with erroneous end of
4773 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004774 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004775 if ((!terminate) &&
4776 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4777 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004778 ctxt->checkIndex = 0;
4779#ifdef DEBUG_PUSH
4780 xmlGenericError(xmlGenericErrorContext,
4781 "HPP: Parsing char data\n");
4782#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004783 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004784 }
4785 }
4786 if (cons == ctxt->nbChars) {
4787 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004788 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4789 "detected an error in element content\n",
4790 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004791 }
4792 NEXT;
4793 break;
4794 }
4795
4796 break;
4797 }
4798 case XML_PARSER_END_TAG:
4799 if (avail < 2)
4800 goto done;
4801 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004802 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004803 goto done;
4804 htmlParseEndTag(ctxt);
4805 if (ctxt->nameNr == 0) {
4806 ctxt->instate = XML_PARSER_EPILOG;
4807 } else {
4808 ctxt->instate = XML_PARSER_CONTENT;
4809 }
4810 ctxt->checkIndex = 0;
4811#ifdef DEBUG_PUSH
4812 xmlGenericError(xmlGenericErrorContext,
4813 "HPP: entering CONTENT\n");
4814#endif
4815 break;
4816 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00004817 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4818 "HPP: internal error, state == CDATA\n",
4819 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004820 ctxt->instate = XML_PARSER_CONTENT;
4821 ctxt->checkIndex = 0;
4822#ifdef DEBUG_PUSH
4823 xmlGenericError(xmlGenericErrorContext,
4824 "HPP: entering CONTENT\n");
4825#endif
4826 break;
4827 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00004828 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4829 "HPP: internal error, state == DTD\n",
4830 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004831 ctxt->instate = XML_PARSER_CONTENT;
4832 ctxt->checkIndex = 0;
4833#ifdef DEBUG_PUSH
4834 xmlGenericError(xmlGenericErrorContext,
4835 "HPP: entering CONTENT\n");
4836#endif
4837 break;
4838 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00004839 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4840 "HPP: internal error, state == COMMENT\n",
4841 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004842 ctxt->instate = XML_PARSER_CONTENT;
4843 ctxt->checkIndex = 0;
4844#ifdef DEBUG_PUSH
4845 xmlGenericError(xmlGenericErrorContext,
4846 "HPP: entering CONTENT\n");
4847#endif
4848 break;
4849 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00004850 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4851 "HPP: internal error, state == PI\n",
4852 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004853 ctxt->instate = XML_PARSER_CONTENT;
4854 ctxt->checkIndex = 0;
4855#ifdef DEBUG_PUSH
4856 xmlGenericError(xmlGenericErrorContext,
4857 "HPP: entering CONTENT\n");
4858#endif
4859 break;
4860 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004861 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4862 "HPP: internal error, state == ENTITY_DECL\n",
4863 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004864 ctxt->instate = XML_PARSER_CONTENT;
4865 ctxt->checkIndex = 0;
4866#ifdef DEBUG_PUSH
4867 xmlGenericError(xmlGenericErrorContext,
4868 "HPP: entering CONTENT\n");
4869#endif
4870 break;
4871 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004872 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4873 "HPP: internal error, state == ENTITY_VALUE\n",
4874 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004875 ctxt->instate = XML_PARSER_CONTENT;
4876 ctxt->checkIndex = 0;
4877#ifdef DEBUG_PUSH
4878 xmlGenericError(xmlGenericErrorContext,
4879 "HPP: entering DTD\n");
4880#endif
4881 break;
4882 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004883 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4884 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
4885 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004886 ctxt->instate = XML_PARSER_START_TAG;
4887 ctxt->checkIndex = 0;
4888#ifdef DEBUG_PUSH
4889 xmlGenericError(xmlGenericErrorContext,
4890 "HPP: entering START_TAG\n");
4891#endif
4892 break;
4893 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004894 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4895 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
4896 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004897 ctxt->instate = XML_PARSER_CONTENT;
4898 ctxt->checkIndex = 0;
4899#ifdef DEBUG_PUSH
4900 xmlGenericError(xmlGenericErrorContext,
4901 "HPP: entering CONTENT\n");
4902#endif
4903 break;
4904 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004905 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4906 "HPP: internal error, state == XML_PARSER_IGNORE\n",
4907 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004908 ctxt->instate = XML_PARSER_CONTENT;
4909 ctxt->checkIndex = 0;
4910#ifdef DEBUG_PUSH
4911 xmlGenericError(xmlGenericErrorContext,
4912 "HPP: entering CONTENT\n");
4913#endif
4914 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004915 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004916 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4917 "HPP: internal error, state == XML_PARSER_LITERAL\n",
4918 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004919 ctxt->instate = XML_PARSER_CONTENT;
4920 ctxt->checkIndex = 0;
4921#ifdef DEBUG_PUSH
4922 xmlGenericError(xmlGenericErrorContext,
4923 "HPP: entering CONTENT\n");
4924#endif
4925 break;
4926
Owen Taylor3473f882001-02-23 17:55:21 +00004927 }
4928 }
4929done:
4930 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004931 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004932 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4933 /*
4934 * SAX: end of the document processing.
4935 */
4936 ctxt->instate = XML_PARSER_EOF;
4937 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4938 ctxt->sax->endDocument(ctxt->userData);
4939 }
4940 }
4941 if ((ctxt->myDoc != NULL) &&
4942 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4943 (ctxt->instate == XML_PARSER_EPILOG))) {
4944 xmlDtdPtr dtd;
4945 dtd = xmlGetIntSubset(ctxt->myDoc);
4946 if (dtd == NULL)
4947 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004948 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004949 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4950 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4951 }
4952#ifdef DEBUG_PUSH
4953 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4954#endif
4955 return(ret);
4956}
4957
4958/**
Owen Taylor3473f882001-02-23 17:55:21 +00004959 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00004960 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004961 * @chunk: an char array
4962 * @size: the size in byte of the chunk
4963 * @terminate: last chunk indicator
4964 *
4965 * Parse a Chunk of memory
4966 *
4967 * Returns zero if no error, the xmlParserErrors otherwise.
4968 */
4969int
4970htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4971 int terminate) {
4972 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4973 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4974 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4975 int cur = ctxt->input->cur - ctxt->input->base;
4976
4977 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4978 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4979 ctxt->input->cur = ctxt->input->base + cur;
4980#ifdef DEBUG_PUSH
4981 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4982#endif
4983
Daniel Veillard14f752c2003-08-09 11:44:50 +00004984#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00004985 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4986 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004987#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004988 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004989 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
4990 xmlParserInputBufferPtr in = ctxt->input->buf;
4991 if ((in->encoder != NULL) && (in->buffer != NULL) &&
4992 (in->raw != NULL)) {
4993 int nbchars;
4994
4995 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
4996 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004997 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
4998 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004999 return(XML_ERR_INVALID_ENCODING);
5000 }
5001 }
5002 }
Owen Taylor3473f882001-02-23 17:55:21 +00005003 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005004 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005005 if (terminate) {
5006 if ((ctxt->instate != XML_PARSER_EOF) &&
5007 (ctxt->instate != XML_PARSER_EPILOG) &&
5008 (ctxt->instate != XML_PARSER_MISC)) {
5009 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005010 ctxt->wellFormed = 0;
5011 }
5012 if (ctxt->instate != XML_PARSER_EOF) {
5013 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5014 ctxt->sax->endDocument(ctxt->userData);
5015 }
5016 ctxt->instate = XML_PARSER_EOF;
5017 }
5018 return((xmlParserErrors) ctxt->errNo);
5019}
Daniel Veillard73b013f2003-09-30 12:36:01 +00005020#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005021
5022/************************************************************************
5023 * *
5024 * User entry points *
5025 * *
5026 ************************************************************************/
5027
5028/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005029 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005030 * @sax: a SAX handler
5031 * @user_data: The user data returned on SAX callbacks
5032 * @chunk: a pointer to an array of chars
5033 * @size: number of chars in the array
5034 * @filename: an optional file name or URI
5035 * @enc: an optional encoding
5036 *
5037 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005038 * The value of @filename is used for fetching external entities
5039 * and error/warning reports.
5040 *
5041 * Returns the new parser context or NULL
5042 */
5043htmlParserCtxtPtr
5044htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5045 const char *chunk, int size, const char *filename,
5046 xmlCharEncoding enc) {
5047 htmlParserCtxtPtr ctxt;
5048 htmlParserInputPtr inputStream;
5049 xmlParserInputBufferPtr buf;
5050
Daniel Veillardd0463562001-10-13 09:15:48 +00005051 xmlInitParser();
5052
Owen Taylor3473f882001-02-23 17:55:21 +00005053 buf = xmlAllocParserInputBuffer(enc);
5054 if (buf == NULL) return(NULL);
5055
Daniel Veillardf403d292003-10-05 13:51:35 +00005056 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005057 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005058 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005059 return(NULL);
5060 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005061 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5062 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005063 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005064 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005065 xmlFree(ctxt->sax);
5066 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5067 if (ctxt->sax == NULL) {
5068 xmlFree(buf);
5069 xmlFree(ctxt);
5070 return(NULL);
5071 }
5072 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5073 if (user_data != NULL)
5074 ctxt->userData = user_data;
5075 }
5076 if (filename == NULL) {
5077 ctxt->directory = NULL;
5078 } else {
5079 ctxt->directory = xmlParserGetDirectory(filename);
5080 }
5081
5082 inputStream = htmlNewInputStream(ctxt);
5083 if (inputStream == NULL) {
5084 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005085 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005086 return(NULL);
5087 }
5088
5089 if (filename == NULL)
5090 inputStream->filename = NULL;
5091 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005092 inputStream->filename = (char *)
5093 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005094 inputStream->buf = buf;
5095 inputStream->base = inputStream->buf->buffer->content;
5096 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005097 inputStream->end =
5098 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005099
5100 inputPush(ctxt, inputStream);
5101
5102 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5103 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005104 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5105 int cur = ctxt->input->cur - ctxt->input->base;
5106
Owen Taylor3473f882001-02-23 17:55:21 +00005107 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005108
5109 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5110 ctxt->input->cur = ctxt->input->base + cur;
5111 ctxt->input->end =
5112 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005113#ifdef DEBUG_PUSH
5114 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5115#endif
5116 }
5117
5118 return(ctxt);
5119}
5120
5121/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005122 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005123 * @cur: a pointer to an array of xmlChar
5124 * @encoding: a free form C string describing the HTML document encoding, or NULL
5125 * @sax: the SAX handler block
5126 * @userData: if using SAX, this pointer will be provided on callbacks.
5127 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005128 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5129 * to handle parse events. If sax is NULL, fallback to the default DOM
5130 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005131 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005132 * Returns the resulting document tree unless SAX is NULL or the document is
5133 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005134 */
5135
5136htmlDocPtr
5137htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5138 htmlDocPtr ret;
5139 htmlParserCtxtPtr ctxt;
5140
Daniel Veillardd0463562001-10-13 09:15:48 +00005141 xmlInitParser();
5142
Owen Taylor3473f882001-02-23 17:55:21 +00005143 if (cur == NULL) return(NULL);
5144
5145
5146 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5147 if (ctxt == NULL) return(NULL);
5148 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005149 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005150 ctxt->sax = sax;
5151 ctxt->userData = userData;
5152 }
5153
5154 htmlParseDocument(ctxt);
5155 ret = ctxt->myDoc;
5156 if (sax != NULL) {
5157 ctxt->sax = NULL;
5158 ctxt->userData = NULL;
5159 }
5160 htmlFreeParserCtxt(ctxt);
5161
5162 return(ret);
5163}
5164
5165/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005166 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005167 * @cur: a pointer to an array of xmlChar
5168 * @encoding: a free form C string describing the HTML document encoding, or NULL
5169 *
5170 * parse an HTML in-memory document and build a tree.
5171 *
5172 * Returns the resulting document tree
5173 */
5174
5175htmlDocPtr
5176htmlParseDoc(xmlChar *cur, const char *encoding) {
5177 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5178}
5179
5180
5181/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005182 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005183 * @filename: the filename
5184 * @encoding: a free form C string describing the HTML document encoding, or NULL
5185 *
5186 * Create a parser context for a file content.
5187 * Automatic support for ZLIB/Compress compressed document is provided
5188 * by default if found at compile-time.
5189 *
5190 * Returns the new parser context or NULL
5191 */
5192htmlParserCtxtPtr
5193htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5194{
5195 htmlParserCtxtPtr ctxt;
5196 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005197 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005198 /* htmlCharEncoding enc; */
5199 xmlChar *content, *content_line = (xmlChar *) "charset=";
5200
Daniel Veillardf403d292003-10-05 13:51:35 +00005201 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005202 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005203 return(NULL);
5204 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005205 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5206 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005207#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005208 if (xmlDefaultSAXHandler.error != NULL) {
5209 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5210 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005211#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005212 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005213 return(NULL);
5214 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005215
5216 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5217 xmlFree(canonicFilename);
5218 if (inputStream == NULL) {
5219 xmlFreeParserCtxt(ctxt);
5220 return(NULL);
5221 }
Owen Taylor3473f882001-02-23 17:55:21 +00005222
5223 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005224
Owen Taylor3473f882001-02-23 17:55:21 +00005225 /* set encoding */
5226 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005227 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005228 if (content) {
5229 strcpy ((char *)content, (char *)content_line);
5230 strcat ((char *)content, (char *)encoding);
5231 htmlCheckEncoding (ctxt, content);
5232 xmlFree (content);
5233 }
5234 }
5235
5236 return(ctxt);
5237}
5238
5239/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005240 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005241 * @filename: the filename
5242 * @encoding: a free form C string describing the HTML document encoding, or NULL
5243 * @sax: the SAX handler block
5244 * @userData: if using SAX, this pointer will be provided on callbacks.
5245 *
5246 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5247 * compressed document is provided by default if found at compile-time.
5248 * It use the given SAX function block to handle the parsing callback.
5249 * If sax is NULL, fallback to the default DOM tree building routines.
5250 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005251 * Returns the resulting document tree unless SAX is NULL or the document is
5252 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005253 */
5254
5255htmlDocPtr
5256htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5257 void *userData) {
5258 htmlDocPtr ret;
5259 htmlParserCtxtPtr ctxt;
5260 htmlSAXHandlerPtr oldsax = NULL;
5261
Daniel Veillardd0463562001-10-13 09:15:48 +00005262 xmlInitParser();
5263
Owen Taylor3473f882001-02-23 17:55:21 +00005264 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5265 if (ctxt == NULL) return(NULL);
5266 if (sax != NULL) {
5267 oldsax = ctxt->sax;
5268 ctxt->sax = sax;
5269 ctxt->userData = userData;
5270 }
5271
5272 htmlParseDocument(ctxt);
5273
5274 ret = ctxt->myDoc;
5275 if (sax != NULL) {
5276 ctxt->sax = oldsax;
5277 ctxt->userData = NULL;
5278 }
5279 htmlFreeParserCtxt(ctxt);
5280
5281 return(ret);
5282}
5283
5284/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005285 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005286 * @filename: the filename
5287 * @encoding: a free form C string describing the HTML document encoding, or NULL
5288 *
5289 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5290 * compressed document is provided by default if found at compile-time.
5291 *
5292 * Returns the resulting document tree
5293 */
5294
5295htmlDocPtr
5296htmlParseFile(const char *filename, const char *encoding) {
5297 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5298}
5299
5300/**
5301 * htmlHandleOmittedElem:
5302 * @val: int 0 or 1
5303 *
5304 * Set and return the previous value for handling HTML omitted tags.
5305 *
5306 * Returns the last value for 0 for no handling, 1 for auto insertion.
5307 */
5308
5309int
5310htmlHandleOmittedElem(int val) {
5311 int old = htmlOmittedDefaultValue;
5312
5313 htmlOmittedDefaultValue = val;
5314 return(old);
5315}
5316
Daniel Veillard930dfb62003-02-05 10:17:38 +00005317/**
5318 * htmlElementAllowedHere:
5319 * @parent: HTML parent element
5320 * @elt: HTML element
5321 *
5322 * Checks whether an HTML element may be a direct child of a parent element.
5323 * Note - doesn't check for deprecated elements
5324 *
5325 * Returns 1 if allowed; 0 otherwise.
5326 */
5327int
5328htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5329 const char** p ;
5330
5331 if ( ! elt || ! parent || ! parent->subelts )
5332 return 0 ;
5333
5334 for ( p = parent->subelts; *p; ++p )
5335 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5336 return 1 ;
5337
5338 return 0 ;
5339}
5340/**
5341 * htmlElementStatusHere:
5342 * @parent: HTML parent element
5343 * @elt: HTML element
5344 *
5345 * Checks whether an HTML element may be a direct child of a parent element.
5346 * and if so whether it is valid or deprecated.
5347 *
5348 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5349 */
5350htmlStatus
5351htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5352 if ( ! parent || ! elt )
5353 return HTML_INVALID ;
5354 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5355 return HTML_INVALID ;
5356
5357 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5358}
5359/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005360 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005361 * @elt: HTML element
5362 * @attr: HTML attribute
5363 * @legacy: whether to allow deprecated attributes
5364 *
5365 * Checks whether an attribute is valid for an element
5366 * Has full knowledge of Required and Deprecated attributes
5367 *
5368 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5369 */
5370htmlStatus
5371htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5372 const char** p ;
5373
5374 if ( !elt || ! attr )
5375 return HTML_INVALID ;
5376
5377 if ( elt->attrs_req )
5378 for ( p = elt->attrs_req; *p; ++p)
5379 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5380 return HTML_REQUIRED ;
5381
5382 if ( elt->attrs_opt )
5383 for ( p = elt->attrs_opt; *p; ++p)
5384 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5385 return HTML_VALID ;
5386
5387 if ( legacy && elt->attrs_depr )
5388 for ( p = elt->attrs_depr; *p; ++p)
5389 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5390 return HTML_DEPRECATED ;
5391
5392 return HTML_INVALID ;
5393}
5394/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005395 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005396 * @node: an htmlNodePtr in a tree
5397 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005398 * for Element nodes)
5399 *
5400 * Checks whether the tree node is valid. Experimental (the author
5401 * only uses the HTML enhancements in a SAX parser)
5402 *
5403 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5404 * legacy allowed) or htmlElementStatusHere (otherwise).
5405 * for Attribute nodes, a return from htmlAttrAllowed
5406 * for other nodes, HTML_NA (no checks performed)
5407 */
5408htmlStatus
5409htmlNodeStatus(const htmlNodePtr node, int legacy) {
5410 if ( ! node )
5411 return HTML_INVALID ;
5412
5413 switch ( node->type ) {
5414 case XML_ELEMENT_NODE:
5415 return legacy
5416 ? ( htmlElementAllowedHere (
5417 htmlTagLookup(node->parent->name) , node->name
5418 ) ? HTML_VALID : HTML_INVALID )
5419 : htmlElementStatusHere(
5420 htmlTagLookup(node->parent->name) ,
5421 htmlTagLookup(node->name) )
5422 ;
5423 case XML_ATTRIBUTE_NODE:
5424 return htmlAttrAllowed(
5425 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5426 default: return HTML_NA ;
5427 }
5428}
Daniel Veillard9475a352003-09-26 12:47:50 +00005429/************************************************************************
5430 * *
5431 * New set (2.6.0) of simpler and more flexible APIs *
5432 * *
5433 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is used.  The body is wrapped in do { } while (0) so that
 * "DICT_FREE(x);" behaves as a single statement, including inside an
 * if/else without braces.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5445
5446/**
5447 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005448 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005449 *
5450 * Reset a parser context
5451 */
5452void
5453htmlCtxtReset(htmlParserCtxtPtr ctxt)
5454{
5455 xmlParserInputPtr input;
5456 xmlDictPtr dict = ctxt->dict;
5457
5458 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5459 xmlFreeInputStream(input);
5460 }
5461 ctxt->inputNr = 0;
5462 ctxt->input = NULL;
5463
5464 ctxt->spaceNr = 0;
5465 ctxt->spaceTab[0] = -1;
5466 ctxt->space = &ctxt->spaceTab[0];
5467
5468
5469 ctxt->nodeNr = 0;
5470 ctxt->node = NULL;
5471
5472 ctxt->nameNr = 0;
5473 ctxt->name = NULL;
5474
5475 DICT_FREE(ctxt->version);
5476 ctxt->version = NULL;
5477 DICT_FREE(ctxt->encoding);
5478 ctxt->encoding = NULL;
5479 DICT_FREE(ctxt->directory);
5480 ctxt->directory = NULL;
5481 DICT_FREE(ctxt->extSubURI);
5482 ctxt->extSubURI = NULL;
5483 DICT_FREE(ctxt->extSubSystem);
5484 ctxt->extSubSystem = NULL;
5485 if (ctxt->myDoc != NULL)
5486 xmlFreeDoc(ctxt->myDoc);
5487 ctxt->myDoc = NULL;
5488
5489 ctxt->standalone = -1;
5490 ctxt->hasExternalSubset = 0;
5491 ctxt->hasPErefs = 0;
5492 ctxt->html = 1;
5493 ctxt->external = 0;
5494 ctxt->instate = XML_PARSER_START;
5495 ctxt->token = 0;
5496
5497 ctxt->wellFormed = 1;
5498 ctxt->nsWellFormed = 1;
5499 ctxt->valid = 1;
5500 ctxt->vctxt.userData = ctxt;
5501 ctxt->vctxt.error = xmlParserValidityError;
5502 ctxt->vctxt.warning = xmlParserValidityWarning;
5503 ctxt->record_info = 0;
5504 ctxt->nbChars = 0;
5505 ctxt->checkIndex = 0;
5506 ctxt->inSubset = 0;
5507 ctxt->errNo = XML_ERR_OK;
5508 ctxt->depth = 0;
5509 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5510 ctxt->catalogs = NULL;
5511 xmlInitNodeInfoSeq(&ctxt->node_seq);
5512
5513 if (ctxt->attsDefault != NULL) {
5514 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5515 ctxt->attsDefault = NULL;
5516 }
5517 if (ctxt->attsSpecial != NULL) {
5518 xmlHashFree(ctxt->attsSpecial, NULL);
5519 ctxt->attsSpecial = NULL;
5520 }
5521}
5522
5523/**
5524 * htmlCtxtUseOptions:
5525 * @ctxt: an HTML parser context
5526 * @options: a combination of htmlParserOption(s)
5527 *
5528 * Applies the options to the parser context
5529 *
5530 * Returns 0 in case of success, the set of unknown or unimplemented options
5531 * in case of error.
5532 */
5533int
5534htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5535{
5536 if (options & HTML_PARSE_NOWARNING) {
5537 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005538 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005539 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005540 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005541 }
5542 if (options & HTML_PARSE_NOERROR) {
5543 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005544 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005545 ctxt->sax->fatalError = NULL;
5546 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005547 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005548 }
5549 if (options & HTML_PARSE_PEDANTIC) {
5550 ctxt->pedantic = 1;
5551 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005552 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005553 } else
5554 ctxt->pedantic = 0;
5555 if (options & XML_PARSE_NOBLANKS) {
5556 ctxt->keepBlanks = 0;
5557 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5558 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005559 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005560 } else
5561 ctxt->keepBlanks = 1;
5562 ctxt->dictNames = 0;
5563 return (options);
5564}
5565
5566/**
5567 * htmlDoRead:
5568 * @ctxt: an HTML parser context
5569 * @URL: the base URL to use for the document
5570 * @encoding: the document encoding, or NULL
5571 * @options: a combination of htmlParserOption(s)
5572 * @reuse: keep the context for reuse
5573 *
5574 * Common front-end for the htmlRead functions
5575 *
5576 * Returns the resulting document tree or NULL
5577 */
5578static htmlDocPtr
5579htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5580 int options, int reuse)
5581{
5582 htmlDocPtr ret;
5583
5584 htmlCtxtUseOptions(ctxt, options);
5585 ctxt->html = 1;
5586 if (encoding != NULL) {
5587 xmlCharEncodingHandlerPtr hdlr;
5588
5589 hdlr = xmlFindCharEncodingHandler(encoding);
5590 if (hdlr != NULL)
5591 xmlSwitchToEncoding(ctxt, hdlr);
5592 }
5593 if ((URL != NULL) && (ctxt->input != NULL) &&
5594 (ctxt->input->filename == NULL))
5595 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5596 htmlParseDocument(ctxt);
5597 ret = ctxt->myDoc;
5598 ctxt->myDoc = NULL;
5599 if (!reuse) {
5600 if ((ctxt->dictNames) &&
5601 (ret != NULL) &&
5602 (ret->dict == ctxt->dict))
5603 ctxt->dict = NULL;
5604 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005605 }
5606 return (ret);
5607}
5608
5609/**
5610 * htmlReadDoc:
5611 * @cur: a pointer to a zero terminated string
5612 * @URL: the base URL to use for the document
5613 * @encoding: the document encoding, or NULL
5614 * @options: a combination of htmlParserOption(s)
5615 *
5616 * parse an XML in-memory document and build a tree.
5617 *
5618 * Returns the resulting document tree
5619 */
5620htmlDocPtr
5621htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5622{
5623 htmlParserCtxtPtr ctxt;
5624
5625 if (cur == NULL)
5626 return (NULL);
5627
5628 ctxt = xmlCreateDocParserCtxt(cur);
5629 if (ctxt == NULL)
5630 return (NULL);
5631 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5632}
5633
5634/**
5635 * htmlReadFile:
5636 * @filename: a file or URL
5637 * @encoding: the document encoding, or NULL
5638 * @options: a combination of htmlParserOption(s)
5639 *
5640 * parse an XML file from the filesystem or the network.
5641 *
5642 * Returns the resulting document tree
5643 */
5644htmlDocPtr
5645htmlReadFile(const char *filename, const char *encoding, int options)
5646{
5647 htmlParserCtxtPtr ctxt;
5648
5649 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5650 if (ctxt == NULL)
5651 return (NULL);
5652 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5653}
5654
5655/**
5656 * htmlReadMemory:
5657 * @buffer: a pointer to a char array
5658 * @size: the size of the array
5659 * @URL: the base URL to use for the document
5660 * @encoding: the document encoding, or NULL
5661 * @options: a combination of htmlParserOption(s)
5662 *
5663 * parse an XML in-memory document and build a tree.
5664 *
5665 * Returns the resulting document tree
5666 */
5667htmlDocPtr
5668htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5669{
5670 htmlParserCtxtPtr ctxt;
5671
5672 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5673 if (ctxt == NULL)
5674 return (NULL);
William M. Brackd43cdcd2004-08-03 15:13:29 +00005675 if (ctxt->sax != NULL)
5676 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00005677 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5678}
5679
5680/**
5681 * htmlReadFd:
5682 * @fd: an open file descriptor
5683 * @URL: the base URL to use for the document
5684 * @encoding: the document encoding, or NULL
5685 * @options: a combination of htmlParserOption(s)
5686 *
5687 * parse an XML from a file descriptor and build a tree.
5688 *
5689 * Returns the resulting document tree
5690 */
5691htmlDocPtr
5692htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5693{
5694 htmlParserCtxtPtr ctxt;
5695 xmlParserInputBufferPtr input;
5696 xmlParserInputPtr stream;
5697
5698 if (fd < 0)
5699 return (NULL);
5700
5701 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5702 if (input == NULL)
5703 return (NULL);
5704 ctxt = xmlNewParserCtxt();
5705 if (ctxt == NULL) {
5706 xmlFreeParserInputBuffer(input);
5707 return (NULL);
5708 }
5709 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5710 if (stream == NULL) {
5711 xmlFreeParserInputBuffer(input);
5712 xmlFreeParserCtxt(ctxt);
5713 return (NULL);
5714 }
5715 inputPush(ctxt, stream);
5716 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5717}
5718
5719/**
5720 * htmlReadIO:
5721 * @ioread: an I/O read function
5722 * @ioclose: an I/O close function
5723 * @ioctx: an I/O handler
5724 * @URL: the base URL to use for the document
5725 * @encoding: the document encoding, or NULL
5726 * @options: a combination of htmlParserOption(s)
5727 *
5728 * parse an HTML document from I/O functions and source and build a tree.
5729 *
5730 * Returns the resulting document tree
5731 */
5732htmlDocPtr
5733htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5734 void *ioctx, const char *URL, const char *encoding, int options)
5735{
5736 htmlParserCtxtPtr ctxt;
5737 xmlParserInputBufferPtr input;
5738 xmlParserInputPtr stream;
5739
5740 if (ioread == NULL)
5741 return (NULL);
5742
5743 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5744 XML_CHAR_ENCODING_NONE);
5745 if (input == NULL)
5746 return (NULL);
5747 ctxt = xmlNewParserCtxt();
5748 if (ctxt == NULL) {
5749 xmlFreeParserInputBuffer(input);
5750 return (NULL);
5751 }
5752 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5753 if (stream == NULL) {
5754 xmlFreeParserInputBuffer(input);
5755 xmlFreeParserCtxt(ctxt);
5756 return (NULL);
5757 }
5758 inputPush(ctxt, stream);
5759 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5760}
5761
5762/**
5763 * htmlCtxtReadDoc:
5764 * @ctxt: an HTML parser context
5765 * @cur: a pointer to a zero terminated string
5766 * @URL: the base URL to use for the document
5767 * @encoding: the document encoding, or NULL
5768 * @options: a combination of htmlParserOption(s)
5769 *
5770 * parse an XML in-memory document and build a tree.
5771 * This reuses the existing @ctxt parser context
5772 *
5773 * Returns the resulting document tree
5774 */
5775htmlDocPtr
5776htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5777 const char *URL, const char *encoding, int options)
5778{
5779 xmlParserInputPtr stream;
5780
5781 if (cur == NULL)
5782 return (NULL);
5783 if (ctxt == NULL)
5784 return (NULL);
5785
5786 htmlCtxtReset(ctxt);
5787
5788 stream = xmlNewStringInputStream(ctxt, cur);
5789 if (stream == NULL) {
5790 return (NULL);
5791 }
5792 inputPush(ctxt, stream);
5793 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5794}
5795
5796/**
5797 * htmlCtxtReadFile:
5798 * @ctxt: an HTML parser context
5799 * @filename: a file or URL
5800 * @encoding: the document encoding, or NULL
5801 * @options: a combination of htmlParserOption(s)
5802 *
5803 * parse an XML file from the filesystem or the network.
5804 * This reuses the existing @ctxt parser context
5805 *
5806 * Returns the resulting document tree
5807 */
5808htmlDocPtr
5809htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5810 const char *encoding, int options)
5811{
5812 xmlParserInputPtr stream;
5813
5814 if (filename == NULL)
5815 return (NULL);
5816 if (ctxt == NULL)
5817 return (NULL);
5818
5819 htmlCtxtReset(ctxt);
5820
5821 stream = xmlNewInputFromFile(ctxt, filename);
5822 if (stream == NULL) {
5823 return (NULL);
5824 }
5825 inputPush(ctxt, stream);
5826 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5827}
5828
5829/**
5830 * htmlCtxtReadMemory:
5831 * @ctxt: an HTML parser context
5832 * @buffer: a pointer to a char array
5833 * @size: the size of the array
5834 * @URL: the base URL to use for the document
5835 * @encoding: the document encoding, or NULL
5836 * @options: a combination of htmlParserOption(s)
5837 *
5838 * parse an XML in-memory document and build a tree.
5839 * This reuses the existing @ctxt parser context
5840 *
5841 * Returns the resulting document tree
5842 */
5843htmlDocPtr
5844htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5845 const char *URL, const char *encoding, int options)
5846{
5847 xmlParserInputBufferPtr input;
5848 xmlParserInputPtr stream;
5849
5850 if (ctxt == NULL)
5851 return (NULL);
5852 if (buffer == NULL)
5853 return (NULL);
5854
5855 htmlCtxtReset(ctxt);
5856
5857 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5858 if (input == NULL) {
5859 return(NULL);
5860 }
5861
5862 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5863 if (stream == NULL) {
5864 xmlFreeParserInputBuffer(input);
5865 return(NULL);
5866 }
5867
5868 inputPush(ctxt, stream);
5869 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5870}
5871
5872/**
5873 * htmlCtxtReadFd:
5874 * @ctxt: an HTML parser context
5875 * @fd: an open file descriptor
5876 * @URL: the base URL to use for the document
5877 * @encoding: the document encoding, or NULL
5878 * @options: a combination of htmlParserOption(s)
5879 *
5880 * parse an XML from a file descriptor and build a tree.
5881 * This reuses the existing @ctxt parser context
5882 *
5883 * Returns the resulting document tree
5884 */
5885htmlDocPtr
5886htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
5887 const char *URL, const char *encoding, int options)
5888{
5889 xmlParserInputBufferPtr input;
5890 xmlParserInputPtr stream;
5891
5892 if (fd < 0)
5893 return (NULL);
5894 if (ctxt == NULL)
5895 return (NULL);
5896
5897 htmlCtxtReset(ctxt);
5898
5899
5900 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5901 if (input == NULL)
5902 return (NULL);
5903 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5904 if (stream == NULL) {
5905 xmlFreeParserInputBuffer(input);
5906 return (NULL);
5907 }
5908 inputPush(ctxt, stream);
5909 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5910}
5911
5912/**
5913 * htmlCtxtReadIO:
5914 * @ctxt: an HTML parser context
5915 * @ioread: an I/O read function
5916 * @ioclose: an I/O close function
5917 * @ioctx: an I/O handler
5918 * @URL: the base URL to use for the document
5919 * @encoding: the document encoding, or NULL
5920 * @options: a combination of htmlParserOption(s)
5921 *
5922 * parse an HTML document from I/O functions and source and build a tree.
5923 * This reuses the existing @ctxt parser context
5924 *
5925 * Returns the resulting document tree
5926 */
5927htmlDocPtr
5928htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
5929 xmlInputCloseCallback ioclose, void *ioctx,
5930 const char *URL,
5931 const char *encoding, int options)
5932{
5933 xmlParserInputBufferPtr input;
5934 xmlParserInputPtr stream;
5935
5936 if (ioread == NULL)
5937 return (NULL);
5938 if (ctxt == NULL)
5939 return (NULL);
5940
5941 htmlCtxtReset(ctxt);
5942
5943 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5944 XML_CHAR_ENCODING_NONE);
5945 if (input == NULL)
5946 return (NULL);
5947 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5948 if (stream == NULL) {
5949 xmlFreeParserInputBuffer(input);
5950 return (NULL);
5951 }
5952 inputPush(ctxt, stream);
5953 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5954}
5955
Owen Taylor3473f882001-02-23 17:55:21 +00005956#endif /* LIBXML_HTML_ENABLED */