blob: 309d93c6292515a3ae7d10769d9774b989be3f28 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillardf403d292003-10-05 13:51:35 +0000112 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000113 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000114 XML_ERR_ERROR, NULL, 0,
115 (const char *) str1, (const char *) str2,
116 NULL, 0, 0,
117 msg, str1, str2);
118 ctxt->wellFormed = 0;
119}
120
121/**
122 * htmlParseErrInt:
123 * @ctxt: an HTML parser context
124 * @error: the error number
125 * @msg: the error message
126 * @val: integer info
127 *
128 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
129 */
130static void
131htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
132 const char *msg, int val)
133{
Daniel Veillard157fee02003-10-31 10:36:03 +0000134 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
135 (ctxt->instate == XML_PARSER_EOF))
136 return;
Daniel Veillardf403d292003-10-05 13:51:35 +0000137 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000138 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000139 XML_ERR_ERROR, NULL, 0, NULL, NULL,
140 NULL, val, 0, msg, val);
141 ctxt->wellFormed = 0;
142}
143
144/************************************************************************
145 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000146 * Parser stacks related functions and macros *
147 * *
148 ************************************************************************/
149
Daniel Veillard1c732d22002-11-30 11:22:59 +0000150/**
151 * htmlnamePush:
152 * @ctxt: an HTML parser context
153 * @value: the element name
154 *
155 * Pushes a new element name on top of the name stack
156 *
157 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000158 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000159static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000160htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000161{
162 if (ctxt->nameNr >= ctxt->nameMax) {
163 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000165 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166 ctxt->nameMax *
167 sizeof(ctxt->nameTab[0]));
168 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000169 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 return (0);
171 }
172 }
173 ctxt->nameTab[ctxt->nameNr] = value;
174 ctxt->name = value;
175 return (ctxt->nameNr++);
176}
177/**
178 * htmlnamePop:
179 * @ctxt: an HTML parser context
180 *
181 * Pops the top element name from the name stack
182 *
183 * Returns the name just removed
184 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000185static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000186htmlnamePop(htmlParserCtxtPtr ctxt)
187{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000188 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000189
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190 if (ctxt->nameNr <= 0)
191 return (0);
192 ctxt->nameNr--;
193 if (ctxt->nameNr < 0)
194 return (0);
195 if (ctxt->nameNr > 0)
196 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
197 else
198 ctxt->name = NULL;
199 ret = ctxt->nameTab[ctxt->nameNr];
200 ctxt->nameTab[ctxt->nameNr] = 0;
201 return (ret);
202}
Owen Taylor3473f882001-02-23 17:55:21 +0000203
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named "ctxt" pointing to
 * the parser context to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed (an lvalue).
 *   CUR      the current xmlChar value as an int. It should be used
 *            internally by the parser only to compare to ASCII values,
 *            otherwise it would break when running with UTF-8 encoding.
 *   CURRENT  same value as CUR.
 *   RAW      the current xmlChar, or -1 when ctxt->token is set.
 *   NXT(n)   the n'th next xmlChar. Same as CUR it should be used only
 *            to compare on ASCII based substrings.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skip n xmlChars, and must be used only to skip ASCII defined
 *            strings without newlines within the parser (the column is
 *            advanced by n, the line is never updated). The argument is
 *            evaluated three times, so it must have no side effects.
 *
 * Stream handling:
 *
 *   SHRINK       call xmlParserInputShrink() once more than 2 * INPUT_CHUNK
 *                bytes precede the current position and fewer than
 *                2 * INPUT_CHUNK bytes remain.
 *   GROW         in non progressive mode, call xmlParserInputGrow() when
 *                fewer than INPUT_CHUNK bytes remain.
 *   SKIP_BLANKS  skip blanks through htmlSkipBlankChars().
 *   NEXT         skip to the next character through xmlNextChar().
 *   NEXTL(l)     skip the current character of l xmlChars long, updating
 *                line/column and resetting ctxt->token; nbChars is
 *                incremented by one whatever the value of l.
 *   CUR_CHAR(l)  current character through htmlCurrentChar(), its length
 *                is stored in l.
 *   CUR_SCHAR(s, l)  current character of string s through
 *                xmlStringCurrentChar(), its length is stored in l.
 *   COPY_BUF(l,b,i,v)  append character v of encoded length l to buffer b
 *                at index i and advance i accordingly.
 *
 * The statement-like macros are wrapped in do { } while (0) so that they
 * behave as a single statement, e.g. in an unbraced if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
285
286/**
287 * htmlCurrentChar:
288 * @ctxt: the HTML parser context
289 * @len: pointer to the length of the char read
290 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000291 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000292 * bytes in the input buffer. Implement the end of line normalization:
293 * 2.11 End-of-Line Handling
294 * If the encoding is unspecified, in the case we find an ISO-Latin-1
295 * char, then the encoding converter is plugged in automatically.
296 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000297 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000298 */
299
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000300static int
Owen Taylor3473f882001-02-23 17:55:21 +0000301htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
302 if (ctxt->instate == XML_PARSER_EOF)
303 return(0);
304
305 if (ctxt->token != 0) {
306 *len = 0;
307 return(ctxt->token);
308 }
309 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
310 /*
311 * We are supposed to handle UTF8, check it's valid
312 * From rfc2044: encoding of the Unicode values on UTF-8:
313 *
314 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
315 * 0000 0000-0000 007F 0xxxxxxx
316 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
317 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
318 *
319 * Check for the 0x110000 limit too
320 */
321 const unsigned char *cur = ctxt->input->cur;
322 unsigned char c;
323 unsigned int val;
324
325 c = *cur;
326 if (c & 0x80) {
327 if (cur[1] == 0)
328 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
329 if ((cur[1] & 0xc0) != 0x80)
330 goto encoding_error;
331 if ((c & 0xe0) == 0xe0) {
332
333 if (cur[2] == 0)
334 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
335 if ((cur[2] & 0xc0) != 0x80)
336 goto encoding_error;
337 if ((c & 0xf0) == 0xf0) {
338 if (cur[3] == 0)
339 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
340 if (((c & 0xf8) != 0xf0) ||
341 ((cur[3] & 0xc0) != 0x80))
342 goto encoding_error;
343 /* 4-byte code */
344 *len = 4;
345 val = (cur[0] & 0x7) << 18;
346 val |= (cur[1] & 0x3f) << 12;
347 val |= (cur[2] & 0x3f) << 6;
348 val |= cur[3] & 0x3f;
349 } else {
350 /* 3-byte code */
351 *len = 3;
352 val = (cur[0] & 0xf) << 12;
353 val |= (cur[1] & 0x3f) << 6;
354 val |= cur[2] & 0x3f;
355 }
356 } else {
357 /* 2-byte code */
358 *len = 2;
359 val = (cur[0] & 0x1f) << 6;
360 val |= cur[1] & 0x3f;
361 }
362 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000363 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
364 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000365 }
366 return(val);
367 } else {
368 /* 1-byte code */
369 *len = 1;
370 return((int) *ctxt->input->cur);
371 }
372 }
373 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000374 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000375 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000376 * XML constructs only use < 128 chars
377 */
378 *len = 1;
379 if ((int) *ctxt->input->cur < 0x80)
380 return((int) *ctxt->input->cur);
381
382 /*
383 * Humm this is bad, do an automatic flow conversion
384 */
385 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
386 ctxt->charset = XML_CHAR_ENCODING_UTF8;
387 return(xmlCurrentChar(ctxt, len));
388
389encoding_error:
390 /*
391 * If we detect an UTF8 error that probably mean that the
392 * input encoding didn't get properly advertized in the
393 * declaration header. Report the error and switch the encoding
394 * to ISO-Latin-1 (if you don't like this policy, just declare the
395 * encoding !)
396 */
Daniel Veillardf403d292003-10-05 13:51:35 +0000397 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
398 "Input is not proper UTF-8, indicate encoding !\n",
399 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000400 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000401 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
402 ctxt->input->cur[0], ctxt->input->cur[1],
403 ctxt->input->cur[2], ctxt->input->cur[3]);
404 }
405
406 ctxt->charset = XML_CHAR_ENCODING_8859_1;
407 *len = 1;
408 return((int) *ctxt->input->cur);
409}
410
411/**
Owen Taylor3473f882001-02-23 17:55:21 +0000412 * htmlSkipBlankChars:
413 * @ctxt: the HTML parser context
414 *
415 * skip all blanks character found at that point in the input streams.
416 *
417 * Returns the number of space chars skipped
418 */
419
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000420static int
Owen Taylor3473f882001-02-23 17:55:21 +0000421htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
422 int res = 0;
423
William M. Brack76e95df2003-10-18 16:20:14 +0000424 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000425 if ((*ctxt->input->cur == 0) &&
426 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
427 xmlPopInput(ctxt);
428 } else {
429 if (*(ctxt->input->cur) == '\n') {
430 ctxt->input->line++; ctxt->input->col = 1;
431 } else ctxt->input->col++;
432 ctxt->input->cur++;
433 ctxt->nbChars++;
434 if (*ctxt->input->cur == 0)
435 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
436 }
437 res++;
438 }
439 return(res);
440}
441
442
443
444/************************************************************************
445 * *
446 * The list of HTML elements and their properties *
447 * *
448 ************************************************************************/
449
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000462
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each macro below expands to a comma separated list of string literals
 * without a trailing comma (PCDATA and MODIFIER expand to nothing).
 * Lists built out of other lists must join them with explicit commas:
 * adjacent string literals are concatenated by the compiler, so a missing
 * comma silently merges two names into one bogus entry.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


/* Every array below is a NULL terminated list of names. */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* option_elt[] = { "option", NULL } ;
static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* width_attr[] = { "width", NULL } ;
static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* language_attr[] = { "language", NULL } ;
static const char* select_content[] = { "optgroup", "option", NULL } ;
static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* table_depr[] = { "align", "bgcolor", NULL } ;
static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* tr_elt[] = { "tr", NULL } ;
static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* tr_contents[] = { "th", "td", NULL } ;
static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* li_elt[] = { "li", NULL } ;
static const char* ul_depr[] = { "type", "compact", NULL} ;
static const char* dir_attr[] = { "dir", NULL} ;
593
594#define DECL (const char**)
595
Daniel Veillard22090732001-07-16 00:06:07 +0000596static const htmlElemDesc
597html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000598{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
599 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
600},
601{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
602 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
603},
604{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
605 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
606},
607{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
608 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
609},
610{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
611 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
612},
613{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
614 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
615},
616{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
617 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
618},
619{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
620 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
621},
622{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
623 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
624},
625{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
626 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
627},
628{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
629 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
630},
631{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
632 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
633},
634{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
635 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
636},
637{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
638 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
639},
640{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
641 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
642},
643{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
644 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
645},
646{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
647 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
648},
649{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
650 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
651},
652{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
653 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
654},
655{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
656 EMPTY , NULL , DECL col_attrs , NULL, NULL
657},
658{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
659 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
660},
661{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
662 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
663},
664{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
665 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
666},
667{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
668 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
669},
670{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
671 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
672},
673{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
674 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
675},
676{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
677 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
678},
679{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
680 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
681},
682{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
683 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
684},
685{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
686 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
687},
688{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
689 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
690},
691{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
692 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
693},
694{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
695 EMPTY, NULL, NULL, DECL frame_attrs, NULL
696},
697{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
698 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
699},
700{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
701 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
702},
703{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
704 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
705},
706{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
707 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
708},
709{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
710 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
711},
712{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
713 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
714},
715{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
716 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
717},
718{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
719 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
720},
721{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
722 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
723},
724{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
725 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
726},
727{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
728 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
729},
730{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
731 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
732},
733{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
734 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
735},
736{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
737 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
738},
739{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
740 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
741},
742{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
743 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
744},
745{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
746 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
747},
748{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
749 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
750},
751{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
752 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
753},
754{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
755 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
756},
757{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
758 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
759},
760{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
761 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
762},
763{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
764 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
765},
766{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
767 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
768},
769{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
770 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
771},
772{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
773 DECL html_flow, "div", DECL html_attrs, NULL, NULL
774},
775{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
776 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
777},
778{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
779 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
780},
781{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
782 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
783},
784{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
785 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
786},
787{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
788 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
789},
790{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
791 EMPTY, NULL, DECL param_attrs, NULL, name_attr
792},
793{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
794 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
795},
796{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
797 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
798},
799{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
800 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
801},
802{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
803 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
804},
805{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
806 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
807},
808{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
809 DECL select_content, NULL, DECL select_attrs, NULL, NULL
810},
811{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
812 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
813},
814{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
815 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
816},
817{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
818 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
819},
820{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
821 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
822},
823{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
824 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
825},
826{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
827 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
828},
829{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
830 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
831},
832{ "table", 0, 0, 0, 0, 0, 0, 0, "",
833 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
834},
835{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
836 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
837},
838{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
839 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
840},
841{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
842 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
843},
844{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
845 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
846},
847{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
848 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
849},
850{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
851 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
852},
853{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
854 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
855},
856{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
857 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
858},
859{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
860 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
861},
862{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
863 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
864},
865{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
866 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
867},
868{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
869 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
870}
Owen Taylor3473f882001-02-23 17:55:21 +0000871};
872
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a flat list of NULL-terminated groups: the first string of
 * a group is a start tag, the strings after it are the open elements that
 * this start tag closes implicitly. A NULL in the position where a group
 * would start ends the table.
 */
static const char *htmlStartClose[] = {
    "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head", "p", NULL,
    "title", "p", NULL,
    "body", "head", "style", "link", "title", "p", NULL,
    "frameset", "head", "style", "link", "title", "p", NULL,
    "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr", "p", "head", NULL,
    "h1", "p", "head", NULL,
    "h2", "p", "head", NULL,
    "h3", "p", "head", NULL,
    "h4", "p", "head", NULL,
    "h5", "p", "head", NULL,
    "h6", "p", "head", NULL,
    "dir", "p", "head", NULL,
    "address", "p", "head", "ul", NULL,
    "pre", "p", "head", "ul", NULL,
    "listing", "p", "head", NULL,
    "xmp", "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol", "p", "head", "ul", NULL,
    "menu", "p", "head", "ul", NULL,
    "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div", "p", "head", NULL,
    "noscript", "p", "head", NULL,
    "center", "font", "b", "i", "p", "head", NULL,
    "a", "a", NULL,
    "caption", "p", NULL,
    "colgroup", "caption", "colgroup", "col", "p", NULL,
    "col", "caption", "col", "p", NULL,
    "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead", "caption", "col", "colgroup", NULL,
    "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup", "option", NULL,
    "option", "option", NULL,
    "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL
};
933
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html",
    "head",
    "body",
    NULL
};
947
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no terminator; iterate it using its sizeof-derived length.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
973
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* One row of the end-tag priority table: an element name and its priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The row with a NULL name terminates the table; its priority is the
 * value used for every element that is not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001001
/*
 * htmlStartCloseIndex holds pointers to the first string of each group of
 * htmlStartClose; it is filled in by htmlInitAutoClose(), which sets
 * htmlStartCloseIndexinitialized to 1 once the index is built.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1004
/************************************************************************
 *                                                                      *
 *              functions to handle HTML specific data                  *
 *                                                                      *
 ************************************************************************/
1010
1011/**
1012 * htmlInitAutoClose:
1013 *
1014 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1015 * This is not reentrant. Call xmlInitParser() once before processing in
1016 * case of use in multithreaded programs.
1017 */
1018void
1019htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001020 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001021
1022 if (htmlStartCloseIndexinitialized) return;
1023
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001024 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1025 indx = 0;
1026 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1027 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001028 while (htmlStartClose[i] != NULL) i++;
1029 i++;
1030 }
1031 htmlStartCloseIndexinitialized = 1;
1032}
1033
1034/**
1035 * htmlTagLookup:
1036 * @tag: The tag name in lowercase
1037 *
1038 * Lookup the HTML tag in the ElementTable
1039 *
1040 * Returns the related htmlElemDescPtr or NULL if not found.
1041 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001042const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001043htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001044 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001045
1046 for (i = 0; i < (sizeof(html40ElementTable) /
1047 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001048 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001049 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001050 }
1051 return(NULL);
1052}
1053
1054/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001055 * htmlGetEndPriority:
1056 * @name: The name of the element to look up the priority for.
1057 *
1058 * Return value: The "endtag" priority.
1059 **/
1060static int
1061htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001062 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001063
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001064 while ((htmlEndPriority[i].name != NULL) &&
1065 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1066 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001067
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001068 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001069}
1070
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001071
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001072/**
Owen Taylor3473f882001-02-23 17:55:21 +00001073 * htmlCheckAutoClose:
1074 * @newtag: The new tag name
1075 * @oldtag: The old tag name
1076 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001077 * Checks whether the new tag is one of the registered valid tags for
1078 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001079 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1080 *
1081 * Returns 0 if no, 1 if yes.
1082 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001083static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001084htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1085{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001086 int i, indx;
1087 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001088
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001089 if (htmlStartCloseIndexinitialized == 0)
1090 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001091
1092 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001093 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001094 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001095 if (closed == NULL)
1096 return (0);
1097 if (xmlStrEqual(BAD_CAST * closed, newtag))
1098 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001099 }
1100
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001101 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001102 i++;
1103 while (htmlStartClose[i] != NULL) {
1104 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001105 return (1);
1106 }
1107 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001108 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001109 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001110}
1111
1112/**
1113 * htmlAutoCloseOnClose:
1114 * @ctxt: an HTML parser context
1115 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001116 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001117 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001118 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001119 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001120static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001121htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1122{
1123 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001124 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001125
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001126 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001127
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001128 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001129
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001130 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1131 break;
1132 /*
1133 * A missplaced endtag can only close elements with lower
1134 * or equal priority, so if we find an element with higher
1135 * priority before we find an element with
1136 * matching name, we just ignore this endtag
1137 */
1138 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1139 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001140 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001141 if (i < 0)
1142 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001143
1144 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001145 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001146 if ((info != NULL) && (info->endTag == 3)) {
1147 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1148 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001149 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001150 }
1151 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1152 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001153 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001154 }
1155}
1156
1157/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001158 * htmlAutoCloseOnEnd:
1159 * @ctxt: an HTML parser context
1160 *
1161 * Close all remaining tags at the end of the stream
1162 */
1163static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001164htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1165{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001166 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001167
William M. Brack899e64a2003-09-26 18:03:42 +00001168 if (ctxt->nameNr == 0)
1169 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001170 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001171 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1172 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001173 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001174 }
1175}
1176
1177/**
Owen Taylor3473f882001-02-23 17:55:21 +00001178 * htmlAutoClose:
1179 * @ctxt: an HTML parser context
1180 * @newtag: The new tag name or NULL
1181 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001182 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001183 * The list is kept in htmlStartClose array. This function is
1184 * called when a new tag has been detected and generates the
1185 * appropriates closes if possible/needed.
1186 * If newtag is NULL this mean we are at the end of the resource
1187 * and we should check
1188 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001189static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001190htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001192 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001193 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001196 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001197 }
1198 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001199 htmlAutoCloseOnEnd(ctxt);
1200 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001201 }
1202 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001203 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1204 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1205 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001206 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1207 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001208 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001209 }
Owen Taylor3473f882001-02-23 17:55:21 +00001210}
1211
1212/**
1213 * htmlAutoCloseTag:
1214 * @doc: the HTML document
1215 * @name: The tag name
1216 * @elem: the HTML element
1217 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001218 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001219 * The list is kept in htmlStartClose array. This function checks
1220 * if the element or one of it's children would autoclose the
1221 * given tag.
1222 *
1223 * Returns 1 if autoclose, 0 otherwise
1224 */
1225int
1226htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1227 htmlNodePtr child;
1228
1229 if (elem == NULL) return(1);
1230 if (xmlStrEqual(name, elem->name)) return(0);
1231 if (htmlCheckAutoClose(elem->name, name)) return(1);
1232 child = elem->children;
1233 while (child != NULL) {
1234 if (htmlAutoCloseTag(doc, name, child)) return(1);
1235 child = child->next;
1236 }
1237 return(0);
1238}
1239
1240/**
1241 * htmlIsAutoClosed:
1242 * @doc: the HTML document
1243 * @elem: the HTML element
1244 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001245 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001246 * The list is kept in htmlStartClose array. This function checks
1247 * if a tag is autoclosed by one of it's child
1248 *
1249 * Returns 1 if autoclosed, 0 otherwise
1250 */
1251int
1252htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1253 htmlNodePtr child;
1254
1255 if (elem == NULL) return(1);
1256 child = elem->children;
1257 while (child != NULL) {
1258 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1259 child = child->next;
1260 }
1261 return(0);
1262}
1263
1264/**
1265 * htmlCheckImplied:
1266 * @ctxt: an HTML parser context
1267 * @newtag: The new tag name
1268 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001269 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001270 * called when a new tag has been detected and generates the
1271 * appropriates implicit tags if missing
1272 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001273static void
Owen Taylor3473f882001-02-23 17:55:21 +00001274htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1275 if (!htmlOmittedDefaultValue)
1276 return;
1277 if (xmlStrEqual(newtag, BAD_CAST"html"))
1278 return;
1279 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001280 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001281 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1282 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1283 }
1284 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1285 return;
1286 if ((ctxt->nameNr <= 1) &&
1287 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1288 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1289 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1290 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1291 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1292 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1293 /*
1294 * dropped OBJECT ... i you put it first BODY will be
1295 * assumed !
1296 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001298 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1299 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1300 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1301 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1302 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1303 int i;
1304 for (i = 0;i < ctxt->nameNr;i++) {
1305 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1306 return;
1307 }
1308 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1309 return;
1310 }
1311 }
1312
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001313 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001314 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1315 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1316 }
1317}
1318
1319/**
1320 * htmlCheckParagraph
1321 * @ctxt: an HTML parser context
1322 *
1323 * Check whether a p element need to be implied before inserting
1324 * characters in the current element.
1325 *
1326 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1327 * in case of error.
1328 */
1329
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001330static int
Owen Taylor3473f882001-02-23 17:55:21 +00001331htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1332 const xmlChar *tag;
1333 int i;
1334
1335 if (ctxt == NULL)
1336 return(-1);
1337 tag = ctxt->name;
1338 if (tag == NULL) {
1339 htmlAutoClose(ctxt, BAD_CAST"p");
1340 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001341 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001342 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1343 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1344 return(1);
1345 }
1346 if (!htmlOmittedDefaultValue)
1347 return(0);
1348 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1349 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001350 htmlAutoClose(ctxt, BAD_CAST"p");
1351 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001352 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001353 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1354 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1355 return(1);
1356 }
1357 }
1358 return(0);
1359}
1360
1361/**
1362 * htmlIsScriptAttribute:
1363 * @name: an attribute name
1364 *
1365 * Check if an attribute is of content type Script
1366 *
1367 * Returns 1 is the attribute is a script 0 otherwise
1368 */
1369int
1370htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001371 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001372
1373 if (name == NULL)
1374 return(0);
1375 /*
1376 * all script attributes start with 'on'
1377 */
1378 if ((name[0] != 'o') || (name[1] != 'n'))
1379 return(0);
1380 for (i = 0;
1381 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1382 i++) {
1383 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1384 return(1);
1385 }
1386 return(0);
1387}
1388
/************************************************************************
 *                                                                      *
 *              The list of HTML predefined entities                    *
 *                                                                      *
 ************************************************************************/
1394
1395
Daniel Veillard22090732001-07-16 00:06:07 +00001396static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001397/*
1398 * the 4 absolute ones, plus apostrophe.
1399 */
1400{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1401{ 38, "amp", "ampersand, U+0026 ISOnum" },
1402{ 39, "apos", "single quote" },
1403{ 60, "lt", "less-than sign, U+003C ISOnum" },
1404{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1405
1406/*
1407 * A bunch still in the 128-255 range
1408 * Replacing them depend really on the charset used.
1409 */
1410{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1411{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1412{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1413{ 163, "pound","pound sign, U+00A3 ISOnum" },
1414{ 164, "curren","currency sign, U+00A4 ISOnum" },
1415{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1416{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1417{ 167, "sect", "section sign, U+00A7 ISOnum" },
1418{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1419{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1420{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1421{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1422{ 172, "not", "not sign, U+00AC ISOnum" },
1423{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1424{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1425{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1426{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1427{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1428{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1429{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1430{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1431{ 181, "micro","micro sign, U+00B5 ISOnum" },
1432{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1433{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1434{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1435{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1436{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1437{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1438{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1439{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1440{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1441{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1442{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1443{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1444{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1445{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1446{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1447{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1448{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1449{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1450{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1451{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1452{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1453{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1454{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1455{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1456{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1457{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1458{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1459{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1460{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1461{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1462{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1463{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1464{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1465{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1466{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1467{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1468{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1469{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1470{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1471{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1472{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1473{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1474{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1475{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1476{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1477{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1478{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1479{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1480{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1481{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1482{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1483{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1484{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1485{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1486{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1487{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1488{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1489{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1490{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1491{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1492{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1493{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1494{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1495{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1496{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1497{ 247, "divide","division sign, U+00F7 ISOnum" },
1498{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1499{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1500{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1501{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1502{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1503{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1504{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1505{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1506
1507{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1508{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1509{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1510{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1511{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1512
1513/*
1514 * Anything below should really be kept as entities references
1515 */
1516{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1517
1518{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1519{ 732, "tilde","small tilde, U+02DC ISOdia" },
1520
1521{ 913, "Alpha","greek capital letter alpha, U+0391" },
1522{ 914, "Beta", "greek capital letter beta, U+0392" },
1523{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1524{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1525{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1526{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1527{ 919, "Eta", "greek capital letter eta, U+0397" },
1528{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1529{ 921, "Iota", "greek capital letter iota, U+0399" },
1530{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001531{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001532{ 924, "Mu", "greek capital letter mu, U+039C" },
1533{ 925, "Nu", "greek capital letter nu, U+039D" },
1534{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1535{ 927, "Omicron","greek capital letter omicron, U+039F" },
1536{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1537{ 929, "Rho", "greek capital letter rho, U+03A1" },
1538{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1539{ 932, "Tau", "greek capital letter tau, U+03A4" },
1540{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1541{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1542{ 935, "Chi", "greek capital letter chi, U+03A7" },
1543{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1544{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1545
1546{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1547{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1548{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1549{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1550{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1551{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1552{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1553{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1554{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1555{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1556{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1557{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1558{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1559{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1560{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1561{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1562{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1563{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1564{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1565{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1566{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1567{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1568{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1569{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1570{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1571{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1572{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1573{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1574
1575{ 8194, "ensp", "en space, U+2002 ISOpub" },
1576{ 8195, "emsp", "em space, U+2003 ISOpub" },
1577{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1578{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1579{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1580{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1581{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1582{ 8211, "ndash","en dash, U+2013 ISOpub" },
1583{ 8212, "mdash","em dash, U+2014 ISOpub" },
1584{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1585{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1586{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1587{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1588{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1589{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1590{ 8224, "dagger","dagger, U+2020 ISOpub" },
1591{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1592
1593{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1594{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1595
1596{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1597
1598{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1599{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1600
1601{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1602{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1603
1604{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1605{ 8260, "frasl","fraction slash, U+2044 NEW" },
1606
1607{ 8364, "euro", "euro sign, U+20AC NEW" },
1608
1609{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1610{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1611{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1612{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1613{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1614{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1615{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1616{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1617{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1618{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1619{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1620{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1621{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1622{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1623{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1624{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1625
1626{ 8704, "forall","for all, U+2200 ISOtech" },
1627{ 8706, "part", "partial differential, U+2202 ISOtech" },
1628{ 8707, "exist","there exists, U+2203 ISOtech" },
1629{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1630{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1631{ 8712, "isin", "element of, U+2208 ISOtech" },
1632{ 8713, "notin","not an element of, U+2209 ISOtech" },
1633{ 8715, "ni", "contains as member, U+220B ISOtech" },
1634{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001635{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001636{ 8722, "minus","minus sign, U+2212 ISOtech" },
1637{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1638{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1639{ 8733, "prop", "proportional to, U+221D ISOtech" },
1640{ 8734, "infin","infinity, U+221E ISOtech" },
1641{ 8736, "ang", "angle, U+2220 ISOamso" },
1642{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1643{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1644{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1645{ 8746, "cup", "union = cup, U+222A ISOtech" },
1646{ 8747, "int", "integral, U+222B ISOtech" },
1647{ 8756, "there4","therefore, U+2234 ISOtech" },
1648{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1649{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1650{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1651{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1652{ 8801, "equiv","identical to, U+2261 ISOtech" },
1653{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1654{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1655{ 8834, "sub", "subset of, U+2282 ISOtech" },
1656{ 8835, "sup", "superset of, U+2283 ISOtech" },
1657{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1658{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1659{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1660{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1661{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1662{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1663{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1664{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1665{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1666{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1667{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1668{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1669{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1670{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1671
1672{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1673{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1674{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1675{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1676
1677};
1678
1679/************************************************************************
1680 * *
1681 * Commodity functions to handle entities *
1682 * *
1683 ************************************************************************/
1684
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The macro expects a variable named ctxt to be in scope (it is handed to
 * htmlErrMemory()) and may only be expanded inside a function returning a
 * pointer: when the reallocation fails the error is reported, the old
 * buffer is released and the enclosing function returns NULL.
 *
 * The result of xmlRealloc() goes through a temporary so that the original
 * allocation is still reachable, and can be freed, on failure.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,                     \
                                 buffer##_size * sizeof(xmlChar));      \
    if (growBuffer_tmp == NULL) {                                       \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
1696
1697/**
1698 * htmlEntityLookup:
1699 * @name: the entity name
1700 *
1701 * Lookup the given entity in EntitiesTable
1702 *
1703 * TODO: the linear scan is really ugly, an hash table is really needed.
1704 *
1705 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1706 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001707const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001708htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001709 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001710
1711 for (i = 0;i < (sizeof(html40EntitiesTable)/
1712 sizeof(html40EntitiesTable[0]));i++) {
1713 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001714 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001715 }
1716 }
1717 return(NULL);
1718}
1719
1720/**
1721 * htmlEntityValueLookup:
1722 * @value: the entity's unicode value
1723 *
1724 * Lookup the given entity in EntitiesTable
1725 *
1726 * TODO: the linear scan is really ugly, an hash table is really needed.
1727 *
1728 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1729 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001730const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001731htmlEntityValueLookup(unsigned int value) {
1732 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001733
1734 for (i = 0;i < (sizeof(html40EntitiesTable)/
1735 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001736 if (html40EntitiesTable[i].value >= value) {
1737 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001738 break;
William M. Brack78637da2003-07-31 14:47:38 +00001739 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001740 }
Owen Taylor3473f882001-02-23 17:55:21 +00001741 }
1742 return(NULL);
1743}
1744
1745/**
1746 * UTF8ToHtml:
1747 * @out: a pointer to an array of bytes to store the result
1748 * @outlen: the length of @out
1749 * @in: a pointer to an array of UTF-8 chars
1750 * @inlen: the length of @in
1751 *
1752 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1753 * plus HTML entities block of chars out.
1754 *
1755 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1756 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001757 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001758 * The value of @outlen after return is the number of octets consumed.
1759 */
1760int
1761UTF8ToHtml(unsigned char* out, int *outlen,
1762 const unsigned char* in, int *inlen) {
1763 const unsigned char* processed = in;
1764 const unsigned char* outend;
1765 const unsigned char* outstart = out;
1766 const unsigned char* instart = in;
1767 const unsigned char* inend;
1768 unsigned int c, d;
1769 int trailing;
1770
1771 if (in == NULL) {
1772 /*
1773 * initialization nothing to do
1774 */
1775 *outlen = 0;
1776 *inlen = 0;
1777 return(0);
1778 }
1779 inend = in + (*inlen);
1780 outend = out + (*outlen);
1781 while (in < inend) {
1782 d = *in++;
1783 if (d < 0x80) { c= d; trailing= 0; }
1784 else if (d < 0xC0) {
1785 /* trailing byte in leading position */
1786 *outlen = out - outstart;
1787 *inlen = processed - instart;
1788 return(-2);
1789 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1790 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1791 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1792 else {
1793 /* no chance for this in Ascii */
1794 *outlen = out - outstart;
1795 *inlen = processed - instart;
1796 return(-2);
1797 }
1798
1799 if (inend - in < trailing) {
1800 break;
1801 }
1802
1803 for ( ; trailing; trailing--) {
1804 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1805 break;
1806 c <<= 6;
1807 c |= d & 0x3F;
1808 }
1809
1810 /* assertion: c is a single UTF-4 value */
1811 if (c < 0x80) {
1812 if (out + 1 >= outend)
1813 break;
1814 *out++ = c;
1815 } else {
1816 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001817 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001818
1819 /*
1820 * Try to lookup a predefined HTML entity for it
1821 */
1822
1823 ent = htmlEntityValueLookup(c);
1824 if (ent == NULL) {
1825 /* no chance for this in Ascii */
1826 *outlen = out - outstart;
1827 *inlen = processed - instart;
1828 return(-2);
1829 }
1830 len = strlen(ent->name);
1831 if (out + 2 + len >= outend)
1832 break;
1833 *out++ = '&';
1834 memcpy(out, ent->name, len);
1835 out += len;
1836 *out++ = ';';
1837 }
1838 processed = in;
1839 }
1840 *outlen = out - outstart;
1841 *inlen = processed - instart;
1842 return(0);
1843}
1844
1845/**
1846 * htmlEncodeEntities:
1847 * @out: a pointer to an array of bytes to store the result
1848 * @outlen: the length of @out
1849 * @in: a pointer to an array of UTF-8 chars
1850 * @inlen: the length of @in
1851 * @quoteChar: the quote character to escape (' or ") or zero.
1852 *
1853 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1854 * plus HTML entities block of chars out.
1855 *
1856 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1857 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001858 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001859 * The value of @outlen after return is the number of octets consumed.
1860 */
1861int
1862htmlEncodeEntities(unsigned char* out, int *outlen,
1863 const unsigned char* in, int *inlen, int quoteChar) {
1864 const unsigned char* processed = in;
1865 const unsigned char* outend = out + (*outlen);
1866 const unsigned char* outstart = out;
1867 const unsigned char* instart = in;
1868 const unsigned char* inend = in + (*inlen);
1869 unsigned int c, d;
1870 int trailing;
1871
1872 while (in < inend) {
1873 d = *in++;
1874 if (d < 0x80) { c= d; trailing= 0; }
1875 else if (d < 0xC0) {
1876 /* trailing byte in leading position */
1877 *outlen = out - outstart;
1878 *inlen = processed - instart;
1879 return(-2);
1880 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1881 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1882 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1883 else {
1884 /* no chance for this in Ascii */
1885 *outlen = out - outstart;
1886 *inlen = processed - instart;
1887 return(-2);
1888 }
1889
1890 if (inend - in < trailing)
1891 break;
1892
1893 while (trailing--) {
1894 if (((d= *in++) & 0xC0) != 0x80) {
1895 *outlen = out - outstart;
1896 *inlen = processed - instart;
1897 return(-2);
1898 }
1899 c <<= 6;
1900 c |= d & 0x3F;
1901 }
1902
1903 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001904 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1905 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001906 if (out >= outend)
1907 break;
1908 *out++ = c;
1909 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001910 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001911 const char *cp;
1912 char nbuf[16];
1913 int len;
1914
1915 /*
1916 * Try to lookup a predefined HTML entity for it
1917 */
1918 ent = htmlEntityValueLookup(c);
1919 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001920 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001921 cp = nbuf;
1922 }
1923 else
1924 cp = ent->name;
1925 len = strlen(cp);
1926 if (out + 2 + len > outend)
1927 break;
1928 *out++ = '&';
1929 memcpy(out, cp, len);
1930 out += len;
1931 *out++ = ';';
1932 }
1933 processed = in;
1934 }
1935 *outlen = out - outstart;
1936 *inlen = processed - instart;
1937 return(0);
1938}
1939
Owen Taylor3473f882001-02-23 17:55:21 +00001940/************************************************************************
1941 * *
1942 * Commodity functions to handle streams *
1943 * *
1944 ************************************************************************/
1945
1946/**
Owen Taylor3473f882001-02-23 17:55:21 +00001947 * htmlNewInputStream:
1948 * @ctxt: an HTML parser context
1949 *
1950 * Create a new input stream structure
1951 * Returns the new input stream or NULL
1952 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001953static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001954htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1955 htmlParserInputPtr input;
1956
1957 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1958 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001959 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001960 return(NULL);
1961 }
1962 memset(input, 0, sizeof(htmlParserInput));
1963 input->filename = NULL;
1964 input->directory = NULL;
1965 input->base = NULL;
1966 input->cur = NULL;
1967 input->buf = NULL;
1968 input->line = 1;
1969 input->col = 1;
1970 input->buf = NULL;
1971 input->free = NULL;
1972 input->version = NULL;
1973 input->consumed = 0;
1974 input->length = 0;
1975 return(input);
1976}
1977
1978
1979/************************************************************************
1980 * *
1981 * Commodity functions, cleanup needed ? *
1982 * *
1983 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001998
1999/**
2000 * areBlanks:
2001 * @ctxt: an HTML parser context
2002 * @str: a xmlChar *
2003 * @len: the size of @str
2004 *
2005 * Is this a sequence of blank chars that one can ignore ?
2006 *
2007 * Returns 1 if ignorable 0 otherwise.
2008 */
2009
2010static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002011 unsigned int i;
2012 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002013 xmlNodePtr lastChild;
2014
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002015 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002016 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002017
2018 if (CUR == 0) return(1);
2019 if (CUR != '<') return(0);
2020 if (ctxt->name == NULL)
2021 return(1);
2022 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2023 return(1);
2024 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2025 return(1);
2026 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2027 return(1);
2028 if (ctxt->node == NULL) return(0);
2029 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002030 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2031 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002032 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002033 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2034 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002035 /* keep ws in constructs like ...<b> </b>...
2036 for all tags "b" allowing PCDATA */
2037 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2038 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2039 return(0);
2040 }
2041 }
Owen Taylor3473f882001-02-23 17:55:21 +00002042 } else if (xmlNodeIsText(lastChild)) {
2043 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002044 } else {
2045 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2046 for all tags "p" allowing PCDATA */
2047 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2048 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2049 return(0);
2050 }
2051 }
Owen Taylor3473f882001-02-23 17:55:21 +00002052 }
2053 return(1);
2054}
2055
2056/**
Owen Taylor3473f882001-02-23 17:55:21 +00002057 * htmlNewDocNoDtD:
2058 * @URI: URI for the dtd, or NULL
2059 * @ExternalID: the external ID of the DTD, or NULL
2060 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002061 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2062 * are NULL
2063 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002064 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002065 */
2066htmlDocPtr
2067htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2068 xmlDocPtr cur;
2069
2070 /*
2071 * Allocate a new document and fill the fields.
2072 */
2073 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2074 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002075 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002076 return(NULL);
2077 }
2078 memset(cur, 0, sizeof(xmlDoc));
2079
2080 cur->type = XML_HTML_DOCUMENT_NODE;
2081 cur->version = NULL;
2082 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002083 cur->doc = cur;
2084 cur->name = NULL;
2085 cur->children = NULL;
2086 cur->extSubset = NULL;
2087 cur->oldNs = NULL;
2088 cur->encoding = NULL;
2089 cur->standalone = 1;
2090 cur->compression = 0;
2091 cur->ids = NULL;
2092 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002093 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002094 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002095 if ((ExternalID != NULL) ||
2096 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002097 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002098 return(cur);
2099}
2100
2101/**
2102 * htmlNewDoc:
2103 * @URI: URI for the dtd, or NULL
2104 * @ExternalID: the external ID of the DTD, or NULL
2105 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002106 * Creates a new HTML document
2107 *
Owen Taylor3473f882001-02-23 17:55:21 +00002108 * Returns a new document
2109 */
2110htmlDocPtr
2111htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2112 if ((URI == NULL) && (ExternalID == NULL))
2113 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002114 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2115 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002116
2117 return(htmlNewDocNoDtD(URI, ExternalID));
2118}
2119
2120
2121/************************************************************************
2122 * *
2123 * The parser itself *
2124 * Relates to http://www.w3.org/TR/html40 *
2125 * *
2126 ************************************************************************/
2127
2128/************************************************************************
2129 * *
2130 * The parser itself *
2131 * *
2132 ************************************************************************/
2133
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002134static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002135
Owen Taylor3473f882001-02-23 17:55:21 +00002136/**
2137 * htmlParseHTMLName:
2138 * @ctxt: an HTML parser context
2139 *
2140 * parse an HTML tag or attribute name, note that we convert it to lowercase
2141 * since HTML names are not case-sensitive.
2142 *
2143 * Returns the Tag Name parsed or NULL
2144 */
2145
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002146static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002147htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002148 int i = 0;
2149 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2150
William M. Brack76e95df2003-10-18 16:20:14 +00002151 if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002152 (CUR != ':')) return(NULL);
2153
2154 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brack76e95df2003-10-18 16:20:14 +00002155 ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002156 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2157 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2158 else loc[i] = CUR;
2159 i++;
2160
2161 NEXT;
2162 }
2163
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002164 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002165}
2166
2167/**
2168 * htmlParseName:
2169 * @ctxt: an HTML parser context
2170 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002171 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002172 *
2173 * Returns the Name parsed or NULL
2174 */
2175
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002176static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002177htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002178 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002179 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002180 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002181
2182 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002183
2184 /*
2185 * Accelerator for simple ASCII names
2186 */
2187 in = ctxt->input->cur;
2188 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2189 ((*in >= 0x41) && (*in <= 0x5A)) ||
2190 (*in == '_') || (*in == ':')) {
2191 in++;
2192 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2193 ((*in >= 0x41) && (*in <= 0x5A)) ||
2194 ((*in >= 0x30) && (*in <= 0x39)) ||
2195 (*in == '_') || (*in == '-') ||
2196 (*in == ':') || (*in == '.'))
2197 in++;
2198 if ((*in > 0) && (*in < 0x80)) {
2199 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002200 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002201 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002202 ctxt->nbChars += count;
2203 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002204 return(ret);
2205 }
2206 }
2207 return(htmlParseNameComplex(ctxt));
2208}
2209
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002210static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002211htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002212 int len = 0, l;
2213 int c;
2214 int count = 0;
2215
2216 /*
2217 * Handler for more complex cases
2218 */
2219 GROW;
2220 c = CUR_CHAR(l);
2221 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2222 (!IS_LETTER(c) && (c != '_') &&
2223 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002224 return(NULL);
2225 }
2226
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002227 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2228 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2229 (c == '.') || (c == '-') ||
2230 (c == '_') || (c == ':') ||
2231 (IS_COMBINING(c)) ||
2232 (IS_EXTENDER(c)))) {
2233 if (count++ > 100) {
2234 count = 0;
2235 GROW;
2236 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002237 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002238 NEXTL(l);
2239 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002240 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002241 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002242}
2243
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002244
Owen Taylor3473f882001-02-23 17:55:21 +00002245/**
2246 * htmlParseHTMLAttribute:
2247 * @ctxt: an HTML parser context
2248 * @stop: a char stop value
2249 *
2250 * parse an HTML attribute value till the stop (quote), if
2251 * stop is 0 then it stops at the first space
2252 *
2253 * Returns the attribute parsed or NULL
2254 */
2255
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002256static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002257htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2258 xmlChar *buffer = NULL;
2259 int buffer_size = 0;
2260 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002261 const xmlChar *name = NULL;
2262 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002263 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002264
2265 /*
2266 * allocate a translation buffer.
2267 */
2268 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002269 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002270 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002271 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002272 return(NULL);
2273 }
2274 out = buffer;
2275
2276 /*
2277 * Ok loop until we reach one of the ending chars
2278 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002279 while ((CUR != 0) && (CUR != stop)) {
2280 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002281 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002282 if (CUR == '&') {
2283 if (NXT(1) == '#') {
2284 unsigned int c;
2285 int bits;
2286
2287 c = htmlParseCharRef(ctxt);
2288 if (c < 0x80)
2289 { *out++ = c; bits= -6; }
2290 else if (c < 0x800)
2291 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2292 else if (c < 0x10000)
2293 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2294 else
2295 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2296
2297 for ( ; bits >= 0; bits-= 6) {
2298 *out++ = ((c >> bits) & 0x3F) | 0x80;
2299 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002300
2301 if (out - buffer > buffer_size - 100) {
2302 int indx = out - buffer;
2303
2304 growBuffer(buffer);
2305 out = &buffer[indx];
2306 }
Owen Taylor3473f882001-02-23 17:55:21 +00002307 } else {
2308 ent = htmlParseEntityRef(ctxt, &name);
2309 if (name == NULL) {
2310 *out++ = '&';
2311 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002312 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002313
2314 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002315 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002316 }
2317 } else if (ent == NULL) {
2318 *out++ = '&';
2319 cur = name;
2320 while (*cur != 0) {
2321 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002322 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002323
2324 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002325 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002326 }
2327 *out++ = *cur++;
2328 }
Owen Taylor3473f882001-02-23 17:55:21 +00002329 } else {
2330 unsigned int c;
2331 int bits;
2332
2333 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002334 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002335
2336 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002337 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002338 }
2339 c = (xmlChar)ent->value;
2340 if (c < 0x80)
2341 { *out++ = c; bits= -6; }
2342 else if (c < 0x800)
2343 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2344 else if (c < 0x10000)
2345 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2346 else
2347 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2348
2349 for ( ; bits >= 0; bits-= 6) {
2350 *out++ = ((c >> bits) & 0x3F) | 0x80;
2351 }
Owen Taylor3473f882001-02-23 17:55:21 +00002352 }
2353 }
2354 } else {
2355 unsigned int c;
2356 int bits, l;
2357
2358 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002359 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002360
2361 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002362 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002363 }
2364 c = CUR_CHAR(l);
2365 if (c < 0x80)
2366 { *out++ = c; bits= -6; }
2367 else if (c < 0x800)
2368 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2369 else if (c < 0x10000)
2370 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2371 else
2372 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2373
2374 for ( ; bits >= 0; bits-= 6) {
2375 *out++ = ((c >> bits) & 0x3F) | 0x80;
2376 }
2377 NEXT;
2378 }
2379 }
2380 *out++ = 0;
2381 return(buffer);
2382}
2383
2384/**
Owen Taylor3473f882001-02-23 17:55:21 +00002385 * htmlParseEntityRef:
2386 * @ctxt: an HTML parser context
2387 * @str: location to store the entity name
2388 *
2389 * parse an HTML ENTITY references
2390 *
2391 * [68] EntityRef ::= '&' Name ';'
2392 *
2393 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2394 * if non-NULL *str will have to be freed by the caller.
2395 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002396const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002397htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2398 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002399 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002400 *str = NULL;
2401
2402 if (CUR == '&') {
2403 NEXT;
2404 name = htmlParseName(ctxt);
2405 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002406 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2407 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002408 } else {
2409 GROW;
2410 if (CUR == ';') {
2411 *str = name;
2412
2413 /*
2414 * Lookup the entity in the table.
2415 */
2416 ent = htmlEntityLookup(name);
2417 if (ent != NULL) /* OK that's ugly !!! */
2418 NEXT;
2419 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002420 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2421 "htmlParseEntityRef: expecting ';'\n",
2422 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002423 *str = name;
2424 }
2425 }
2426 }
2427 return(ent);
2428}
2429
2430/**
2431 * htmlParseAttValue:
2432 * @ctxt: an HTML parser context
2433 *
2434 * parse a value for an attribute
2435 * Note: the parser won't do substitution of entities here, this
2436 * will be handled later in xmlStringGetNodeList, unless it was
2437 * asked for ctxt->replaceEntities != 0
2438 *
2439 * Returns the AttValue parsed or NULL.
2440 */
2441
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002442static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002443htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2444 xmlChar *ret = NULL;
2445
2446 if (CUR == '"') {
2447 NEXT;
2448 ret = htmlParseHTMLAttribute(ctxt, '"');
2449 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002450 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2451 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002452 } else
2453 NEXT;
2454 } else if (CUR == '\'') {
2455 NEXT;
2456 ret = htmlParseHTMLAttribute(ctxt, '\'');
2457 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002458 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2459 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002460 } else
2461 NEXT;
2462 } else {
2463 /*
2464 * That's an HTMLism, the attribute value may not be quoted
2465 */
2466 ret = htmlParseHTMLAttribute(ctxt, 0);
2467 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002468 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2469 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002470 }
2471 }
2472 return(ret);
2473}
2474
2475/**
2476 * htmlParseSystemLiteral:
2477 * @ctxt: an HTML parser context
2478 *
2479 * parse an HTML Literal
2480 *
2481 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2482 *
2483 * Returns the SystemLiteral parsed or NULL
2484 */
2485
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002486static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002487htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2488 const xmlChar *q;
2489 xmlChar *ret = NULL;
2490
2491 if (CUR == '"') {
2492 NEXT;
2493 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002494 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002495 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002496 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002497 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2498 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002499 } else {
2500 ret = xmlStrndup(q, CUR_PTR - q);
2501 NEXT;
2502 }
2503 } else if (CUR == '\'') {
2504 NEXT;
2505 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002506 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002507 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002508 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002509 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2510 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002511 } else {
2512 ret = xmlStrndup(q, CUR_PTR - q);
2513 NEXT;
2514 }
2515 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002516 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2517 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002518 }
2519
2520 return(ret);
2521}
2522
2523/**
2524 * htmlParsePubidLiteral:
2525 * @ctxt: an HTML parser context
2526 *
2527 * parse an HTML public literal
2528 *
2529 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2530 *
2531 * Returns the PubidLiteral parsed or NULL.
2532 */
2533
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002534static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002535htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2536 const xmlChar *q;
2537 xmlChar *ret = NULL;
2538 /*
2539 * Name ::= (Letter | '_') (NameChar)*
2540 */
2541 if (CUR == '"') {
2542 NEXT;
2543 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002544 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002545 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002546 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2547 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002548 } else {
2549 ret = xmlStrndup(q, CUR_PTR - q);
2550 NEXT;
2551 }
2552 } else if (CUR == '\'') {
2553 NEXT;
2554 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002555 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002556 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002557 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002558 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2559 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002560 } else {
2561 ret = xmlStrndup(q, CUR_PTR - q);
2562 NEXT;
2563 }
2564 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002565 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2566 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002567 }
2568
2569 return(ret);
2570}
2571
2572/**
2573 * htmlParseScript:
2574 * @ctxt: an HTML parser context
2575 *
2576 * parse the content of an HTML SCRIPT or STYLE element
2577 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2578 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2579 * http://www.w3.org/TR/html4/types.html#type-script
2580 * http://www.w3.org/TR/html4/types.html#h-6.15
2581 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2582 *
2583 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2584 * element and the value of intrinsic event attributes. User agents must
2585 * not evaluate script data as HTML markup but instead must pass it on as
2586 * data to a script engine.
2587 * NOTES:
2588 * - The content is passed like CDATA
2589 * - the attributes for style and scripting "onXXX" are also described
2590 * as CDATA but SGML allows entities references in attributes so their
2591 * processing is identical as other attributes
2592 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002593static void
Owen Taylor3473f882001-02-23 17:55:21 +00002594htmlParseScript(htmlParserCtxtPtr ctxt) {
2595 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2596 int nbchar = 0;
2597 xmlChar cur;
2598
2599 SHRINK;
2600 cur = CUR;
William M. Brack76e95df2003-10-18 16:20:14 +00002601 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002602 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2603 (NXT(3) == '-')) {
2604 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2605 if (ctxt->sax->cdataBlock!= NULL) {
2606 /*
2607 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2608 */
2609 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002610 } else if (ctxt->sax->characters != NULL) {
2611 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002612 }
2613 }
2614 nbchar = 0;
2615 htmlParseComment(ctxt);
2616 cur = CUR;
2617 continue;
2618 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002619 /*
2620 * One should break here, the specification is clear:
2621 * Authors should therefore escape "</" within the content.
2622 * Escape mechanisms are specific to each scripting or
2623 * style sheet language.
2624 */
2625 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2626 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2627 break; /* while */
2628 }
2629 buf[nbchar++] = cur;
2630 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2631 if (ctxt->sax->cdataBlock!= NULL) {
2632 /*
2633 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2634 */
2635 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002636 } else if (ctxt->sax->characters != NULL) {
2637 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002638 }
2639 nbchar = 0;
2640 }
2641 NEXT;
2642 cur = CUR;
2643 }
William M. Brack76e95df2003-10-18 16:20:14 +00002644 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002645 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2646 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002647 NEXT;
2648 }
2649
2650 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2651 if (ctxt->sax->cdataBlock!= NULL) {
2652 /*
2653 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2654 */
2655 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002656 } else if (ctxt->sax->characters != NULL) {
2657 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002658 }
2659 }
2660}
2661
2662
2663/**
2664 * htmlParseCharData:
2665 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002666 *
2667 * parse a CharData section.
2668 * if we are within a CDATA section ']]>' marks an end of section.
2669 *
2670 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2671 */
2672
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002673static void
2674htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002675 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2676 int nbchar = 0;
2677 int cur, l;
2678
2679 SHRINK;
2680 cur = CUR_CHAR(l);
2681 while (((cur != '<') || (ctxt->token == '<')) &&
2682 ((cur != '&') || (ctxt->token == '&')) &&
2683 (IS_CHAR(cur))) {
2684 COPY_BUF(l,buf,nbchar,cur);
2685 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2686 /*
2687 * Ok the segment is to be consumed as chars.
2688 */
2689 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2690 if (areBlanks(ctxt, buf, nbchar)) {
2691 if (ctxt->sax->ignorableWhitespace != NULL)
2692 ctxt->sax->ignorableWhitespace(ctxt->userData,
2693 buf, nbchar);
2694 } else {
2695 htmlCheckParagraph(ctxt);
2696 if (ctxt->sax->characters != NULL)
2697 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2698 }
2699 }
2700 nbchar = 0;
2701 }
2702 NEXTL(l);
2703 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002704 if (cur == 0) {
2705 SHRINK;
2706 GROW;
2707 cur = CUR_CHAR(l);
2708 }
Owen Taylor3473f882001-02-23 17:55:21 +00002709 }
2710 if (nbchar != 0) {
2711 /*
2712 * Ok the segment is to be consumed as chars.
2713 */
2714 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2715 if (areBlanks(ctxt, buf, nbchar)) {
2716 if (ctxt->sax->ignorableWhitespace != NULL)
2717 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2718 } else {
2719 htmlCheckParagraph(ctxt);
2720 if (ctxt->sax->characters != NULL)
2721 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2722 }
2723 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002724 } else {
2725 /*
2726 * Loop detection
2727 */
2728 if (cur == 0)
2729 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002730 }
2731}
2732
2733/**
2734 * htmlParseExternalID:
2735 * @ctxt: an HTML parser context
2736 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002737 *
2738 * Parse an External ID or a Public ID
2739 *
Owen Taylor3473f882001-02-23 17:55:21 +00002740 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2741 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2742 *
2743 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2744 *
2745 * Returns the function returns SystemLiteral and in the second
2746 * case publicID receives PubidLiteral, is strict is off
2747 * it is possible to return NULL and have publicID set.
2748 */
2749
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002750static xmlChar *
2751htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002752 xmlChar *URI = NULL;
2753
2754 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2755 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2756 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2757 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002758 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002759 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2760 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002761 }
2762 SKIP_BLANKS;
2763 URI = htmlParseSystemLiteral(ctxt);
2764 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002765 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2766 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002767 }
2768 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2769 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2770 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2771 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002772 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002773 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2774 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002775 }
2776 SKIP_BLANKS;
2777 *publicID = htmlParsePubidLiteral(ctxt);
2778 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002779 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2780 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2781 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002782 }
2783 SKIP_BLANKS;
2784 if ((CUR == '"') || (CUR == '\'')) {
2785 URI = htmlParseSystemLiteral(ctxt);
2786 }
2787 }
2788 return(URI);
2789}
2790
2791/**
2792 * htmlParseComment:
2793 * @ctxt: an HTML parser context
2794 *
2795 * Parse an XML (SGML) comment <!-- .... -->
2796 *
2797 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2798 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002799static void
Owen Taylor3473f882001-02-23 17:55:21 +00002800htmlParseComment(htmlParserCtxtPtr ctxt) {
2801 xmlChar *buf = NULL;
2802 int len;
2803 int size = HTML_PARSER_BUFFER_SIZE;
2804 int q, ql;
2805 int r, rl;
2806 int cur, l;
2807 xmlParserInputState state;
2808
2809 /*
2810 * Check that there is a comment right here.
2811 */
2812 if ((RAW != '<') || (NXT(1) != '!') ||
2813 (NXT(2) != '-') || (NXT(3) != '-')) return;
2814
2815 state = ctxt->instate;
2816 ctxt->instate = XML_PARSER_COMMENT;
2817 SHRINK;
2818 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002819 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002820 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002821 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002822 ctxt->instate = state;
2823 return;
2824 }
2825 q = CUR_CHAR(ql);
2826 NEXTL(ql);
2827 r = CUR_CHAR(rl);
2828 NEXTL(rl);
2829 cur = CUR_CHAR(l);
2830 len = 0;
2831 while (IS_CHAR(cur) &&
2832 ((cur != '>') ||
2833 (r != '-') || (q != '-'))) {
2834 if (len + 5 >= size) {
2835 size *= 2;
2836 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2837 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002838 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002839 ctxt->instate = state;
2840 return;
2841 }
2842 }
2843 COPY_BUF(ql,buf,len,q);
2844 q = r;
2845 ql = rl;
2846 r = cur;
2847 rl = l;
2848 NEXTL(l);
2849 cur = CUR_CHAR(l);
2850 if (cur == 0) {
2851 SHRINK;
2852 GROW;
2853 cur = CUR_CHAR(l);
2854 }
2855 }
2856 buf[len] = 0;
2857 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002858 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
2859 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002860 xmlFree(buf);
2861 } else {
2862 NEXT;
2863 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2864 (!ctxt->disableSAX))
2865 ctxt->sax->comment(ctxt->userData, buf);
2866 xmlFree(buf);
2867 }
2868 ctxt->instate = state;
2869}
2870
2871/**
2872 * htmlParseCharRef:
2873 * @ctxt: an HTML parser context
2874 *
2875 * parse Reference declarations
2876 *
2877 * [66] CharRef ::= '&#' [0-9]+ ';' |
2878 * '&#x' [0-9a-fA-F]+ ';'
2879 *
2880 * Returns the value parsed (as an int)
2881 */
2882int
2883htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2884 int val = 0;
2885
2886 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00002887 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002888 SKIP(3);
2889 while (CUR != ';') {
2890 if ((CUR >= '0') && (CUR <= '9'))
2891 val = val * 16 + (CUR - '0');
2892 else if ((CUR >= 'a') && (CUR <= 'f'))
2893 val = val * 16 + (CUR - 'a') + 10;
2894 else if ((CUR >= 'A') && (CUR <= 'F'))
2895 val = val * 16 + (CUR - 'A') + 10;
2896 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002897 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
2898 "htmlParseCharRef: invalid hexadecimal value\n",
2899 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002900 return(0);
2901 }
2902 NEXT;
2903 }
2904 if (CUR == ';')
2905 NEXT;
2906 } else if ((CUR == '&') && (NXT(1) == '#')) {
2907 SKIP(2);
2908 while (CUR != ';') {
2909 if ((CUR >= '0') && (CUR <= '9'))
2910 val = val * 10 + (CUR - '0');
2911 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002912 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
2913 "htmlParseCharRef: invalid decimal value\n",
2914 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002915 return(0);
2916 }
2917 NEXT;
2918 }
2919 if (CUR == ';')
2920 NEXT;
2921 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002922 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
2923 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002924 }
2925 /*
2926 * Check the value IS_CHAR ...
2927 */
2928 if (IS_CHAR(val)) {
2929 return(val);
2930 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002931 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2932 "htmlParseCharRef: invalid xmlChar value %d\n",
2933 val);
Owen Taylor3473f882001-02-23 17:55:21 +00002934 }
2935 return(0);
2936}
2937
2938
2939/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002940 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002941 * @ctxt: an HTML parser context
2942 *
2943 * parse a DOCTYPE declaration
2944 *
2945 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2946 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2947 */
2948
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002949static void
Owen Taylor3473f882001-02-23 17:55:21 +00002950htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002951 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00002952 xmlChar *ExternalID = NULL;
2953 xmlChar *URI = NULL;
2954
2955 /*
2956 * We know that '<!DOCTYPE' has been detected.
2957 */
2958 SKIP(9);
2959
2960 SKIP_BLANKS;
2961
2962 /*
2963 * Parse the DOCTYPE name.
2964 */
2965 name = htmlParseName(ctxt);
2966 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002967 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2968 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
2969 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002970 }
2971 /*
2972 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2973 */
2974
2975 SKIP_BLANKS;
2976
2977 /*
2978 * Check for SystemID and ExternalID
2979 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002980 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002981 SKIP_BLANKS;
2982
2983 /*
2984 * We should be at the end of the DOCTYPE declaration.
2985 */
2986 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002987 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
2988 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002989 /* We shouldn't try to resynchronize ... */
2990 }
2991 NEXT;
2992
2993 /*
2994 * Create or update the document accordingly to the DOCTYPE
2995 */
2996 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2997 (!ctxt->disableSAX))
2998 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2999
3000 /*
3001 * Cleanup, since we don't use all those identifiers
3002 */
3003 if (URI != NULL) xmlFree(URI);
3004 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003005}
3006
3007/**
3008 * htmlParseAttribute:
3009 * @ctxt: an HTML parser context
3010 * @value: a xmlChar ** used to store the value of the attribute
3011 *
3012 * parse an attribute
3013 *
3014 * [41] Attribute ::= Name Eq AttValue
3015 *
3016 * [25] Eq ::= S? '=' S?
3017 *
3018 * With namespace:
3019 *
3020 * [NS 11] Attribute ::= QName Eq AttValue
3021 *
3022 * Also the case QName == xmlns:??? is handled independently as a namespace
3023 * definition.
3024 *
3025 * Returns the attribute name, and the value in *value.
3026 */
3027
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003028static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003029htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003030 const xmlChar *name;
3031 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003032
3033 *value = NULL;
3034 name = htmlParseHTMLName(ctxt);
3035 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003036 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3037 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003038 return(NULL);
3039 }
3040
3041 /*
3042 * read the value
3043 */
3044 SKIP_BLANKS;
3045 if (CUR == '=') {
3046 NEXT;
3047 SKIP_BLANKS;
3048 val = htmlParseAttValue(ctxt);
3049 /******
3050 } else {
3051 * TODO : some attribute must have values, some may not
3052 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3053 ctxt->sax->warning(ctxt->userData,
3054 "No value for attribute %s\n", name); */
3055 }
3056
3057 *value = val;
3058 return(name);
3059}
3060
3061/**
3062 * htmlCheckEncoding:
3063 * @ctxt: an HTML parser context
3064 * @attvalue: the attribute value
3065 *
3066 * Checks an http-equiv attribute from a Meta tag to detect
3067 * the encoding
3068 * If a new encoding is detected the parser is switched to decode
3069 * it and pass UTF8
3070 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003071static void
Owen Taylor3473f882001-02-23 17:55:21 +00003072htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3073 const xmlChar *encoding;
3074
3075 if ((ctxt == NULL) || (attvalue == NULL))
3076 return;
3077
3078 /* do not change encoding */
3079 if (ctxt->input->encoding != NULL)
3080 return;
3081
3082 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3083 if (encoding != NULL) {
3084 encoding += 8;
3085 } else {
3086 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3087 if (encoding != NULL)
3088 encoding += 9;
3089 }
3090 if (encoding != NULL) {
3091 xmlCharEncoding enc;
3092 xmlCharEncodingHandlerPtr handler;
3093
3094 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3095
3096 if (ctxt->input->encoding != NULL)
3097 xmlFree((xmlChar *) ctxt->input->encoding);
3098 ctxt->input->encoding = xmlStrdup(encoding);
3099
3100 enc = xmlParseCharEncoding((const char *) encoding);
3101 /*
3102 * registered set of known encodings
3103 */
3104 if (enc != XML_CHAR_ENCODING_ERROR) {
3105 xmlSwitchEncoding(ctxt, enc);
3106 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3107 } else {
3108 /*
3109 * fallback for unknown encodings
3110 */
3111 handler = xmlFindCharEncodingHandler((const char *) encoding);
3112 if (handler != NULL) {
3113 xmlSwitchToEncoding(ctxt, handler);
3114 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3115 } else {
3116 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3117 }
3118 }
3119
3120 if ((ctxt->input->buf != NULL) &&
3121 (ctxt->input->buf->encoder != NULL) &&
3122 (ctxt->input->buf->raw != NULL) &&
3123 (ctxt->input->buf->buffer != NULL)) {
3124 int nbchars;
3125 int processed;
3126
3127 /*
3128 * convert as much as possible to the parser reading buffer.
3129 */
3130 processed = ctxt->input->cur - ctxt->input->base;
3131 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3132 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3133 ctxt->input->buf->buffer,
3134 ctxt->input->buf->raw);
3135 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003136 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3137 "htmlCheckEncoding: encoder error\n",
3138 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003139 }
3140 ctxt->input->base =
3141 ctxt->input->cur = ctxt->input->buf->buffer->content;
3142 }
3143 }
3144}
3145
3146/**
3147 * htmlCheckMeta:
3148 * @ctxt: an HTML parser context
3149 * @atts: the attributes values
3150 *
3151 * Checks an attributes from a Meta tag
3152 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003153static void
Owen Taylor3473f882001-02-23 17:55:21 +00003154htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3155 int i;
3156 const xmlChar *att, *value;
3157 int http = 0;
3158 const xmlChar *content = NULL;
3159
3160 if ((ctxt == NULL) || (atts == NULL))
3161 return;
3162
3163 i = 0;
3164 att = atts[i++];
3165 while (att != NULL) {
3166 value = atts[i++];
3167 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3168 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3169 http = 1;
3170 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3171 content = value;
3172 att = atts[i++];
3173 }
3174 if ((http) && (content != NULL))
3175 htmlCheckEncoding(ctxt, content);
3176
3177}
3178
3179/**
3180 * htmlParseStartTag:
3181 * @ctxt: an HTML parser context
3182 *
3183 * parse a start of tag either for rule element or
3184 * EmptyElement. In both case we don't parse the tag closing chars.
3185 *
3186 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3187 *
3188 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3189 *
3190 * With namespace:
3191 *
3192 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3193 *
3194 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3195 *
3196 */
3197
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003198static void
Owen Taylor3473f882001-02-23 17:55:21 +00003199htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003200 const xmlChar *name;
3201 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003202 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003203 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003204 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003205 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003206 int meta = 0;
3207 int i;
3208
3209 if (CUR != '<') return;
3210 NEXT;
3211
3212 GROW;
3213 name = htmlParseHTMLName(ctxt);
3214 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003215 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3216 "htmlParseStartTag: invalid element name\n",
3217 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003218 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003219 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003220 NEXT;
3221 return;
3222 }
3223 if (xmlStrEqual(name, BAD_CAST"meta"))
3224 meta = 1;
3225
3226 /*
3227 * Check for auto-closure of HTML elements.
3228 */
3229 htmlAutoClose(ctxt, name);
3230
3231 /*
3232 * Check for implied HTML elements.
3233 */
3234 htmlCheckImplied(ctxt, name);
3235
3236 /*
3237 * Avoid html at any level > 0, head at any level != 1
3238 * or any attempt to recurse body
3239 */
3240 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003241 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3242 "htmlParseStartTag: misplaced <html> tag\n",
3243 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003244 return;
3245 }
3246 if ((ctxt->nameNr != 1) &&
3247 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003248 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3249 "htmlParseStartTag: misplaced <head> tag\n",
3250 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003251 return;
3252 }
3253 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003254 int indx;
3255 for (indx = 0;indx < ctxt->nameNr;indx++) {
3256 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003257 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3258 "htmlParseStartTag: misplaced <body> tag\n",
3259 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003260 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3261 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003262 return;
3263 }
3264 }
3265 }
3266
3267 /*
3268 * Now parse the attributes, it ends up with the ending
3269 *
3270 * (S Attribute)* S?
3271 */
3272 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003273 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003274 (CUR != '>') &&
3275 ((CUR != '/') || (NXT(1) != '>'))) {
3276 long cons = ctxt->nbChars;
3277
3278 GROW;
3279 attname = htmlParseAttribute(ctxt, &attvalue);
3280 if (attname != NULL) {
3281
3282 /*
3283 * Well formedness requires at most one declaration of an attribute
3284 */
3285 for (i = 0; i < nbatts;i += 2) {
3286 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003287 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3288 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003289 if (attvalue != NULL)
3290 xmlFree(attvalue);
3291 goto failed;
3292 }
3293 }
3294
3295 /*
3296 * Add the pair to atts
3297 */
3298 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003299 maxatts = 22; /* allow for 10 attrs by default */
3300 atts = (const xmlChar **)
3301 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003302 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003303 htmlErrMemory(ctxt, NULL);
3304 if (attvalue != NULL)
3305 xmlFree(attvalue);
3306 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003307 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003308 ctxt->atts = atts;
3309 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003310 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003311 const xmlChar **n;
3312
Owen Taylor3473f882001-02-23 17:55:21 +00003313 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003314 n = (const xmlChar **) xmlRealloc((void *) atts,
3315 maxatts * sizeof(const xmlChar *));
3316 if (n == NULL) {
3317 htmlErrMemory(ctxt, NULL);
3318 if (attvalue != NULL)
3319 xmlFree(attvalue);
3320 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003321 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003322 atts = n;
3323 ctxt->atts = atts;
3324 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003325 }
3326 atts[nbatts++] = attname;
3327 atts[nbatts++] = attvalue;
3328 atts[nbatts] = NULL;
3329 atts[nbatts + 1] = NULL;
3330 }
3331 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003332 if (attvalue != NULL)
3333 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003334 /* Dump the bogus attribute string up to the next blank or
3335 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003336 while ((IS_CHAR_CH(CUR)) &&
3337 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003338 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003339 NEXT;
3340 }
3341
3342failed:
3343 SKIP_BLANKS;
3344 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003345 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3346 "htmlParseStartTag: problem parsing attributes\n",
3347 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003348 break;
3349 }
3350 }
3351
3352 /*
3353 * Handle specific association to the META tag
3354 */
3355 if (meta)
3356 htmlCheckMeta(ctxt, atts);
3357
3358 /*
3359 * SAX: Start of Element !
3360 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003361 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003362 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3363 if (nbatts != 0)
3364 ctxt->sax->startElement(ctxt->userData, name, atts);
3365 else
3366 ctxt->sax->startElement(ctxt->userData, name, NULL);
3367 }
Owen Taylor3473f882001-02-23 17:55:21 +00003368
3369 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003370 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003371 if (atts[i] != NULL)
3372 xmlFree((xmlChar *) atts[i]);
3373 }
Owen Taylor3473f882001-02-23 17:55:21 +00003374 }
Owen Taylor3473f882001-02-23 17:55:21 +00003375}
3376
3377/**
3378 * htmlParseEndTag:
3379 * @ctxt: an HTML parser context
3380 *
3381 * parse an end of tag
3382 *
3383 * [42] ETag ::= '</' Name S? '>'
3384 *
3385 * With namespace
3386 *
3387 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003388 *
3389 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003390 */
3391
Daniel Veillardf420ac52001-07-04 16:04:09 +00003392static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003393htmlParseEndTag(htmlParserCtxtPtr ctxt)
3394{
3395 const xmlChar *name;
3396 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003397 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003398
3399 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003400 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3401 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003402 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003403 }
3404 SKIP(2);
3405
3406 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003407 if (name == NULL)
3408 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003409
3410 /*
3411 * We should definitely be at the ending "S? '>'" part
3412 */
3413 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003414 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003415 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3416 "End tag : expected '>'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003417 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003418 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003419
3420 /*
3421 * If the name read is not one of the element in the parsing stack
3422 * then return, it's just an error.
3423 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003424 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3425 if (xmlStrEqual(name, ctxt->nameTab[i]))
3426 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003427 }
3428 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003429 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3430 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003431 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003432 }
3433
3434
3435 /*
3436 * Check for auto-closure of HTML elements.
3437 */
3438
3439 htmlAutoCloseOnClose(ctxt, name);
3440
3441 /*
3442 * Well formedness constraints, opening and closing must match.
3443 * With the exception that the autoclose may have popped stuff out
3444 * of the stack.
3445 */
3446 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003447 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003448 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3449 "Opening and ending tag mismatch: %s and %s\n",
3450 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003451 }
3452 }
3453
3454 /*
3455 * SAX: End of Tag
3456 */
3457 oldname = ctxt->name;
3458 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003459 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3460 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003461 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003462 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003463 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003464 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003465 }
3466
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003467 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003468}
3469
3470
3471/**
3472 * htmlParseReference:
3473 * @ctxt: an HTML parser context
3474 *
3475 * parse and handle entity references in content,
3476 * this will end-up in a call to character() since this is either a
3477 * CharRef, or a predefined entity.
3478 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003479static void
Owen Taylor3473f882001-02-23 17:55:21 +00003480htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003481 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003482 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003483 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003484 if (CUR != '&') return;
3485
3486 if (NXT(1) == '#') {
3487 unsigned int c;
3488 int bits, i = 0;
3489
3490 c = htmlParseCharRef(ctxt);
3491 if (c == 0)
3492 return;
3493
3494 if (c < 0x80) { out[i++]= c; bits= -6; }
3495 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3496 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3497 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3498
3499 for ( ; bits >= 0; bits-= 6) {
3500 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3501 }
3502 out[i] = 0;
3503
3504 htmlCheckParagraph(ctxt);
3505 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3506 ctxt->sax->characters(ctxt->userData, out, i);
3507 } else {
3508 ent = htmlParseEntityRef(ctxt, &name);
3509 if (name == NULL) {
3510 htmlCheckParagraph(ctxt);
3511 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3512 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3513 return;
3514 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003515 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003516 htmlCheckParagraph(ctxt);
3517 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3518 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3519 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3520 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3521 }
3522 } else {
3523 unsigned int c;
3524 int bits, i = 0;
3525
3526 c = ent->value;
3527 if (c < 0x80)
3528 { out[i++]= c; bits= -6; }
3529 else if (c < 0x800)
3530 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3531 else if (c < 0x10000)
3532 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3533 else
3534 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3535
3536 for ( ; bits >= 0; bits-= 6) {
3537 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3538 }
3539 out[i] = 0;
3540
3541 htmlCheckParagraph(ctxt);
3542 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3543 ctxt->sax->characters(ctxt->userData, out, i);
3544 }
Owen Taylor3473f882001-02-23 17:55:21 +00003545 }
3546}
3547
3548/**
3549 * htmlParseContent:
3550 * @ctxt: an HTML parser context
3551 * @name: the node name
3552 *
3553 * Parse a content: comment, sub-element, reference or text.
3554 *
3555 */
3556
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003557static void
Owen Taylor3473f882001-02-23 17:55:21 +00003558htmlParseContent(htmlParserCtxtPtr ctxt) {
3559 xmlChar *currentNode;
3560 int depth;
3561
3562 currentNode = xmlStrdup(ctxt->name);
3563 depth = ctxt->nameNr;
3564 while (1) {
3565 long cons = ctxt->nbChars;
3566
3567 GROW;
3568 /*
3569 * Our tag or one of it's parent or children is ending.
3570 */
3571 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003572 if (htmlParseEndTag(ctxt) &&
3573 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3574 if (currentNode != NULL)
3575 xmlFree(currentNode);
3576 return;
3577 }
3578 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003579 }
3580
3581 /*
3582 * Has this node been popped out during parsing of
3583 * the next element
3584 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003585 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3586 (!xmlStrEqual(currentNode, ctxt->name)))
3587 {
Owen Taylor3473f882001-02-23 17:55:21 +00003588 if (currentNode != NULL) xmlFree(currentNode);
3589 return;
3590 }
3591
Daniel Veillardf9533d12001-03-03 10:04:57 +00003592 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3593 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003594 /*
3595 * Handle SCRIPT/STYLE separately
3596 */
3597 htmlParseScript(ctxt);
3598 } else {
3599 /*
3600 * Sometimes DOCTYPE arrives in the middle of the document
3601 */
3602 if ((CUR == '<') && (NXT(1) == '!') &&
3603 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3604 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3605 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3606 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003607 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3608 "Misplaced DOCTYPE declaration\n",
3609 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003610 htmlParseDocTypeDecl(ctxt);
3611 }
3612
3613 /*
3614 * First case : a comment
3615 */
3616 if ((CUR == '<') && (NXT(1) == '!') &&
3617 (NXT(2) == '-') && (NXT(3) == '-')) {
3618 htmlParseComment(ctxt);
3619 }
3620
3621 /*
3622 * Second case : a sub-element.
3623 */
3624 else if (CUR == '<') {
3625 htmlParseElement(ctxt);
3626 }
3627
3628 /*
3629 * Third case : a reference. If if has not been resolved,
3630 * parsing returns it's Name, create the node
3631 */
3632 else if (CUR == '&') {
3633 htmlParseReference(ctxt);
3634 }
3635
3636 /*
3637 * Fourth : end of the resource
3638 */
3639 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003640 htmlAutoCloseOnEnd(ctxt);
3641 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003642 }
3643
3644 /*
3645 * Last case, text. Note that References are handled directly.
3646 */
3647 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003648 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003649 }
3650
3651 if (cons == ctxt->nbChars) {
3652 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003653 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3654 "detected an error in element content\n",
3655 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003656 }
3657 break;
3658 }
3659 }
3660 GROW;
3661 }
3662 if (currentNode != NULL) xmlFree(currentNode);
3663}
3664
3665/**
3666 * htmlParseElement:
3667 * @ctxt: an HTML parser context
3668 *
3669 * parse an HTML element, this is highly recursive
3670 *
3671 * [39] element ::= EmptyElemTag | STag content ETag
3672 *
3673 * [41] Attribute ::= Name Eq AttValue
3674 */
3675
3676void
3677htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003678 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003679 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003680 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003681 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003682 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003683 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003684 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003685
3686 /* Capture start position */
3687 if (ctxt->record_info) {
3688 node_info.begin_pos = ctxt->input->consumed +
3689 (CUR_PTR - ctxt->input->base);
3690 node_info.begin_line = ctxt->input->line;
3691 }
3692
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003693 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003694 htmlParseStartTag(ctxt);
3695 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003696 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3697 (name == NULL)) {
3698 if (CUR == '>')
3699 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003700 return;
3701 }
Owen Taylor3473f882001-02-23 17:55:21 +00003702
3703 /*
3704 * Lookup the info for that element.
3705 */
3706 info = htmlTagLookup(name);
3707 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003708 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3709 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003710 }
3711
3712 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003713 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003714 */
3715 if ((CUR == '/') && (NXT(1) == '>')) {
3716 SKIP(2);
3717 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3718 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003719 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003720 return;
3721 }
3722
3723 if (CUR == '>') {
3724 NEXT;
3725 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003726 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3727 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003728
3729 /*
3730 * end of parsing of this node.
3731 */
3732 if (xmlStrEqual(name, ctxt->name)) {
3733 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003734 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003735 }
3736
3737 /*
3738 * Capture end position and add node
3739 */
3740 if ( currentNode != NULL && ctxt->record_info ) {
3741 node_info.end_pos = ctxt->input->consumed +
3742 (CUR_PTR - ctxt->input->base);
3743 node_info.end_line = ctxt->input->line;
3744 node_info.node = ctxt->node;
3745 xmlParserAddNodeInfo(ctxt, &node_info);
3746 }
3747 return;
3748 }
3749
3750 /*
3751 * Check for an Empty Element from DTD definition
3752 */
3753 if ((info != NULL) && (info->empty)) {
3754 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3755 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003756 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003757 return;
3758 }
3759
3760 /*
3761 * Parse the content of the element:
3762 */
3763 currentNode = xmlStrdup(ctxt->name);
3764 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003765 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003766 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003767 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003768 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003769 if (ctxt->nameNr < depth) break;
3770 }
3771
Owen Taylor3473f882001-02-23 17:55:21 +00003772 /*
3773 * Capture end position and add node
3774 */
3775 if ( currentNode != NULL && ctxt->record_info ) {
3776 node_info.end_pos = ctxt->input->consumed +
3777 (CUR_PTR - ctxt->input->base);
3778 node_info.end_line = ctxt->input->line;
3779 node_info.node = ctxt->node;
3780 xmlParserAddNodeInfo(ctxt, &node_info);
3781 }
William M. Brack76e95df2003-10-18 16:20:14 +00003782 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003783 htmlAutoCloseOnEnd(ctxt);
3784 }
3785
Owen Taylor3473f882001-02-23 17:55:21 +00003786 if (currentNode != NULL)
3787 xmlFree(currentNode);
3788}
3789
3790/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003791 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003792 * @ctxt: an HTML parser context
3793 *
3794 * parse an HTML document (and build a tree if using the standard SAX
3795 * interface).
3796 *
3797 * Returns 0, -1 in case of error. the parser context is augmented
3798 * as a result of the parsing.
3799 */
3800
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003801int
Owen Taylor3473f882001-02-23 17:55:21 +00003802htmlParseDocument(htmlParserCtxtPtr ctxt) {
3803 xmlDtdPtr dtd;
3804
Daniel Veillardd0463562001-10-13 09:15:48 +00003805 xmlInitParser();
3806
Owen Taylor3473f882001-02-23 17:55:21 +00003807 htmlDefaultSAXHandlerInit();
3808 ctxt->html = 1;
3809
3810 GROW;
3811 /*
3812 * SAX: beginning of the document processing.
3813 */
3814 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3815 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3816
3817 /*
3818 * Wipe out everything which is before the first '<'
3819 */
3820 SKIP_BLANKS;
3821 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003822 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3823 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003824 }
3825
3826 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3827 ctxt->sax->startDocument(ctxt->userData);
3828
3829
3830 /*
3831 * Parse possible comments before any content
3832 */
3833 while ((CUR == '<') && (NXT(1) == '!') &&
3834 (NXT(2) == '-') && (NXT(3) == '-')) {
3835 htmlParseComment(ctxt);
3836 SKIP_BLANKS;
3837 }
3838
3839
3840 /*
3841 * Then possibly doc type declaration(s) and more Misc
3842 * (doctypedecl Misc*)?
3843 */
3844 if ((CUR == '<') && (NXT(1) == '!') &&
3845 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3846 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3847 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3848 (UPP(8) == 'E')) {
3849 htmlParseDocTypeDecl(ctxt);
3850 }
3851 SKIP_BLANKS;
3852
3853 /*
3854 * Parse possible comments before any content
3855 */
3856 while ((CUR == '<') && (NXT(1) == '!') &&
3857 (NXT(2) == '-') && (NXT(3) == '-')) {
3858 htmlParseComment(ctxt);
3859 SKIP_BLANKS;
3860 }
3861
3862 /*
3863 * Time to start parsing the tree itself
3864 */
3865 htmlParseContent(ctxt);
3866
3867 /*
3868 * autoclose
3869 */
3870 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003871 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003872
3873
3874 /*
3875 * SAX: end of the document processing.
3876 */
3877 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3878 ctxt->sax->endDocument(ctxt->userData);
3879
3880 if (ctxt->myDoc != NULL) {
3881 dtd = xmlGetIntSubset(ctxt->myDoc);
3882 if (dtd == NULL)
3883 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00003884 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00003885 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3886 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3887 }
3888 if (! ctxt->wellFormed) return(-1);
3889 return(0);
3890}
3891
3892
3893/************************************************************************
3894 * *
3895 * Parser contexts handling *
3896 * *
3897 ************************************************************************/
3898
3899/**
William M. Brackedb65a72004-02-06 07:36:04 +00003900 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00003901 * @ctxt: an HTML parser context
3902 *
3903 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00003904 *
3905 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00003906 */
3907
Daniel Veillardf403d292003-10-05 13:51:35 +00003908static int
Owen Taylor3473f882001-02-23 17:55:21 +00003909htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3910{
3911 htmlSAXHandler *sax;
3912
Daniel Veillardf403d292003-10-05 13:51:35 +00003913 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003914 memset(ctxt, 0, sizeof(htmlParserCtxt));
3915
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003916 ctxt->dict = xmlDictCreate();
3917 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003918 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3919 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003920 }
Owen Taylor3473f882001-02-23 17:55:21 +00003921 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3922 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003923 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3924 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003925 }
3926 else
3927 memset(sax, 0, sizeof(htmlSAXHandler));
3928
3929 /* Allocate the Input stack */
3930 ctxt->inputTab = (htmlParserInputPtr *)
3931 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3932 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003933 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003934 ctxt->inputNr = 0;
3935 ctxt->inputMax = 0;
3936 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003937 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003938 }
3939 ctxt->inputNr = 0;
3940 ctxt->inputMax = 5;
3941 ctxt->input = NULL;
3942 ctxt->version = NULL;
3943 ctxt->encoding = NULL;
3944 ctxt->standalone = -1;
3945 ctxt->instate = XML_PARSER_START;
3946
3947 /* Allocate the Node stack */
3948 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3949 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003950 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003951 ctxt->nodeNr = 0;
3952 ctxt->nodeMax = 0;
3953 ctxt->node = NULL;
3954 ctxt->inputNr = 0;
3955 ctxt->inputMax = 0;
3956 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003957 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003958 }
3959 ctxt->nodeNr = 0;
3960 ctxt->nodeMax = 10;
3961 ctxt->node = NULL;
3962
3963 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003964 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003965 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003966 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003967 ctxt->nameNr = 0;
3968 ctxt->nameMax = 10;
3969 ctxt->name = NULL;
3970 ctxt->nodeNr = 0;
3971 ctxt->nodeMax = 0;
3972 ctxt->node = NULL;
3973 ctxt->inputNr = 0;
3974 ctxt->inputMax = 0;
3975 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003976 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003977 }
3978 ctxt->nameNr = 0;
3979 ctxt->nameMax = 10;
3980 ctxt->name = NULL;
3981
Daniel Veillard092643b2003-09-25 14:29:29 +00003982 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00003983 else {
3984 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00003985 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00003986 }
3987 ctxt->userData = ctxt;
3988 ctxt->myDoc = NULL;
3989 ctxt->wellFormed = 1;
3990 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003991 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003992 ctxt->html = 1;
William M. Brackedb65a72004-02-06 07:36:04 +00003993 ctxt->vctxt.userData = ctxt;
3994 ctxt->vctxt.error = xmlParserValidityError;
3995 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00003996 ctxt->record_info = 0;
3997 ctxt->validate = 0;
3998 ctxt->nbChars = 0;
3999 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004000 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004001 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004002 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004003}
4004
4005/**
4006 * htmlFreeParserCtxt:
4007 * @ctxt: an HTML parser context
4008 *
4009 * Free all the memory used by a parser context. However the parsed
4010 * document in ctxt->myDoc is not freed.
4011 */
4012
4013void
4014htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4015{
4016 xmlFreeParserCtxt(ctxt);
4017}
4018
4019/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004020 * htmlNewParserCtxt:
4021 *
4022 * Allocate and initialize a new parser context.
4023 *
4024 * Returns the xmlParserCtxtPtr or NULL
4025 */
4026
4027static htmlParserCtxtPtr
4028htmlNewParserCtxt(void)
4029{
4030 xmlParserCtxtPtr ctxt;
4031
4032 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4033 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004034 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004035 return(NULL);
4036 }
4037 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004038 if (htmlInitParserCtxt(ctxt) < 0) {
4039 htmlFreeParserCtxt(ctxt);
4040 return(NULL);
4041 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004042 return(ctxt);
4043}
4044
4045/**
4046 * htmlCreateMemoryParserCtxt:
4047 * @buffer: a pointer to a char array
4048 * @size: the size of the array
4049 *
4050 * Create a parser context for an HTML in-memory document.
4051 *
4052 * Returns the new parser context or NULL
4053 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004054htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004055htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4056 xmlParserCtxtPtr ctxt;
4057 xmlParserInputPtr input;
4058 xmlParserInputBufferPtr buf;
4059
4060 if (buffer == NULL)
4061 return(NULL);
4062 if (size <= 0)
4063 return(NULL);
4064
4065 ctxt = htmlNewParserCtxt();
4066 if (ctxt == NULL)
4067 return(NULL);
4068
4069 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4070 if (buf == NULL) return(NULL);
4071
4072 input = xmlNewInputStream(ctxt);
4073 if (input == NULL) {
4074 xmlFreeParserCtxt(ctxt);
4075 return(NULL);
4076 }
4077
4078 input->filename = NULL;
4079 input->buf = buf;
4080 input->base = input->buf->buffer->content;
4081 input->cur = input->buf->buffer->content;
4082 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4083
4084 inputPush(ctxt, input);
4085 return(ctxt);
4086}
4087
4088/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004089 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004090 * @cur: a pointer to an array of xmlChar
4091 * @encoding: a free form C string describing the HTML document encoding, or NULL
4092 *
4093 * Create a parser context for an HTML document.
4094 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004095 * TODO: check the need to add encoding handling there
4096 *
Owen Taylor3473f882001-02-23 17:55:21 +00004097 * Returns the new parser context or NULL
4098 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004099static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004100htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004101 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004102 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004103
Daniel Veillard1d995272002-07-22 16:43:32 +00004104 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004105 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004106 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004107 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4108
4109 if (encoding != NULL) {
4110 xmlCharEncoding enc;
4111 xmlCharEncodingHandlerPtr handler;
4112
4113 if (ctxt->input->encoding != NULL)
4114 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004115 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004116
4117 enc = xmlParseCharEncoding(encoding);
4118 /*
4119 * registered set of known encodings
4120 */
4121 if (enc != XML_CHAR_ENCODING_ERROR) {
4122 xmlSwitchEncoding(ctxt, enc);
4123 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004124 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4125 "Unsupported encoding %s\n",
4126 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004127 }
4128 } else {
4129 /*
4130 * fallback for unknown encodings
4131 */
4132 handler = xmlFindCharEncodingHandler((const char *) encoding);
4133 if (handler != NULL) {
4134 xmlSwitchToEncoding(ctxt, handler);
4135 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004136 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4137 "Unsupported encoding %s\n",
4138 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004139 }
4140 }
4141 }
4142 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004143}
4144
Daniel Veillard73b013f2003-09-30 12:36:01 +00004145#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004146/************************************************************************
4147 * *
4148 * Progressive parsing interfaces *
4149 * *
4150 ************************************************************************/
4151
4152/**
4153 * htmlParseLookupSequence:
4154 * @ctxt: an HTML parser context
4155 * @first: the first char to lookup
4156 * @next: the next char to lookup or zero
4157 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004158 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004159 *
4160 * Try to find if a sequence (first, next, third) or just (first next) or
4161 * (first) is available in the input stream.
4162 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4163 * to avoid rescanning sequences of bytes, it DOES change the state of the
4164 * parser, do not use liberally.
4165 * This is basically similar to xmlParseLookupSequence()
4166 *
4167 * Returns the index to the current parsing point if the full sequence
4168 * is available, -1 otherwise.
4169 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004170static int
Owen Taylor3473f882001-02-23 17:55:21 +00004171htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004172 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004173 int base, len;
4174 htmlParserInputPtr in;
4175 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004176 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004177
4178 in = ctxt->input;
4179 if (in == NULL) return(-1);
4180 base = in->cur - in->base;
4181 if (base < 0) return(-1);
4182 if (ctxt->checkIndex > base)
4183 base = ctxt->checkIndex;
4184 if (in->buf == NULL) {
4185 buf = in->base;
4186 len = in->length;
4187 } else {
4188 buf = in->buf->buffer->content;
4189 len = in->buf->buffer->use;
4190 }
4191 /* take into account the sequence length */
4192 if (third) len -= 2;
4193 else if (next) len --;
4194 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004195 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004196 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4197 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4198 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004199 /* do not increment past <! - some people use <!--> */
4200 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004201 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004202 }
4203 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004204 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004205 return(-1);
4206 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4207 (buf[base + 2] == '>')) {
4208 incomment = 0;
4209 base += 2;
4210 }
4211 continue;
4212 }
Owen Taylor3473f882001-02-23 17:55:21 +00004213 if (buf[base] == first) {
4214 if (third != 0) {
4215 if ((buf[base + 1] != next) ||
4216 (buf[base + 2] != third)) continue;
4217 } else if (next != 0) {
4218 if (buf[base + 1] != next) continue;
4219 }
4220 ctxt->checkIndex = 0;
4221#ifdef DEBUG_PUSH
4222 if (next == 0)
4223 xmlGenericError(xmlGenericErrorContext,
4224 "HPP: lookup '%c' found at %d\n",
4225 first, base);
4226 else if (third == 0)
4227 xmlGenericError(xmlGenericErrorContext,
4228 "HPP: lookup '%c%c' found at %d\n",
4229 first, next, base);
4230 else
4231 xmlGenericError(xmlGenericErrorContext,
4232 "HPP: lookup '%c%c%c' found at %d\n",
4233 first, next, third, base);
4234#endif
4235 return(base - (in->cur - in->base));
4236 }
4237 }
4238 ctxt->checkIndex = base;
4239#ifdef DEBUG_PUSH
4240 if (next == 0)
4241 xmlGenericError(xmlGenericErrorContext,
4242 "HPP: lookup '%c' failed\n", first);
4243 else if (third == 0)
4244 xmlGenericError(xmlGenericErrorContext,
4245 "HPP: lookup '%c%c' failed\n", first, next);
4246 else
4247 xmlGenericError(xmlGenericErrorContext,
4248 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4249#endif
4250 return(-1);
4251}
4252
4253/**
4254 * htmlParseTryOrFinish:
4255 * @ctxt: an HTML parser context
4256 * @terminate: last chunk indicator
4257 *
4258 * Try to progress on parsing
4259 *
4260 * Returns zero if no parsing was possible
4261 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004262static int
Owen Taylor3473f882001-02-23 17:55:21 +00004263htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4264 int ret = 0;
4265 htmlParserInputPtr in;
4266 int avail = 0;
4267 xmlChar cur, next;
4268
4269#ifdef DEBUG_PUSH
4270 switch (ctxt->instate) {
4271 case XML_PARSER_EOF:
4272 xmlGenericError(xmlGenericErrorContext,
4273 "HPP: try EOF\n"); break;
4274 case XML_PARSER_START:
4275 xmlGenericError(xmlGenericErrorContext,
4276 "HPP: try START\n"); break;
4277 case XML_PARSER_MISC:
4278 xmlGenericError(xmlGenericErrorContext,
4279 "HPP: try MISC\n");break;
4280 case XML_PARSER_COMMENT:
4281 xmlGenericError(xmlGenericErrorContext,
4282 "HPP: try COMMENT\n");break;
4283 case XML_PARSER_PROLOG:
4284 xmlGenericError(xmlGenericErrorContext,
4285 "HPP: try PROLOG\n");break;
4286 case XML_PARSER_START_TAG:
4287 xmlGenericError(xmlGenericErrorContext,
4288 "HPP: try START_TAG\n");break;
4289 case XML_PARSER_CONTENT:
4290 xmlGenericError(xmlGenericErrorContext,
4291 "HPP: try CONTENT\n");break;
4292 case XML_PARSER_CDATA_SECTION:
4293 xmlGenericError(xmlGenericErrorContext,
4294 "HPP: try CDATA_SECTION\n");break;
4295 case XML_PARSER_END_TAG:
4296 xmlGenericError(xmlGenericErrorContext,
4297 "HPP: try END_TAG\n");break;
4298 case XML_PARSER_ENTITY_DECL:
4299 xmlGenericError(xmlGenericErrorContext,
4300 "HPP: try ENTITY_DECL\n");break;
4301 case XML_PARSER_ENTITY_VALUE:
4302 xmlGenericError(xmlGenericErrorContext,
4303 "HPP: try ENTITY_VALUE\n");break;
4304 case XML_PARSER_ATTRIBUTE_VALUE:
4305 xmlGenericError(xmlGenericErrorContext,
4306 "HPP: try ATTRIBUTE_VALUE\n");break;
4307 case XML_PARSER_DTD:
4308 xmlGenericError(xmlGenericErrorContext,
4309 "HPP: try DTD\n");break;
4310 case XML_PARSER_EPILOG:
4311 xmlGenericError(xmlGenericErrorContext,
4312 "HPP: try EPILOG\n");break;
4313 case XML_PARSER_PI:
4314 xmlGenericError(xmlGenericErrorContext,
4315 "HPP: try PI\n");break;
4316 case XML_PARSER_SYSTEM_LITERAL:
4317 xmlGenericError(xmlGenericErrorContext,
4318 "HPP: try SYSTEM_LITERAL\n");break;
4319 }
4320#endif
4321
4322 while (1) {
4323
4324 in = ctxt->input;
4325 if (in == NULL) break;
4326 if (in->buf == NULL)
4327 avail = in->length - (in->cur - in->base);
4328 else
4329 avail = in->buf->buffer->use - (in->cur - in->base);
4330 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004331 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004332 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4333 /*
4334 * SAX: end of the document processing.
4335 */
4336 ctxt->instate = XML_PARSER_EOF;
4337 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4338 ctxt->sax->endDocument(ctxt->userData);
4339 }
4340 }
4341 if (avail < 1)
4342 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004343 cur = in->cur[0];
4344 if (cur == 0) {
4345 SKIP(1);
4346 continue;
4347 }
4348
Owen Taylor3473f882001-02-23 17:55:21 +00004349 switch (ctxt->instate) {
4350 case XML_PARSER_EOF:
4351 /*
4352 * Document parsing is done !
4353 */
4354 goto done;
4355 case XML_PARSER_START:
4356 /*
4357 * Very first chars read from the document flow.
4358 */
4359 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004360 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004361 SKIP_BLANKS;
4362 if (in->buf == NULL)
4363 avail = in->length - (in->cur - in->base);
4364 else
4365 avail = in->buf->buffer->use - (in->cur - in->base);
4366 }
4367 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4368 ctxt->sax->setDocumentLocator(ctxt->userData,
4369 &xmlDefaultSAXLocator);
4370 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4371 (!ctxt->disableSAX))
4372 ctxt->sax->startDocument(ctxt->userData);
4373
4374 cur = in->cur[0];
4375 next = in->cur[1];
4376 if ((cur == '<') && (next == '!') &&
4377 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4378 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4379 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4380 (UPP(8) == 'E')) {
4381 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004382 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004383 goto done;
4384#ifdef DEBUG_PUSH
4385 xmlGenericError(xmlGenericErrorContext,
4386 "HPP: Parsing internal subset\n");
4387#endif
4388 htmlParseDocTypeDecl(ctxt);
4389 ctxt->instate = XML_PARSER_PROLOG;
4390#ifdef DEBUG_PUSH
4391 xmlGenericError(xmlGenericErrorContext,
4392 "HPP: entering PROLOG\n");
4393#endif
4394 } else {
4395 ctxt->instate = XML_PARSER_MISC;
4396 }
4397#ifdef DEBUG_PUSH
4398 xmlGenericError(xmlGenericErrorContext,
4399 "HPP: entering MISC\n");
4400#endif
4401 break;
4402 case XML_PARSER_MISC:
4403 SKIP_BLANKS;
4404 if (in->buf == NULL)
4405 avail = in->length - (in->cur - in->base);
4406 else
4407 avail = in->buf->buffer->use - (in->cur - in->base);
4408 if (avail < 2)
4409 goto done;
4410 cur = in->cur[0];
4411 next = in->cur[1];
4412 if ((cur == '<') && (next == '!') &&
4413 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4414 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004415 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004416 goto done;
4417#ifdef DEBUG_PUSH
4418 xmlGenericError(xmlGenericErrorContext,
4419 "HPP: Parsing Comment\n");
4420#endif
4421 htmlParseComment(ctxt);
4422 ctxt->instate = XML_PARSER_MISC;
4423 } else if ((cur == '<') && (next == '!') &&
4424 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4425 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4426 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4427 (UPP(8) == 'E')) {
4428 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004429 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004430 goto done;
4431#ifdef DEBUG_PUSH
4432 xmlGenericError(xmlGenericErrorContext,
4433 "HPP: Parsing internal subset\n");
4434#endif
4435 htmlParseDocTypeDecl(ctxt);
4436 ctxt->instate = XML_PARSER_PROLOG;
4437#ifdef DEBUG_PUSH
4438 xmlGenericError(xmlGenericErrorContext,
4439 "HPP: entering PROLOG\n");
4440#endif
4441 } else if ((cur == '<') && (next == '!') &&
4442 (avail < 9)) {
4443 goto done;
4444 } else {
4445 ctxt->instate = XML_PARSER_START_TAG;
4446#ifdef DEBUG_PUSH
4447 xmlGenericError(xmlGenericErrorContext,
4448 "HPP: entering START_TAG\n");
4449#endif
4450 }
4451 break;
4452 case XML_PARSER_PROLOG:
4453 SKIP_BLANKS;
4454 if (in->buf == NULL)
4455 avail = in->length - (in->cur - in->base);
4456 else
4457 avail = in->buf->buffer->use - (in->cur - in->base);
4458 if (avail < 2)
4459 goto done;
4460 cur = in->cur[0];
4461 next = in->cur[1];
4462 if ((cur == '<') && (next == '!') &&
4463 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4464 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004465 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004466 goto done;
4467#ifdef DEBUG_PUSH
4468 xmlGenericError(xmlGenericErrorContext,
4469 "HPP: Parsing Comment\n");
4470#endif
4471 htmlParseComment(ctxt);
4472 ctxt->instate = XML_PARSER_PROLOG;
4473 } else if ((cur == '<') && (next == '!') &&
4474 (avail < 4)) {
4475 goto done;
4476 } else {
4477 ctxt->instate = XML_PARSER_START_TAG;
4478#ifdef DEBUG_PUSH
4479 xmlGenericError(xmlGenericErrorContext,
4480 "HPP: entering START_TAG\n");
4481#endif
4482 }
4483 break;
4484 case XML_PARSER_EPILOG:
4485 if (in->buf == NULL)
4486 avail = in->length - (in->cur - in->base);
4487 else
4488 avail = in->buf->buffer->use - (in->cur - in->base);
4489 if (avail < 1)
4490 goto done;
4491 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004492 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004493 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004494 goto done;
4495 }
4496 if (avail < 2)
4497 goto done;
4498 next = in->cur[1];
4499 if ((cur == '<') && (next == '!') &&
4500 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4501 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004502 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004503 goto done;
4504#ifdef DEBUG_PUSH
4505 xmlGenericError(xmlGenericErrorContext,
4506 "HPP: Parsing Comment\n");
4507#endif
4508 htmlParseComment(ctxt);
4509 ctxt->instate = XML_PARSER_EPILOG;
4510 } else if ((cur == '<') && (next == '!') &&
4511 (avail < 4)) {
4512 goto done;
4513 } else {
4514 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004515 ctxt->wellFormed = 0;
4516 ctxt->instate = XML_PARSER_EOF;
4517#ifdef DEBUG_PUSH
4518 xmlGenericError(xmlGenericErrorContext,
4519 "HPP: entering EOF\n");
4520#endif
4521 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4522 ctxt->sax->endDocument(ctxt->userData);
4523 goto done;
4524 }
4525 break;
4526 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004527 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004528 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004529 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004530
4531 if (avail < 2)
4532 goto done;
4533 cur = in->cur[0];
4534 if (cur != '<') {
4535 ctxt->instate = XML_PARSER_CONTENT;
4536#ifdef DEBUG_PUSH
4537 xmlGenericError(xmlGenericErrorContext,
4538 "HPP: entering CONTENT\n");
4539#endif
4540 break;
4541 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004542 if (in->cur[1] == '/') {
4543 ctxt->instate = XML_PARSER_END_TAG;
4544 ctxt->checkIndex = 0;
4545#ifdef DEBUG_PUSH
4546 xmlGenericError(xmlGenericErrorContext,
4547 "HPP: entering END_TAG\n");
4548#endif
4549 break;
4550 }
Owen Taylor3473f882001-02-23 17:55:21 +00004551 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004552 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004553 goto done;
4554
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004555 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004556 htmlParseStartTag(ctxt);
4557 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004558 if (((depth == ctxt->nameNr) &&
4559 (xmlStrEqual(oldname, ctxt->name))) ||
4560 (name == NULL)) {
4561 if (CUR == '>')
4562 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004563 break;
4564 }
Owen Taylor3473f882001-02-23 17:55:21 +00004565
4566 /*
4567 * Lookup the info for that element.
4568 */
4569 info = htmlTagLookup(name);
4570 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004571 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4572 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004573 }
4574
4575 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004576 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004577 */
4578 if ((CUR == '/') && (NXT(1) == '>')) {
4579 SKIP(2);
4580 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4581 ctxt->sax->endElement(ctxt->userData, name);
4582 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004583 ctxt->instate = XML_PARSER_CONTENT;
4584#ifdef DEBUG_PUSH
4585 xmlGenericError(xmlGenericErrorContext,
4586 "HPP: entering CONTENT\n");
4587#endif
4588 break;
4589 }
4590
4591 if (CUR == '>') {
4592 NEXT;
4593 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004594 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4595 "Couldn't find end of Start Tag %s\n",
4596 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004597
4598 /*
4599 * end of parsing of this node.
4600 */
4601 if (xmlStrEqual(name, ctxt->name)) {
4602 nodePop(ctxt);
4603 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004604 }
4605
4606 ctxt->instate = XML_PARSER_CONTENT;
4607#ifdef DEBUG_PUSH
4608 xmlGenericError(xmlGenericErrorContext,
4609 "HPP: entering CONTENT\n");
4610#endif
4611 break;
4612 }
4613
4614 /*
4615 * Check for an Empty Element from DTD definition
4616 */
4617 if ((info != NULL) && (info->empty)) {
4618 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4619 ctxt->sax->endElement(ctxt->userData, name);
4620 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004621 }
4622 ctxt->instate = XML_PARSER_CONTENT;
4623#ifdef DEBUG_PUSH
4624 xmlGenericError(xmlGenericErrorContext,
4625 "HPP: entering CONTENT\n");
4626#endif
4627 break;
4628 }
4629 case XML_PARSER_CONTENT: {
4630 long cons;
4631 /*
4632 * Handle preparsed entities and charRef
4633 */
4634 if (ctxt->token != 0) {
4635 xmlChar chr[2] = { 0 , 0 } ;
4636
4637 chr[0] = (xmlChar) ctxt->token;
4638 htmlCheckParagraph(ctxt);
4639 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4640 ctxt->sax->characters(ctxt->userData, chr, 1);
4641 ctxt->token = 0;
4642 ctxt->checkIndex = 0;
4643 }
4644 if ((avail == 1) && (terminate)) {
4645 cur = in->cur[0];
4646 if ((cur != '<') && (cur != '&')) {
4647 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004648 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004649 if (ctxt->sax->ignorableWhitespace != NULL)
4650 ctxt->sax->ignorableWhitespace(
4651 ctxt->userData, &cur, 1);
4652 } else {
4653 htmlCheckParagraph(ctxt);
4654 if (ctxt->sax->characters != NULL)
4655 ctxt->sax->characters(
4656 ctxt->userData, &cur, 1);
4657 }
4658 }
4659 ctxt->token = 0;
4660 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004661 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004662 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004663 }
Owen Taylor3473f882001-02-23 17:55:21 +00004664 }
4665 if (avail < 2)
4666 goto done;
4667 cur = in->cur[0];
4668 next = in->cur[1];
4669 cons = ctxt->nbChars;
4670 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4671 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4672 /*
4673 * Handle SCRIPT/STYLE separately
4674 */
4675 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004676 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004677 goto done;
4678 htmlParseScript(ctxt);
4679 if ((cur == '<') && (next == '/')) {
4680 ctxt->instate = XML_PARSER_END_TAG;
4681 ctxt->checkIndex = 0;
4682#ifdef DEBUG_PUSH
4683 xmlGenericError(xmlGenericErrorContext,
4684 "HPP: entering END_TAG\n");
4685#endif
4686 break;
4687 }
4688 } else {
4689 /*
4690 * Sometimes DOCTYPE arrives in the middle of the document
4691 */
4692 if ((cur == '<') && (next == '!') &&
4693 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4694 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4695 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4696 (UPP(8) == 'E')) {
4697 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004698 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004699 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004700 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4701 "Misplaced DOCTYPE declaration\n",
4702 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004703 htmlParseDocTypeDecl(ctxt);
4704 } else if ((cur == '<') && (next == '!') &&
4705 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4706 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004707 (htmlParseLookupSequence(
4708 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004709 goto done;
4710#ifdef DEBUG_PUSH
4711 xmlGenericError(xmlGenericErrorContext,
4712 "HPP: Parsing Comment\n");
4713#endif
4714 htmlParseComment(ctxt);
4715 ctxt->instate = XML_PARSER_CONTENT;
4716 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4717 goto done;
4718 } else if ((cur == '<') && (next == '/')) {
4719 ctxt->instate = XML_PARSER_END_TAG;
4720 ctxt->checkIndex = 0;
4721#ifdef DEBUG_PUSH
4722 xmlGenericError(xmlGenericErrorContext,
4723 "HPP: entering END_TAG\n");
4724#endif
4725 break;
4726 } else if (cur == '<') {
4727 ctxt->instate = XML_PARSER_START_TAG;
4728 ctxt->checkIndex = 0;
4729#ifdef DEBUG_PUSH
4730 xmlGenericError(xmlGenericErrorContext,
4731 "HPP: entering START_TAG\n");
4732#endif
4733 break;
4734 } else if (cur == '&') {
4735 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004736 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004737 goto done;
4738#ifdef DEBUG_PUSH
4739 xmlGenericError(xmlGenericErrorContext,
4740 "HPP: Parsing Reference\n");
4741#endif
4742 /* TODO: check generation of subtrees if noent !!! */
4743 htmlParseReference(ctxt);
4744 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004745 /*
4746 * check that the text sequence is complete
4747 * before handing out the data to the parser
4748 * to avoid problems with erroneous end of
4749 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004750 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004751 if ((!terminate) &&
4752 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4753 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004754 ctxt->checkIndex = 0;
4755#ifdef DEBUG_PUSH
4756 xmlGenericError(xmlGenericErrorContext,
4757 "HPP: Parsing char data\n");
4758#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004759 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004760 }
4761 }
4762 if (cons == ctxt->nbChars) {
4763 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004764 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4765 "detected an error in element content\n",
4766 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004767 }
4768 NEXT;
4769 break;
4770 }
4771
4772 break;
4773 }
4774 case XML_PARSER_END_TAG:
4775 if (avail < 2)
4776 goto done;
4777 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004778 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004779 goto done;
4780 htmlParseEndTag(ctxt);
4781 if (ctxt->nameNr == 0) {
4782 ctxt->instate = XML_PARSER_EPILOG;
4783 } else {
4784 ctxt->instate = XML_PARSER_CONTENT;
4785 }
4786 ctxt->checkIndex = 0;
4787#ifdef DEBUG_PUSH
4788 xmlGenericError(xmlGenericErrorContext,
4789 "HPP: entering CONTENT\n");
4790#endif
4791 break;
4792 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00004793 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4794 "HPP: internal error, state == CDATA\n",
4795 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004796 ctxt->instate = XML_PARSER_CONTENT;
4797 ctxt->checkIndex = 0;
4798#ifdef DEBUG_PUSH
4799 xmlGenericError(xmlGenericErrorContext,
4800 "HPP: entering CONTENT\n");
4801#endif
4802 break;
4803 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00004804 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4805 "HPP: internal error, state == DTD\n",
4806 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004807 ctxt->instate = XML_PARSER_CONTENT;
4808 ctxt->checkIndex = 0;
4809#ifdef DEBUG_PUSH
4810 xmlGenericError(xmlGenericErrorContext,
4811 "HPP: entering CONTENT\n");
4812#endif
4813 break;
4814 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00004815 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4816 "HPP: internal error, state == COMMENT\n",
4817 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004818 ctxt->instate = XML_PARSER_CONTENT;
4819 ctxt->checkIndex = 0;
4820#ifdef DEBUG_PUSH
4821 xmlGenericError(xmlGenericErrorContext,
4822 "HPP: entering CONTENT\n");
4823#endif
4824 break;
4825 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00004826 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4827 "HPP: internal error, state == PI\n",
4828 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004829 ctxt->instate = XML_PARSER_CONTENT;
4830 ctxt->checkIndex = 0;
4831#ifdef DEBUG_PUSH
4832 xmlGenericError(xmlGenericErrorContext,
4833 "HPP: entering CONTENT\n");
4834#endif
4835 break;
4836 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004837 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4838 "HPP: internal error, state == ENTITY_DECL\n",
4839 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004840 ctxt->instate = XML_PARSER_CONTENT;
4841 ctxt->checkIndex = 0;
4842#ifdef DEBUG_PUSH
4843 xmlGenericError(xmlGenericErrorContext,
4844 "HPP: entering CONTENT\n");
4845#endif
4846 break;
4847 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004848 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4849 "HPP: internal error, state == ENTITY_VALUE\n",
4850 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004851 ctxt->instate = XML_PARSER_CONTENT;
4852 ctxt->checkIndex = 0;
4853#ifdef DEBUG_PUSH
4854 xmlGenericError(xmlGenericErrorContext,
4855 "HPP: entering DTD\n");
4856#endif
4857 break;
4858 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004859 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4860 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
4861 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004862 ctxt->instate = XML_PARSER_START_TAG;
4863 ctxt->checkIndex = 0;
4864#ifdef DEBUG_PUSH
4865 xmlGenericError(xmlGenericErrorContext,
4866 "HPP: entering START_TAG\n");
4867#endif
4868 break;
4869 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004870 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4871 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
4872 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004873 ctxt->instate = XML_PARSER_CONTENT;
4874 ctxt->checkIndex = 0;
4875#ifdef DEBUG_PUSH
4876 xmlGenericError(xmlGenericErrorContext,
4877 "HPP: entering CONTENT\n");
4878#endif
4879 break;
4880 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004881 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4882 "HPP: internal error, state == XML_PARSER_IGNORE\n",
4883 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004884 ctxt->instate = XML_PARSER_CONTENT;
4885 ctxt->checkIndex = 0;
4886#ifdef DEBUG_PUSH
4887 xmlGenericError(xmlGenericErrorContext,
4888 "HPP: entering CONTENT\n");
4889#endif
4890 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004891 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004892 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4893 "HPP: internal error, state == XML_PARSER_LITERAL\n",
4894 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004895 ctxt->instate = XML_PARSER_CONTENT;
4896 ctxt->checkIndex = 0;
4897#ifdef DEBUG_PUSH
4898 xmlGenericError(xmlGenericErrorContext,
4899 "HPP: entering CONTENT\n");
4900#endif
4901 break;
4902
Owen Taylor3473f882001-02-23 17:55:21 +00004903 }
4904 }
4905done:
4906 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004907 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004908 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4909 /*
4910 * SAX: end of the document processing.
4911 */
4912 ctxt->instate = XML_PARSER_EOF;
4913 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4914 ctxt->sax->endDocument(ctxt->userData);
4915 }
4916 }
4917 if ((ctxt->myDoc != NULL) &&
4918 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4919 (ctxt->instate == XML_PARSER_EPILOG))) {
4920 xmlDtdPtr dtd;
4921 dtd = xmlGetIntSubset(ctxt->myDoc);
4922 if (dtd == NULL)
4923 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004924 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004925 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4926 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4927 }
4928#ifdef DEBUG_PUSH
4929 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4930#endif
4931 return(ret);
4932}
4933
4934/**
Owen Taylor3473f882001-02-23 17:55:21 +00004935 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00004936 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004937 * @chunk: an char array
4938 * @size: the size in byte of the chunk
4939 * @terminate: last chunk indicator
4940 *
4941 * Parse a Chunk of memory
4942 *
4943 * Returns zero if no error, the xmlParserErrors otherwise.
4944 */
4945int
4946htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4947 int terminate) {
4948 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4949 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4950 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4951 int cur = ctxt->input->cur - ctxt->input->base;
4952
4953 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4954 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4955 ctxt->input->cur = ctxt->input->base + cur;
4956#ifdef DEBUG_PUSH
4957 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4958#endif
4959
Daniel Veillard14f752c2003-08-09 11:44:50 +00004960#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00004961 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4962 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004963#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004964 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004965 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
4966 xmlParserInputBufferPtr in = ctxt->input->buf;
4967 if ((in->encoder != NULL) && (in->buffer != NULL) &&
4968 (in->raw != NULL)) {
4969 int nbchars;
4970
4971 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
4972 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004973 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
4974 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004975 return(XML_ERR_INVALID_ENCODING);
4976 }
4977 }
4978 }
Owen Taylor3473f882001-02-23 17:55:21 +00004979 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00004980 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00004981 if (terminate) {
4982 if ((ctxt->instate != XML_PARSER_EOF) &&
4983 (ctxt->instate != XML_PARSER_EPILOG) &&
4984 (ctxt->instate != XML_PARSER_MISC)) {
4985 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004986 ctxt->wellFormed = 0;
4987 }
4988 if (ctxt->instate != XML_PARSER_EOF) {
4989 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4990 ctxt->sax->endDocument(ctxt->userData);
4991 }
4992 ctxt->instate = XML_PARSER_EOF;
4993 }
4994 return((xmlParserErrors) ctxt->errNo);
4995}
Daniel Veillard73b013f2003-09-30 12:36:01 +00004996#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00004997
4998/************************************************************************
4999 * *
5000 * User entry points *
5001 * *
5002 ************************************************************************/
5003
5004/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005005 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005006 * @sax: a SAX handler
5007 * @user_data: The user data returned on SAX callbacks
5008 * @chunk: a pointer to an array of chars
5009 * @size: number of chars in the array
5010 * @filename: an optional file name or URI
5011 * @enc: an optional encoding
5012 *
5013 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005014 * The value of @filename is used for fetching external entities
5015 * and error/warning reports.
5016 *
5017 * Returns the new parser context or NULL
5018 */
5019htmlParserCtxtPtr
5020htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5021 const char *chunk, int size, const char *filename,
5022 xmlCharEncoding enc) {
5023 htmlParserCtxtPtr ctxt;
5024 htmlParserInputPtr inputStream;
5025 xmlParserInputBufferPtr buf;
5026
Daniel Veillardd0463562001-10-13 09:15:48 +00005027 xmlInitParser();
5028
Owen Taylor3473f882001-02-23 17:55:21 +00005029 buf = xmlAllocParserInputBuffer(enc);
5030 if (buf == NULL) return(NULL);
5031
Daniel Veillardf403d292003-10-05 13:51:35 +00005032 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005033 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005034 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005035 return(NULL);
5036 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005037 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5038 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005039 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005040 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005041 xmlFree(ctxt->sax);
5042 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5043 if (ctxt->sax == NULL) {
5044 xmlFree(buf);
5045 xmlFree(ctxt);
5046 return(NULL);
5047 }
5048 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5049 if (user_data != NULL)
5050 ctxt->userData = user_data;
5051 }
5052 if (filename == NULL) {
5053 ctxt->directory = NULL;
5054 } else {
5055 ctxt->directory = xmlParserGetDirectory(filename);
5056 }
5057
5058 inputStream = htmlNewInputStream(ctxt);
5059 if (inputStream == NULL) {
5060 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005061 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005062 return(NULL);
5063 }
5064
5065 if (filename == NULL)
5066 inputStream->filename = NULL;
5067 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005068 inputStream->filename = (char *)
5069 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005070 inputStream->buf = buf;
5071 inputStream->base = inputStream->buf->buffer->content;
5072 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005073 inputStream->end =
5074 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005075
5076 inputPush(ctxt, inputStream);
5077
5078 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5079 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005080 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5081 int cur = ctxt->input->cur - ctxt->input->base;
5082
Owen Taylor3473f882001-02-23 17:55:21 +00005083 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005084
5085 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5086 ctxt->input->cur = ctxt->input->base + cur;
5087 ctxt->input->end =
5088 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005089#ifdef DEBUG_PUSH
5090 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5091#endif
5092 }
5093
5094 return(ctxt);
5095}
5096
5097/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005098 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005099 * @cur: a pointer to an array of xmlChar
5100 * @encoding: a free form C string describing the HTML document encoding, or NULL
5101 * @sax: the SAX handler block
5102 * @userData: if using SAX, this pointer will be provided on callbacks.
5103 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005104 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5105 * to handle parse events. If sax is NULL, fallback to the default DOM
5106 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005107 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005108 * Returns the resulting document tree unless SAX is NULL or the document is
5109 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005110 */
5111
5112htmlDocPtr
5113htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5114 htmlDocPtr ret;
5115 htmlParserCtxtPtr ctxt;
5116
Daniel Veillardd0463562001-10-13 09:15:48 +00005117 xmlInitParser();
5118
Owen Taylor3473f882001-02-23 17:55:21 +00005119 if (cur == NULL) return(NULL);
5120
5121
5122 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5123 if (ctxt == NULL) return(NULL);
5124 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005125 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005126 ctxt->sax = sax;
5127 ctxt->userData = userData;
5128 }
5129
5130 htmlParseDocument(ctxt);
5131 ret = ctxt->myDoc;
5132 if (sax != NULL) {
5133 ctxt->sax = NULL;
5134 ctxt->userData = NULL;
5135 }
5136 htmlFreeParserCtxt(ctxt);
5137
5138 return(ret);
5139}
5140
5141/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005142 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005143 * @cur: a pointer to an array of xmlChar
5144 * @encoding: a free form C string describing the HTML document encoding, or NULL
5145 *
5146 * parse an HTML in-memory document and build a tree.
5147 *
5148 * Returns the resulting document tree
5149 */
5150
5151htmlDocPtr
5152htmlParseDoc(xmlChar *cur, const char *encoding) {
5153 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5154}
5155
5156
5157/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005158 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005159 * @filename: the filename
5160 * @encoding: a free form C string describing the HTML document encoding, or NULL
5161 *
5162 * Create a parser context for a file content.
5163 * Automatic support for ZLIB/Compress compressed document is provided
5164 * by default if found at compile-time.
5165 *
5166 * Returns the new parser context or NULL
5167 */
5168htmlParserCtxtPtr
5169htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5170{
5171 htmlParserCtxtPtr ctxt;
5172 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005173 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005174 /* htmlCharEncoding enc; */
5175 xmlChar *content, *content_line = (xmlChar *) "charset=";
5176
Daniel Veillardf403d292003-10-05 13:51:35 +00005177 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005178 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005179 return(NULL);
5180 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005181 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5182 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005183#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005184 if (xmlDefaultSAXHandler.error != NULL) {
5185 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5186 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005187#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005188 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005189 return(NULL);
5190 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005191
5192 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5193 xmlFree(canonicFilename);
5194 if (inputStream == NULL) {
5195 xmlFreeParserCtxt(ctxt);
5196 return(NULL);
5197 }
Owen Taylor3473f882001-02-23 17:55:21 +00005198
5199 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005200
Owen Taylor3473f882001-02-23 17:55:21 +00005201 /* set encoding */
5202 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005203 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005204 if (content) {
5205 strcpy ((char *)content, (char *)content_line);
5206 strcat ((char *)content, (char *)encoding);
5207 htmlCheckEncoding (ctxt, content);
5208 xmlFree (content);
5209 }
5210 }
5211
5212 return(ctxt);
5213}
5214
5215/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005216 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005217 * @filename: the filename
5218 * @encoding: a free form C string describing the HTML document encoding, or NULL
5219 * @sax: the SAX handler block
5220 * @userData: if using SAX, this pointer will be provided on callbacks.
5221 *
5222 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5223 * compressed document is provided by default if found at compile-time.
5224 * It use the given SAX function block to handle the parsing callback.
5225 * If sax is NULL, fallback to the default DOM tree building routines.
5226 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005227 * Returns the resulting document tree unless SAX is NULL or the document is
5228 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005229 */
5230
5231htmlDocPtr
5232htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5233 void *userData) {
5234 htmlDocPtr ret;
5235 htmlParserCtxtPtr ctxt;
5236 htmlSAXHandlerPtr oldsax = NULL;
5237
Daniel Veillardd0463562001-10-13 09:15:48 +00005238 xmlInitParser();
5239
Owen Taylor3473f882001-02-23 17:55:21 +00005240 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5241 if (ctxt == NULL) return(NULL);
5242 if (sax != NULL) {
5243 oldsax = ctxt->sax;
5244 ctxt->sax = sax;
5245 ctxt->userData = userData;
5246 }
5247
5248 htmlParseDocument(ctxt);
5249
5250 ret = ctxt->myDoc;
5251 if (sax != NULL) {
5252 ctxt->sax = oldsax;
5253 ctxt->userData = NULL;
5254 }
5255 htmlFreeParserCtxt(ctxt);
5256
5257 return(ret);
5258}
5259
5260/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005261 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005262 * @filename: the filename
5263 * @encoding: a free form C string describing the HTML document encoding, or NULL
5264 *
5265 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5266 * compressed document is provided by default if found at compile-time.
5267 *
5268 * Returns the resulting document tree
5269 */
5270
5271htmlDocPtr
5272htmlParseFile(const char *filename, const char *encoding) {
5273 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5274}
5275
5276/**
5277 * htmlHandleOmittedElem:
5278 * @val: int 0 or 1
5279 *
5280 * Set and return the previous value for handling HTML omitted tags.
5281 *
5282 * Returns the last value for 0 for no handling, 1 for auto insertion.
5283 */
5284
5285int
5286htmlHandleOmittedElem(int val) {
5287 int old = htmlOmittedDefaultValue;
5288
5289 htmlOmittedDefaultValue = val;
5290 return(old);
5291}
5292
Daniel Veillard930dfb62003-02-05 10:17:38 +00005293/**
5294 * htmlElementAllowedHere:
5295 * @parent: HTML parent element
5296 * @elt: HTML element
5297 *
5298 * Checks whether an HTML element may be a direct child of a parent element.
5299 * Note - doesn't check for deprecated elements
5300 *
5301 * Returns 1 if allowed; 0 otherwise.
5302 */
5303int
5304htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5305 const char** p ;
5306
5307 if ( ! elt || ! parent || ! parent->subelts )
5308 return 0 ;
5309
5310 for ( p = parent->subelts; *p; ++p )
5311 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5312 return 1 ;
5313
5314 return 0 ;
5315}
5316/**
5317 * htmlElementStatusHere:
5318 * @parent: HTML parent element
5319 * @elt: HTML element
5320 *
5321 * Checks whether an HTML element may be a direct child of a parent element.
5322 * and if so whether it is valid or deprecated.
5323 *
5324 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5325 */
5326htmlStatus
5327htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5328 if ( ! parent || ! elt )
5329 return HTML_INVALID ;
5330 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5331 return HTML_INVALID ;
5332
5333 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5334}
5335/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005336 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005337 * @elt: HTML element
5338 * @attr: HTML attribute
5339 * @legacy: whether to allow deprecated attributes
5340 *
5341 * Checks whether an attribute is valid for an element
5342 * Has full knowledge of Required and Deprecated attributes
5343 *
5344 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5345 */
5346htmlStatus
5347htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5348 const char** p ;
5349
5350 if ( !elt || ! attr )
5351 return HTML_INVALID ;
5352
5353 if ( elt->attrs_req )
5354 for ( p = elt->attrs_req; *p; ++p)
5355 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5356 return HTML_REQUIRED ;
5357
5358 if ( elt->attrs_opt )
5359 for ( p = elt->attrs_opt; *p; ++p)
5360 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5361 return HTML_VALID ;
5362
5363 if ( legacy && elt->attrs_depr )
5364 for ( p = elt->attrs_depr; *p; ++p)
5365 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5366 return HTML_DEPRECATED ;
5367
5368 return HTML_INVALID ;
5369}
5370/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005371 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005372 * @node: an htmlNodePtr in a tree
5373 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005374 * for Element nodes)
5375 *
5376 * Checks whether the tree node is valid. Experimental (the author
5377 * only uses the HTML enhancements in a SAX parser)
5378 *
5379 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5380 * legacy allowed) or htmlElementStatusHere (otherwise).
5381 * for Attribute nodes, a return from htmlAttrAllowed
5382 * for other nodes, HTML_NA (no checks performed)
5383 */
5384htmlStatus
5385htmlNodeStatus(const htmlNodePtr node, int legacy) {
5386 if ( ! node )
5387 return HTML_INVALID ;
5388
5389 switch ( node->type ) {
5390 case XML_ELEMENT_NODE:
5391 return legacy
5392 ? ( htmlElementAllowedHere (
5393 htmlTagLookup(node->parent->name) , node->name
5394 ) ? HTML_VALID : HTML_INVALID )
5395 : htmlElementStatusHere(
5396 htmlTagLookup(node->parent->name) ,
5397 htmlTagLookup(node->name) )
5398 ;
5399 case XML_ATTRIBUTE_NODE:
5400 return htmlAttrAllowed(
5401 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5402 default: return HTML_NA ;
5403 }
5404}
Daniel Veillard9475a352003-09-26 12:47:50 +00005405/************************************************************************
5406 * *
5407 * New set (2.6.0) of simpler and more flexible APIs *
5408 * *
5409 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  The macro expects a variable named dict to be visible
 * where it is used, and must be followed by a semicolon; the
 * do/while(0) wrapper makes it behave as a single statement, so it is
 * safe inside an unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
5421
5422/**
5423 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005424 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005425 *
5426 * Reset a parser context
5427 */
5428void
5429htmlCtxtReset(htmlParserCtxtPtr ctxt)
5430{
5431 xmlParserInputPtr input;
5432 xmlDictPtr dict = ctxt->dict;
5433
5434 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5435 xmlFreeInputStream(input);
5436 }
5437 ctxt->inputNr = 0;
5438 ctxt->input = NULL;
5439
5440 ctxt->spaceNr = 0;
5441 ctxt->spaceTab[0] = -1;
5442 ctxt->space = &ctxt->spaceTab[0];
5443
5444
5445 ctxt->nodeNr = 0;
5446 ctxt->node = NULL;
5447
5448 ctxt->nameNr = 0;
5449 ctxt->name = NULL;
5450
5451 DICT_FREE(ctxt->version);
5452 ctxt->version = NULL;
5453 DICT_FREE(ctxt->encoding);
5454 ctxt->encoding = NULL;
5455 DICT_FREE(ctxt->directory);
5456 ctxt->directory = NULL;
5457 DICT_FREE(ctxt->extSubURI);
5458 ctxt->extSubURI = NULL;
5459 DICT_FREE(ctxt->extSubSystem);
5460 ctxt->extSubSystem = NULL;
5461 if (ctxt->myDoc != NULL)
5462 xmlFreeDoc(ctxt->myDoc);
5463 ctxt->myDoc = NULL;
5464
5465 ctxt->standalone = -1;
5466 ctxt->hasExternalSubset = 0;
5467 ctxt->hasPErefs = 0;
5468 ctxt->html = 1;
5469 ctxt->external = 0;
5470 ctxt->instate = XML_PARSER_START;
5471 ctxt->token = 0;
5472
5473 ctxt->wellFormed = 1;
5474 ctxt->nsWellFormed = 1;
5475 ctxt->valid = 1;
5476 ctxt->vctxt.userData = ctxt;
5477 ctxt->vctxt.error = xmlParserValidityError;
5478 ctxt->vctxt.warning = xmlParserValidityWarning;
5479 ctxt->record_info = 0;
5480 ctxt->nbChars = 0;
5481 ctxt->checkIndex = 0;
5482 ctxt->inSubset = 0;
5483 ctxt->errNo = XML_ERR_OK;
5484 ctxt->depth = 0;
5485 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5486 ctxt->catalogs = NULL;
5487 xmlInitNodeInfoSeq(&ctxt->node_seq);
5488
5489 if (ctxt->attsDefault != NULL) {
5490 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5491 ctxt->attsDefault = NULL;
5492 }
5493 if (ctxt->attsSpecial != NULL) {
5494 xmlHashFree(ctxt->attsSpecial, NULL);
5495 ctxt->attsSpecial = NULL;
5496 }
5497}
5498
5499/**
5500 * htmlCtxtUseOptions:
5501 * @ctxt: an HTML parser context
5502 * @options: a combination of htmlParserOption(s)
5503 *
5504 * Applies the options to the parser context
5505 *
5506 * Returns 0 in case of success, the set of unknown or unimplemented options
5507 * in case of error.
5508 */
5509int
5510htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5511{
5512 if (options & HTML_PARSE_NOWARNING) {
5513 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005514 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005515 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005516 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005517 }
5518 if (options & HTML_PARSE_NOERROR) {
5519 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005520 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005521 ctxt->sax->fatalError = NULL;
5522 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005523 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005524 }
5525 if (options & HTML_PARSE_PEDANTIC) {
5526 ctxt->pedantic = 1;
5527 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005528 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005529 } else
5530 ctxt->pedantic = 0;
5531 if (options & XML_PARSE_NOBLANKS) {
5532 ctxt->keepBlanks = 0;
5533 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5534 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005535 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005536 } else
5537 ctxt->keepBlanks = 1;
5538 ctxt->dictNames = 0;
5539 return (options);
5540}
5541
5542/**
5543 * htmlDoRead:
5544 * @ctxt: an HTML parser context
5545 * @URL: the base URL to use for the document
5546 * @encoding: the document encoding, or NULL
5547 * @options: a combination of htmlParserOption(s)
5548 * @reuse: keep the context for reuse
5549 *
5550 * Common front-end for the htmlRead functions
5551 *
5552 * Returns the resulting document tree or NULL
5553 */
5554static htmlDocPtr
5555htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5556 int options, int reuse)
5557{
5558 htmlDocPtr ret;
5559
5560 htmlCtxtUseOptions(ctxt, options);
5561 ctxt->html = 1;
5562 if (encoding != NULL) {
5563 xmlCharEncodingHandlerPtr hdlr;
5564
5565 hdlr = xmlFindCharEncodingHandler(encoding);
5566 if (hdlr != NULL)
5567 xmlSwitchToEncoding(ctxt, hdlr);
5568 }
5569 if ((URL != NULL) && (ctxt->input != NULL) &&
5570 (ctxt->input->filename == NULL))
5571 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5572 htmlParseDocument(ctxt);
5573 ret = ctxt->myDoc;
5574 ctxt->myDoc = NULL;
5575 if (!reuse) {
5576 if ((ctxt->dictNames) &&
5577 (ret != NULL) &&
5578 (ret->dict == ctxt->dict))
5579 ctxt->dict = NULL;
5580 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005581 }
5582 return (ret);
5583}
5584
5585/**
5586 * htmlReadDoc:
5587 * @cur: a pointer to a zero terminated string
5588 * @URL: the base URL to use for the document
5589 * @encoding: the document encoding, or NULL
5590 * @options: a combination of htmlParserOption(s)
5591 *
5592 * parse an XML in-memory document and build a tree.
5593 *
5594 * Returns the resulting document tree
5595 */
5596htmlDocPtr
5597htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5598{
5599 htmlParserCtxtPtr ctxt;
5600
5601 if (cur == NULL)
5602 return (NULL);
5603
5604 ctxt = xmlCreateDocParserCtxt(cur);
5605 if (ctxt == NULL)
5606 return (NULL);
5607 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5608}
5609
5610/**
5611 * htmlReadFile:
5612 * @filename: a file or URL
5613 * @encoding: the document encoding, or NULL
5614 * @options: a combination of htmlParserOption(s)
5615 *
5616 * parse an XML file from the filesystem or the network.
5617 *
5618 * Returns the resulting document tree
5619 */
5620htmlDocPtr
5621htmlReadFile(const char *filename, const char *encoding, int options)
5622{
5623 htmlParserCtxtPtr ctxt;
5624
5625 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5626 if (ctxt == NULL)
5627 return (NULL);
5628 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5629}
5630
5631/**
5632 * htmlReadMemory:
5633 * @buffer: a pointer to a char array
5634 * @size: the size of the array
5635 * @URL: the base URL to use for the document
5636 * @encoding: the document encoding, or NULL
5637 * @options: a combination of htmlParserOption(s)
5638 *
5639 * parse an XML in-memory document and build a tree.
5640 *
5641 * Returns the resulting document tree
5642 */
5643htmlDocPtr
5644htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5645{
5646 htmlParserCtxtPtr ctxt;
5647
5648 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5649 if (ctxt == NULL)
5650 return (NULL);
William M. Brackd43cdcd2004-08-03 15:13:29 +00005651 if (ctxt->sax != NULL)
5652 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00005653 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5654}
5655
5656/**
5657 * htmlReadFd:
5658 * @fd: an open file descriptor
5659 * @URL: the base URL to use for the document
5660 * @encoding: the document encoding, or NULL
5661 * @options: a combination of htmlParserOption(s)
5662 *
5663 * parse an XML from a file descriptor and build a tree.
5664 *
5665 * Returns the resulting document tree
5666 */
5667htmlDocPtr
5668htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5669{
5670 htmlParserCtxtPtr ctxt;
5671 xmlParserInputBufferPtr input;
5672 xmlParserInputPtr stream;
5673
5674 if (fd < 0)
5675 return (NULL);
5676
5677 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5678 if (input == NULL)
5679 return (NULL);
5680 ctxt = xmlNewParserCtxt();
5681 if (ctxt == NULL) {
5682 xmlFreeParserInputBuffer(input);
5683 return (NULL);
5684 }
5685 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5686 if (stream == NULL) {
5687 xmlFreeParserInputBuffer(input);
5688 xmlFreeParserCtxt(ctxt);
5689 return (NULL);
5690 }
5691 inputPush(ctxt, stream);
5692 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5693}
5694
5695/**
5696 * htmlReadIO:
5697 * @ioread: an I/O read function
5698 * @ioclose: an I/O close function
5699 * @ioctx: an I/O handler
5700 * @URL: the base URL to use for the document
5701 * @encoding: the document encoding, or NULL
5702 * @options: a combination of htmlParserOption(s)
5703 *
5704 * parse an HTML document from I/O functions and source and build a tree.
5705 *
5706 * Returns the resulting document tree
5707 */
5708htmlDocPtr
5709htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5710 void *ioctx, const char *URL, const char *encoding, int options)
5711{
5712 htmlParserCtxtPtr ctxt;
5713 xmlParserInputBufferPtr input;
5714 xmlParserInputPtr stream;
5715
5716 if (ioread == NULL)
5717 return (NULL);
5718
5719 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5720 XML_CHAR_ENCODING_NONE);
5721 if (input == NULL)
5722 return (NULL);
5723 ctxt = xmlNewParserCtxt();
5724 if (ctxt == NULL) {
5725 xmlFreeParserInputBuffer(input);
5726 return (NULL);
5727 }
5728 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5729 if (stream == NULL) {
5730 xmlFreeParserInputBuffer(input);
5731 xmlFreeParserCtxt(ctxt);
5732 return (NULL);
5733 }
5734 inputPush(ctxt, stream);
5735 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5736}
5737
5738/**
5739 * htmlCtxtReadDoc:
5740 * @ctxt: an HTML parser context
5741 * @cur: a pointer to a zero terminated string
5742 * @URL: the base URL to use for the document
5743 * @encoding: the document encoding, or NULL
5744 * @options: a combination of htmlParserOption(s)
5745 *
5746 * parse an XML in-memory document and build a tree.
5747 * This reuses the existing @ctxt parser context
5748 *
5749 * Returns the resulting document tree
5750 */
5751htmlDocPtr
5752htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5753 const char *URL, const char *encoding, int options)
5754{
5755 xmlParserInputPtr stream;
5756
5757 if (cur == NULL)
5758 return (NULL);
5759 if (ctxt == NULL)
5760 return (NULL);
5761
5762 htmlCtxtReset(ctxt);
5763
5764 stream = xmlNewStringInputStream(ctxt, cur);
5765 if (stream == NULL) {
5766 return (NULL);
5767 }
5768 inputPush(ctxt, stream);
5769 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5770}
5771
5772/**
5773 * htmlCtxtReadFile:
5774 * @ctxt: an HTML parser context
5775 * @filename: a file or URL
5776 * @encoding: the document encoding, or NULL
5777 * @options: a combination of htmlParserOption(s)
5778 *
5779 * parse an XML file from the filesystem or the network.
5780 * This reuses the existing @ctxt parser context
5781 *
5782 * Returns the resulting document tree
5783 */
5784htmlDocPtr
5785htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5786 const char *encoding, int options)
5787{
5788 xmlParserInputPtr stream;
5789
5790 if (filename == NULL)
5791 return (NULL);
5792 if (ctxt == NULL)
5793 return (NULL);
5794
5795 htmlCtxtReset(ctxt);
5796
5797 stream = xmlNewInputFromFile(ctxt, filename);
5798 if (stream == NULL) {
5799 return (NULL);
5800 }
5801 inputPush(ctxt, stream);
5802 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5803}
5804
5805/**
5806 * htmlCtxtReadMemory:
5807 * @ctxt: an HTML parser context
5808 * @buffer: a pointer to a char array
5809 * @size: the size of the array
5810 * @URL: the base URL to use for the document
5811 * @encoding: the document encoding, or NULL
5812 * @options: a combination of htmlParserOption(s)
5813 *
5814 * parse an XML in-memory document and build a tree.
5815 * This reuses the existing @ctxt parser context
5816 *
5817 * Returns the resulting document tree
5818 */
5819htmlDocPtr
5820htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5821 const char *URL, const char *encoding, int options)
5822{
5823 xmlParserInputBufferPtr input;
5824 xmlParserInputPtr stream;
5825
5826 if (ctxt == NULL)
5827 return (NULL);
5828 if (buffer == NULL)
5829 return (NULL);
5830
5831 htmlCtxtReset(ctxt);
5832
5833 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5834 if (input == NULL) {
5835 return(NULL);
5836 }
5837
5838 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5839 if (stream == NULL) {
5840 xmlFreeParserInputBuffer(input);
5841 return(NULL);
5842 }
5843
5844 inputPush(ctxt, stream);
5845 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5846}
5847
5848/**
5849 * htmlCtxtReadFd:
5850 * @ctxt: an HTML parser context
5851 * @fd: an open file descriptor
5852 * @URL: the base URL to use for the document
5853 * @encoding: the document encoding, or NULL
5854 * @options: a combination of htmlParserOption(s)
5855 *
5856 * parse an XML from a file descriptor and build a tree.
5857 * This reuses the existing @ctxt parser context
5858 *
5859 * Returns the resulting document tree
5860 */
5861htmlDocPtr
5862htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
5863 const char *URL, const char *encoding, int options)
5864{
5865 xmlParserInputBufferPtr input;
5866 xmlParserInputPtr stream;
5867
5868 if (fd < 0)
5869 return (NULL);
5870 if (ctxt == NULL)
5871 return (NULL);
5872
5873 htmlCtxtReset(ctxt);
5874
5875
5876 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5877 if (input == NULL)
5878 return (NULL);
5879 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5880 if (stream == NULL) {
5881 xmlFreeParserInputBuffer(input);
5882 return (NULL);
5883 }
5884 inputPush(ctxt, stream);
5885 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5886}
5887
5888/**
5889 * htmlCtxtReadIO:
5890 * @ctxt: an HTML parser context
5891 * @ioread: an I/O read function
5892 * @ioclose: an I/O close function
5893 * @ioctx: an I/O handler
5894 * @URL: the base URL to use for the document
5895 * @encoding: the document encoding, or NULL
5896 * @options: a combination of htmlParserOption(s)
5897 *
5898 * parse an HTML document from I/O functions and source and build a tree.
5899 * This reuses the existing @ctxt parser context
5900 *
5901 * Returns the resulting document tree
5902 */
5903htmlDocPtr
5904htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
5905 xmlInputCloseCallback ioclose, void *ioctx,
5906 const char *URL,
5907 const char *encoding, int options)
5908{
5909 xmlParserInputBufferPtr input;
5910 xmlParserInputPtr stream;
5911
5912 if (ioread == NULL)
5913 return (NULL);
5914 if (ctxt == NULL)
5915 return (NULL);
5916
5917 htmlCtxtReset(ctxt);
5918
5919 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5920 XML_CHAR_ENCODING_NONE);
5921 if (input == NULL)
5922 return (NULL);
5923 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5924 if (stream == NULL) {
5925 xmlFreeParserInputBuffer(input);
5926 return (NULL);
5927 }
5928 inputPush(ctxt, stream);
5929 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5930}
5931
Owen Taylor3473f882001-02-23 17:55:21 +00005932#endif /* LIBXML_HTML_ENABLED */